From 67da0f6b9046ef9b643878e8c521cc3e0eed1137 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Fri, 21 Jun 2024 18:24:38 -0500 Subject: [PATCH 01/88] chore: reset oasysdb to barebone project --- .env.example | 2 - .github/workflows/deploy-python.yml | 136 ---- .github/workflows/deploy-rust.yml | 26 - .github/workflows/quality-check.yml | 106 --- .gitignore | 3 - bench/main.rs | 109 --- bench/utils.rs | 20 - docs/guide.md | 144 ---- docs/index.md | 55 -- examples/extract-metadata.rs | 17 - examples/measure-memory.rs | 24 - examples/measure-recall.rs | 87 --- examples/quickstart.rs | 30 - mkdocs.yml | 1 - py/example.py | 27 - py/oasysdb/__init__.py | 2 - py/oasysdb/collection.pyi | 237 ------ py/oasysdb/database.pyi | 59 -- py/oasysdb/prelude.pyi | 5 - py/oasysdb/py.typed | 0 py/oasysdb/vector.pyi | 31 - py/tests/test_collection.py | 248 ------- py/tests/test_database.py | 69 -- py/tests/test_vector.py | 19 - pyproject.toml | 24 - readme.md | 202 ------ requirements.txt | 5 - src/db/database.rs | 279 -------- src/db/mod.rs | 16 - src/func/collection.rs | 1029 --------------------------- src/func/distance.rs | 63 -- src/func/err.rs | 160 ----- src/func/filter.rs | 419 ----------- src/func/metadata.rs | 259 ------- src/func/mod.rs | 43 -- src/func/utils.rs | 457 ------------ src/func/vector.rs | 164 ----- src/lib.rs | 94 --- src/main.rs | 3 + src/prelude/mod.rs | 7 - src/tests/mod.rs | 46 -- src/tests/test_collection.rs | 171 ----- src/tests/test_database.rs | 67 -- src/tests/test_distance.rs | 17 - src/tests/test_filter.rs | 151 ---- src/tests/test_metadata.rs | 49 -- src/tests/test_vectorgen.rs | 30 - src/vectorgen/mod.rs | 29 - src/vectorgen/openai.rs | 91 --- 49 files changed, 3 insertions(+), 5329 deletions(-) delete mode 100644 .env.example delete mode 100644 .github/workflows/deploy-python.yml delete mode 100644 .github/workflows/deploy-rust.yml delete mode 100644 .github/workflows/quality-check.yml delete mode 100644 bench/main.rs delete mode 100644 bench/utils.rs delete mode 100644 docs/guide.md delete mode 100644 examples/extract-metadata.rs delete mode 100644 examples/measure-memory.rs delete mode 100644 examples/measure-recall.rs delete mode 100644 examples/quickstart.rs delete mode 100644 py/example.py delete mode 100644 py/oasysdb/__init__.py delete mode 100644 py/oasysdb/collection.pyi delete mode 100644 py/oasysdb/database.pyi delete mode 100644 py/oasysdb/prelude.pyi delete mode 100644 py/oasysdb/py.typed delete mode 100644 py/oasysdb/vector.pyi delete mode 100644 py/tests/test_collection.py delete mode 100644 py/tests/test_database.py delete mode 100644 py/tests/test_vector.py delete mode 100644 pyproject.toml delete mode 100644 src/db/database.rs delete mode 100644 src/db/mod.rs delete mode 100644 src/func/collection.rs delete mode 100644 src/func/distance.rs delete mode 100644 src/func/err.rs delete mode 100644 src/func/filter.rs delete mode 100644 src/func/metadata.rs delete mode 100644 src/func/mod.rs delete mode 100644 src/func/utils.rs delete mode 100644 src/func/vector.rs delete mode 100644 src/lib.rs delete mode 100644 src/prelude/mod.rs delete mode 100644 src/tests/mod.rs delete mode 100644 src/tests/test_collection.rs delete mode 100644 src/tests/test_database.rs delete mode 100644 src/tests/test_distance.rs delete mode 100644 src/tests/test_filter.rs delete mode 100644 src/tests/test_metadata.rs delete mode 100644 src/tests/test_vectorgen.rs delete mode 100644 src/vectorgen/mod.rs delete mode 100644 src/vectorgen/openai.rs diff --git a/.env.example b/.env.example 
deleted file mode 100644 index 7df8c88b..00000000 --- a/.env.example +++ /dev/null @@ -1,2 +0,0 @@ -# To test OpenAI functionality. -OPENAI_API_KEY=xxx diff --git a/.github/workflows/deploy-python.yml b/.github/workflows/deploy-python.yml deleted file mode 100644 index 8ce4fa94..00000000 --- a/.github/workflows/deploy-python.yml +++ /dev/null @@ -1,136 +0,0 @@ -name: Deploy as a Python package - -on: - workflow_dispatch: - push: - tags: - - "*" - -permissions: - contents: read - -jobs: - build-linux: - name: Build wheels on Linux - runs-on: ubuntu-latest - strategy: - matrix: - target: [x86_64, x86, aarch64] - steps: - - name: Check out code - uses: actions/checkout@v3 - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: "3.10" - - - name: Build wheels - uses: PyO3/maturin-action@v1 - with: - target: ${{ matrix.target }} - args: --release --out dist --find-interpreter - sccache: "true" - manylinux: auto - - - name: Upload wheels - uses: actions/upload-artifact@v3 - with: - name: wheels - path: dist - - build-windows: - name: Build wheels on Windows - runs-on: windows-latest - strategy: - matrix: - target: [x64, x86] - steps: - - name: Check out code - uses: actions/checkout@v3 - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: "3.10" - architecture: ${{ matrix.target }} - - - name: Build wheels - uses: PyO3/maturin-action@v1 - with: - target: ${{ matrix.target }} - args: --release --out dist --find-interpreter - sccache: "true" - - - name: Upload wheels - uses: actions/upload-artifact@v3 - with: - name: wheels - path: dist - - build-macos: - name: Build wheels on MacOS - runs-on: macos-latest - strategy: - matrix: - target: [x86_64, aarch64] - steps: - - name: Check out code - uses: actions/checkout@v3 - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: "3.10" - - - name: Build wheels - uses: PyO3/maturin-action@v1 - with: - target: ${{ matrix.target }} - args: --release --out dist --find-interpreter - sccache: "true" - - - name: Upload wheels - uses: actions/upload-artifact@v3 - with: - name: wheels - path: dist - - build-sdist: - name: Build source distribution - runs-on: ubuntu-latest - steps: - - name: Check out code - uses: actions/checkout@v3 - - - name: Build sdist - uses: PyO3/maturin-action@v1 - with: - command: sdist - args: --out dist - - - name: Upload sdist - uses: actions/upload-artifact@v3 - with: - name: wheels - path: dist - - release: - name: Release to PyPI - runs-on: ubuntu-latest - needs: [build-linux, build-windows, build-macos, build-sdist] - environment: - name: python library - url: https://pypi.org/project/oasysdb - steps: - - name: Download wheels - uses: actions/download-artifact@v3 - with: - name: wheels - - name: Publish to PyPI - uses: PyO3/maturin-action@v1 - env: - MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} - with: - command: upload - args: --non-interactive --skip-existing * diff --git a/.github/workflows/deploy-rust.yml b/.github/workflows/deploy-rust.yml deleted file mode 100644 index 1fe22f83..00000000 --- a/.github/workflows/deploy-rust.yml +++ /dev/null @@ -1,26 +0,0 @@ -name: Publish as Rust crate - -on: - workflow_dispatch: - push: - tags: - - "*" - -jobs: - publish-crate: - name: Publish to Crates.io - runs-on: ubuntu-latest - environment: - name: rust crate - url: https://crates.io/crates/oasysdb - steps: - - name: Checkout the code - uses: actions/checkout@v4 - - - name: Install Rust toolchain - uses: dtolnay/rust-toolchain@stable - - - name: 
Publish to Crates.io - run: cargo publish --token ${CRATES_TOKEN} - env: - CRATES_TOKEN: ${{ secrets.CRATES_TOKEN }} diff --git a/.github/workflows/quality-check.yml b/.github/workflows/quality-check.yml deleted file mode 100644 index 00d2ebd9..00000000 --- a/.github/workflows/quality-check.yml +++ /dev/null @@ -1,106 +0,0 @@ -name: Quality check and testing - -# Summary of the workflow -# For every push to the main branch or any pull request: -# 1. Run cargo fmt: Check code formatting to ensure consistency. -# 2. Run cargo clippy: Check linting to improve code quality. -# 3. Run cargo test: Run all Rust-based tests. -# 4. Run pytest: Build wheels and run Python tests. - -on: - workflow_dispatch: - - pull_request: - paths-ignore: - - "docs/**" - - push: - branches: - - main - paths-ignore: - - "docs/**" - -env: - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - -jobs: - rustfmt-format: - name: Check code formatting - runs-on: ubuntu-latest - steps: - - name: Checkout the code - uses: actions/checkout@v4 - - - name: Install Rust toolchain - uses: dtolnay/rust-toolchain@stable - with: - components: rustfmt - - - name: Run cargo fmt with check - run: cargo fmt -- --check - - clippy-lint: - name: Lint code with Clippy - runs-on: ubuntu-latest - steps: - - name: Checkout the code - uses: actions/checkout@v4 - - - name: Install Rust toolchain - uses: dtolnay/rust-toolchain@stable - with: - components: clippy - - - name: Run cargo clippy - run: cargo clippy -- -D warnings - - run-rust-tests: - name: Run Rust tests - needs: clippy-lint - runs-on: ubuntu-latest - steps: - - name: Checkout the code - uses: actions/checkout@v4 - - - name: Install Rust toolchain - uses: dtolnay/rust-toolchain@stable - - - name: Run cargo test - run: cargo test --all-features -- --test-threads 1 - - run-python-tests: - name: Run Python tests - needs: run-rust-tests - runs-on: ubuntu-latest - steps: - - name: Checkout the code - uses: actions/checkout@v3 - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: "3.10" - cache: "pip" - - - name: Install requirements - run: | - pip install --upgrade pip - python -m venv .venv - - - name: Build development wheels - uses: PyO3/maturin-action@v1 - with: - command: develop - sccache: "true" - - - name: Run tests - run: | - source .venv/bin/activate - pip install -r requirements.txt - python -m pytest - - - name: Lint code with Flake8 - run: | - source .venv/bin/activate - pip install -r requirements.txt - python -m flake8 diff --git a/.gitignore b/.gitignore index 618d6b68..738f868d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,3 @@ -# OasysDB. -data - # Rust stuff. debug target diff --git a/bench/main.rs b/bench/main.rs deleted file mode 100644 index ce13f53f..00000000 --- a/bench/main.rs +++ /dev/null @@ -1,109 +0,0 @@ -// See measure-memory.rs example for memory usage. - -mod utils; - -use criterion::*; -use oasysdb::prelude::*; -use utils::*; - -/// The number of vector records in the collection. -const COLLECTION_SIZE: usize = 1_000_000; - -/// The vector embedding dimension. -/// A vector dimension of 768, 1024, or 4096 are very common options -/// for models on [MTEB](https://huggingface.co/spaces/mteb/leaderboard). -const DIMENSION: usize = 128; - -fn bench_search_collection(criterion: &mut Criterion) { - let id = "search collection"; - - // Create the collection. - let collection = build_test_collection(DIMENSION, COLLECTION_SIZE); - - // Create a random vector to search for. 
- let vector = Vector::random(DIMENSION); - - // Benchmark the search speed. - let routine = || { - black_box(collection.search(&vector, 10).unwrap()); - }; - - criterion.bench_function(id, |b| b.iter(routine)); -} - -fn bench_true_search_collection(criterion: &mut Criterion) { - let id = "true search collection"; - - // Create the collection. - let collection = build_test_collection(DIMENSION, COLLECTION_SIZE); - - // Create a random vector to search for. - let vector = Vector::random(DIMENSION); - - // Benchmark the search speed. - let routine = || { - black_box(collection.true_search(&vector, 10).unwrap()); - }; - - criterion.bench_function(id, |b| b.iter(routine)); -} - -fn bench_insert_to_collection(criterion: &mut Criterion) { - let id = "insert to collection"; - - // Create the initial collection. - let mut collection = build_test_collection(DIMENSION, COLLECTION_SIZE); - - // Benchmark the insert speed. - let record = Record::random(DIMENSION); - criterion.bench_function(id, |bencher| { - bencher.iter(|| { - black_box(collection.insert(&record).unwrap()); - }) - }); -} - -criterion_group!( - collection, - bench_search_collection, - bench_true_search_collection, - bench_insert_to_collection -); - -fn bench_save_collection_to_database(criterion: &mut Criterion) { - let id = "save collection to database"; - - // Setup the database and collection. - let collection = build_test_collection(DIMENSION, COLLECTION_SIZE); - let mut db = create_test_database(DIMENSION, COLLECTION_SIZE); - - // Benchmark the save speed. - criterion.bench_function(id, |bencher| { - bencher.iter(|| { - black_box(db.save_collection("bench", &collection).unwrap()); - }) - }); -} - -fn bench_get_collection_from_database(criterion: &mut Criterion) { - let id = "get collection from database"; - let db = create_test_database(DIMENSION, COLLECTION_SIZE); - - // Benchmark the get speed. - // This is the operation that loads the collection into memory. - let routine = || { - black_box(db.get_collection("bench").unwrap()); - }; - - criterion.bench_function(id, |b| b.iter(routine)); -} - -criterion_group! { - name = database; - config = Criterion::default().sample_size(10); - targets = - bench_save_collection_to_database, - bench_get_collection_from_database -} - -criterion_main!(collection, database); diff --git a/bench/utils.rs b/bench/utils.rs deleted file mode 100644 index bca304bc..00000000 --- a/bench/utils.rs +++ /dev/null @@ -1,20 +0,0 @@ -use oasysdb::prelude::*; - -/// Creates a collection with random vector records. -/// * `dimension`: Dimensionality of the vector embeddings -/// * `len`: Number of records in the database -pub fn build_test_collection(dimension: usize, len: usize) -> Collection { - let records = Record::many_random(dimension, len); - let config = Config::default(); - Collection::build(&config, &records).unwrap() -} - -/// Creates a pre-populated database with a collection for testing. 
-/// * `dimension`: Dimensionality of the vector embeddings -/// * `size`: Number of records in the collection -pub fn create_test_database(dimension: usize, size: usize) -> Database { - let collection = build_test_collection(dimension, size); - let mut db = Database::new("data/bench").unwrap(); - db.save_collection("bench", &collection).unwrap(); - db -} diff --git a/docs/guide.md b/docs/guide.md deleted file mode 100644 index d28586f0..00000000 --- a/docs/guide.md +++ /dev/null @@ -1,144 +0,0 @@ -# Comprehensive Guide - -If you are reading this guide, that means you are curious about how OasysDB is designed, why it is designed that way, and how to use it for your project. - -Thank you and welcome 🤗 - -My biggest goal for OasysDB is to make it **boring**. Not boring in a bad way, but boring in a way that is predictable and easy to use, with no surprises. I want to make it so that you can use and rely on OasysDB without having to worry about it. - -For that, I made some quite opinionated design decisions that I believe will help OasysDB achieve that goal. In this guide, I will explain those decisions and how they affect the usage of OasysDB. - -### Table of Contents - -- [Inner Workings](#inner-workings) - [Vector Record](#vector-record) - [Vector ID: Auto Incremented](#vector-id-auto-incremented) - [Persistence to Disk](#persistence-to-disk) - [Notes & Tips](#notes--tips) -- [Indexing Algorithm](#indexing-algorithm) - [Intro to HNSW](#intro-to-hnsw) - [Index Configuration](#index-configuration) - [Distance Metric](#distance-metric) - [Relevancy Score](#relevancy-score) -- [Conclusion](#conclusion) - [Relevant Resources](#relevant-resources) - -# Inner Workings - -You can think of OasysDB as a NoSQL database optimized for vector operations because of how the data is indexed. Instead of using a traditional B-Tree or LSM-Tree, OasysDB uses [HNSW](#indexing-algorithm) as its vector indexing algorithm to index the data in the form of graphs. - -Besides that, OasysDB shares similar concepts with traditional NoSQL databases. It stores data in collections, where each collection contains multiple records. - -## Vector Record - -When you want to store a vector in OasysDB, you will insert vector record objects. This object contains the vector itself and some metadata. The metadata object can be used to store any information you need to associate with the vector. - -**Metadata types:** - -- Text -- Number -- Boolean -- Array -- Object - -## Vector ID: Auto Incremented - -When you insert a vector record, OasysDB will automatically assign an integer ID to the record that is auto-incremented with every insert. This ID is unique within the collection and will be used to reference the vector record. - -I made this decision to make the indexing algorithm more efficient and performant. Compared to a UUID, which is 128 bits, or a string ID, which can be any length, an unsigned 32-bit integer ID is only 32 bits. This means that the indexing algorithm can work with smaller and more predictable data sizes. - -**The 2 downsides of this decision are:** - -- You cannot specify the ID when inserting a vector record. -- A collection is limited to storing around 4 billion records. - -## Persistence to Disk - -By default, due to the nature of the vector indexing algorithm, OasysDB stores the vector record data in memory via the collection interface. This means that unless persisted to disk via the database save collection method, the data will be lost when the program is closed.
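To make that workflow concrete, here is a minimal sketch of the save-and-reload cycle, assuming the same `Database`, `Collection`, and `Record` APIs used by the quickstart example elsewhere in this patch; the `data/example` path and the `"vectors"` collection name are placeholders:

```rust
use oasysdb::prelude::*;

fn main() {
    // Build an in-memory collection from random records.
    let records = Record::many_random(128, 100);
    let collection = Collection::build(&Config::default(), &records).unwrap();

    // Persist the collection; without this call it lives only in memory.
    let mut db = Database::new("data/example").unwrap();
    db.save_collection("vectors", &collection).unwrap();

    // Later, e.g. after a restart, load it back into memory explicitly.
    let collection = db.get_collection("vectors").unwrap();
    assert_eq!(collection.len(), 100);
}
```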
-Under the hood, OasysDB serializes the collection to bytes using [Serde](https://github.com/serde-rs/serde) and writes it to a file. The reference to the file is then saved, along with other details, to the database powered by [Sled](https://github.com/spacejam/sled). Because of this, **whenever you modify a collection, you need to save the collection back to the database to persist the changes to disk.** - -When opening the database, OasysDB doesn't automatically load the collections from the database file into memory, as this would be inefficient if you have many collections you don't necessarily use all the time. Instead, you need to load the collections you want to use into memory manually using the get collection method. - -### Notes & Tips - -The serialization and deserialization processes are compute-intensive and can be rather slow. To optimize the performance of your application, you should follow these tips: - -- Save the collection to disk only when you're totally done modifying it. -- Load only the collections you need into memory, as they can take up a good chunk of memory. -- If you use a collection for multiple processes, consider keeping it in memory as a global state to avoid reloading it. - -If you have any questions or need help with optimizing the performance of your application, feel free to ask me on the [Discord](https://discord.gg/bDhQrkqNP4) server. - -I'm always happy to help you out 🤗 - -# Indexing Algorithm - -This is arguably the most important part of OasysDB ⭐️ - -The indexing algorithm is what makes OasysDB a vector database and what allows you to perform fast similarity searches on your vector records. - -OasysDB uses the HNSW (Hierarchical Navigable Small World) algorithm. We're not going to dive deep into the algorithm in this guide, but I will explain how it works in the context of OasysDB. - -## Intro to HNSW - -HNSW is a graph-based indexing algorithm. It consists of multiple layers containing nodes referencing other nodes (neighbors). These nodes represent the vector IDs of the records in the collection. - -When you insert vector records into a collection, OasysDB will: - -1. Generate vector IDs for the records. -2. Calculate distances between the new and existing vectors. -3. Place nodes and cluster them based on their similarity in the layers. -4. Store the other data in HashMaps for fast access. - -Because OasysDB stores the vector IDs in the index graph as nodes, having an auto-incremented integer as the vector ID is important for memory efficiency and performance. - -## Index Configuration - -OasysDB allows you to configure the index parameters when creating a collection. As of the current version, these configurations can't be changed after the collection is created. These configurations include: - -- **M**: The maximum number of neighbor connections to keep for each node when building the index or inserting a new vector record. OasysDB uses an M of 32 by default, and this value works well for most use cases. As of the current version, you can't change this value at all. - -- **EF Construction**: This parameter, along with M, determines how well the index will be constructed. The higher the EF Construction value, the slower the index construction will be, but the more accurate the index will be, up to a certain point.
According to [HNSWLIB's documentation](https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md), the way to check whether the EF Construction value is good enough is to measure the recall for a search operation with k = M and EF Search = EF Construction. If the recall is lower than 0.9, then there is room for improvement. - -- **EF Search**: This parameter determines how many nodes to visit when searching for the nearest neighbors. The higher the EF Search value, the more accurate the search result will be, but the slower the search will be. - - The EF Search value should be set higher than k (the number of neighbors you want to find) when performing a search operation. - -- **ML**: This parameter determines how likely it is for a node to be placed in a higher layer. This multiplier is what allows HNSW to be the most dense at the bottom and the least dense at the top, keeping the search operation efficient. The optimal value for ML is 1 / ln(M). In OasysDB, this would be around 0.2885. - -OasysDB has more parameters that you can configure which are not directly related to the index configuration. We will discuss those parameters in the next section 😁 - -## Distance Metric - -For collections in OasysDB, you can specify the distance metric to use when calculating the distance between vectors. The distance metric is used mostly when inserting a new vector record into the collection and a bit when performing a search operation. - -As of the current version, OasysDB supports the following distance metrics: - -- [Euclidean Distance](https://en.wikipedia.org/wiki/Euclidean_distance) -- [Cosine Distance](https://en.wikipedia.org/wiki/Cosine_similarity#Cosine_distance) - -## Relevancy Score - -The relevancy score is a big part of OasysDB. It allows you to essentially exclude vectors that are not relevant to your search query. Unlike other configurations, the relevancy score can be changed after the collection is created. I even encourage you to experiment with different relevancy scores to see what works best for your use case 😁 - -The relevancy score is a float value that acts as a threshold: it filters out vectors whose distance is greater than the set relevancy score and considers only vectors that are closer to the query vector. - -For example, for the Euclidean distance metric, since the Euclidean distance value ranges from 0 to infinity, the closer the distance is to 0, the more similar the vectors are. If you were to set the relevancy score to 0.2, OasysDB will return only vectors that have a Euclidean distance of 0.2 or lower from the query vector. - -# Conclusion - -In short, use OasysDB to keep your sanity 😂 - -I hope this guide has given you a good understanding of how OasysDB works and how to use it for your project. If you have any questions or need help with anything Rust-related, join the [Discord](https://discord.gg/bDhQrkqNP4) server and share them with me.
~ Edwin - -### Relevant Resources - -- [HNSW by Pinecone](https://www.pinecone.io/learn/series/faiss/hnsw/) -- [HNSW Algorithm by Lantern](https://lantern.dev/blog/hnsw) -- [What Are Vector Embeddings?](https://www.analyticsvidhya.com/blog/2020/08/information-retrieval-using-embeddings/) -- [OpenAI Embeddings](https://platform.openai.com/docs/guides/embeddings/frequently-asked-questions) diff --git a/docs/index.md b/docs/index.md index 408496e2..e69de29b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,55 +0,0 @@ -# Introducing OasysDB - -![OasysDB Use Case](https://i.postimg.cc/k4x4Q55k/banner.png) - -OasysDB is a **flexible**, **performant**, and **easy-to-use** vector database for storing and searching high-dimensional vectors. OasysDB is built in Rust but provides a Python binding for you to use in your Python projects. - -The goal of OasysDB is to be a vector database with a great developer experience throughout the lifecycle of the project, from early development to production. In addition to an easy-to-use API, OasysDB can be used in 3 different ways:
- :fontawesome-solid-terminal: **Embedded**
  Run OasysDB directly inside your application.

- :fontawesome-solid-server: **Hosted**
  Run OasysDB as a standalone server.
  _Coming soon_

- :fontawesome-solid-circle-nodes: **Distributed**
  Run sharded OasysDB instances.
  _Coming not so soon_ 😉
## OasysDB as a Vector Database - -You can think of OasysDB as a NoSQL/document database like MongoDB or CouchDB, but purposefully built for indexing high-dimensional vectors. Instead of using a traditional index like a B-Tree or LSM-Tree, OasysDB uses **HNSW** as its indexing algorithm to index the data in graphs so that it can perform fast similarity searches on the vectors. - -OasysDB shares a lot of concepts with traditional NoSQL databases. It stores data in collections, and multiple collections can be stored in a database. The API of the collection is also very similar to a document store's: you have methods to insert, get, update, and delete records, plus an additional search method to find similar vectors, AKA nearest neighbors. - -## Vector Record - -When you want to store a vector in OasysDB, you will insert vector record objects. This object contains the vector embedding itself and some additional metadata. - -### Embedding - -OasysDB is optimized for high-dimensional vectors of any dimensionality. You can store 2D or 3D vectors/points, but the real power of OasysDB shines when you store high-dimensional vectors like 128D, 768D, or even 4096D vectors. - -As per the benchmark, OasysDB can perform similarity searches on 1 million 4096D vectors in single-digit milliseconds, which makes it a great choice for building real-time search or semantic caching systems. - -- M2 MacBook Pro with 16GB memory: 9.11ms -- M3 MacBook Pro with 128GB memory: 3.87ms - -### Metadata - -When you store a vector in OasysDB, you can insert a metadata object along with the vector. The metadata is a JSON-like object that can store any information you need to associate with the vector. For example, for a use case like an image search system, you can store the image URL, title, and description of the image in the metadata object. - -```json -{ - "url": "https://www.example.com/image.jpg", - "title": "Beautiful Sunset", - ... -} -``` - -When performing a search, the metadata will be returned along with additional information. In most cases, you will use the metadata to display the search results to the user or, in the case of a RAG application, to generate a relevant answer. diff --git a/examples/extract-metadata.rs b/examples/extract-metadata.rs deleted file mode 100644 index fa3ba723..00000000 --- a/examples/extract-metadata.rs +++ /dev/null @@ -1,17 +0,0 @@ -use oasysdb::prelude::*; - -fn main() { - // Inserting a metadata value into a record. - let data: &str = "This is an example."; - let vector = Vector::random(128); - let record = Record::new(&vector, &data.into()); - - // Extracting the metadata value. - let metadata = record.data.clone(); - let data = match metadata { - Metadata::Text(value) => value, - _ => panic!("Data is not text."), - }; - - println!("{}", data); -} diff --git a/examples/measure-memory.rs b/examples/measure-memory.rs deleted file mode 100644 index 7eed20c7..00000000 --- a/examples/measure-memory.rs +++ /dev/null @@ -1,24 +0,0 @@ -use jemalloc_ctl::stats::allocated; -use jemallocator::Jemalloc; -use oasysdb::collection::*; - -#[global_allocator] -static ALLOC: Jemalloc = Jemalloc; - -fn main() { - // Matches the configuration in bench/main.rs. - let len = 1_000_000; - let dimension = 128; - - // Build the vector collection. - let records = Record::many_random(dimension, len); - let config = Config::default(); - Collection::build(&config, &records).unwrap(); - - // Measure the memory usage.
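- // Note: jemalloc_ctl's allocated stat counts every live allocation in the process, so the figure printed below includes allocator and program overhead, not just the collection itself.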
- let memory = allocated::read().unwrap(); - let size_mb = memory as f32 / (1024.0 * 1024.0); - - println!("For {} vector records of dimension {}", len, dimension); - println!("Memory usage: {:.0}MB", size_mb); -} diff --git a/examples/measure-recall.rs b/examples/measure-recall.rs deleted file mode 100644 index bdaaf56c..00000000 --- a/examples/measure-recall.rs +++ /dev/null @@ -1,87 +0,0 @@ -// Note: This example measures the recall rate of the HNSW index. -// This might not reflect the actual performance of the index, as the -// recall rate is highly dependent on the quality of the data and the -// query distribution. - -use oasysdb::prelude::*; -use rand::random; - -// High-level collection configuration. -const DIMENSION: usize = 1536; -const COLLECTION_SIZE: usize = 1000; - -// HNSW configuration. -const EF_CONSTRUCTION: usize = 128; -const EF_SEARCH: usize = 64; -const ML: f32 = 0.2885; -const DISTANCE: &str = "euclidean"; - -// Query configuration. -const N_QUERIES: usize = 100; -const K: usize = 10; -const WITH_FILTERS: bool = false; - -fn main() { - // Build a collection. - let records = Record::many_random(DIMENSION, COLLECTION_SIZE); - let config = Config::new(EF_CONSTRUCTION, EF_SEARCH, ML, DISTANCE).unwrap(); - let collection = Collection::build(&config, &records).unwrap(); - - // Query the collection. - let mut results = Vec::new(); - let mut true_results = Vec::new(); - - // Generate random filters. - let random_int = random::(); - let filters = Filters::from(format!("integer < {random_int}")); - - for _ in 0..N_QUERIES { - let query = Vector::random(DIMENSION); - - let (result, true_result) = if WITH_FILTERS { - search_with_filters(&query, &filters, &collection) - } else { - search(&query, &collection) - }; - - results.push(result); - true_results.push(true_result); - } - - // Measure recall. - let mut correct = 0; - for _ in 0..N_QUERIES { - let result = results.pop().unwrap(); - let true_result = true_results.pop().unwrap(); - - for r in result.iter() { - if true_result.contains(r) { - correct += 1; - } - } - } - - let recall = (100 * correct) as f64 / (N_QUERIES * K) as f64; - println!("Recall Rate: {recall:.2}%"); -} - -fn search( - query: &Vector, - collection: &Collection, -) -> (Vec, Vec) { - ( - collection.search(query, K).unwrap(), - collection.true_search(query, K).unwrap(), - ) -} - -fn search_with_filters( - query: &Vector, - filters: &Filters, - collection: &Collection, -) -> (Vec, Vec) { - ( - collection.search_with_filters(query, K, filters).unwrap(), - collection.true_search_with_filters(query, K, filters).unwrap(), - ) -} diff --git a/examples/quickstart.rs b/examples/quickstart.rs deleted file mode 100644 index 8ef07456..00000000 --- a/examples/quickstart.rs +++ /dev/null @@ -1,30 +0,0 @@ -use oasysdb::prelude::*; - -fn main() { - // Vector dimension must be uniform. - let dimension = 128; - - // Replace with your own data. - let records = Record::many_random(dimension, 100); - - let mut config = Config::default(); - - // Optionally set the distance function. Default to Euclidean. - config.distance = Distance::Cosine; - - // Create a vector collection. - let collection = Collection::build(&config, &records).unwrap(); - - // Optionally save the collection to persist it. - let mut db = Database::new("data/test").unwrap(); - db.save_collection("vectors", &collection).unwrap(); - - // Search for the nearest neighbors. 
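- // This uses the approximate HNSW index; true_search is the exact brute-force counterpart.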
- let query = Vector::random(dimension); - let result = collection.search(&query, 5).unwrap(); - - for res in result { - let (id, distance) = (res.id, res.distance); - println!("{distance:.5} | ID: {id}"); - } -} diff --git a/mkdocs.yml b/mkdocs.yml index 1b02e364..1c9b675c 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -85,6 +85,5 @@ markdown_extensions: permalink: "#" exclude_docs: | - guide.md pull_request_template.md security.md diff --git a/py/example.py b/py/example.py deleted file mode 100644 index 5a30e5cb..00000000 --- a/py/example.py +++ /dev/null @@ -1,27 +0,0 @@ -# flake8: noqa F403 - -from oasysdb.prelude import * - - -if __name__ == "__main__": - # Open the database. - db = Database("data/example") - - # Replace with your own records. - records = Record.many_random(dimension=128, len=100) - - # Create a vector collection. - config = Config.create_default() - collection = Collection.from_records(config, records) - - # Optionally, persist the collection to the database. - db.save_collection("my_collection", collection) - - # Replace with your own query. - query = Vector.random(128) - - # Search for the nearest neighbors. - result = collection.search(query, n=5) - - # Print the result. - print("Nearest neighbors ID: {}".format(result[0].id)) diff --git a/py/oasysdb/__init__.py b/py/oasysdb/__init__.py deleted file mode 100644 index a853c47c..00000000 --- a/py/oasysdb/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# flake8: noqa F401 -from .oasysdb import * diff --git a/py/oasysdb/collection.pyi b/py/oasysdb/collection.pyi deleted file mode 100644 index af14570e..00000000 --- a/py/oasysdb/collection.pyi +++ /dev/null @@ -1,237 +0,0 @@ -# flake8: noqa F821 - -from typing import Any, List, Dict -from oasysdb.vector import Vector, VectorID - - -class Config: - """The configuration for the vector collection. - - Args: - - ef_construction: Nodes to consider during index construction. - - ef_search: Nodes to consider during the search. - - ml: Layer multiplier of the HNSW index. - - distance: Distance metric function. - - Distance metrics: - - euclidean - - cosine - """ - - ef_construction: int - ef_search: int - ml: float - distance: str - - def __init__( - self, - ef_construction: int, - ef_search: int, - ml: float, - distance: str - ) -> None: ... - - @staticmethod - def create_default() -> Config: - """Returns a default configuration. - - Default values: - - ef_construction: 40 - - ef_search: 15 - - ml: 0.2885 - - distance: euclidean - """ - - @staticmethod - def default() -> Config: - """Returns a default configuration object. - This is an alias of create_default method and - shared the same implementation. - """ - - -class Record: - """The vector record to store in the collection. - - Args: - - vector: Vector embedding of float values. - - data: Metadata of the vector. - - Metadata types: - - String - - Number - - Boolean - - List of metadata types - - Dictionary of metadata types - """ - - vector: Vector - data: Any - - def __init__(self, vector: List[float], data: Any) -> None: ... - - @staticmethod - def random(dimension: int) -> Record: - """Generates a random record with the given dimension - with a random integer metadata. - - Args: - - dimension: Vector dimension. - """ - - @staticmethod - def many_random(dimension: int, len: int) -> List[Record]: - """Generates a list of random records. - - Args: - - dimension: Vector dimension. - - len: Number of records. 
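- Example: records = Record.many_random(dimension=128, len=100)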
- """ - - -class Collection: - """The collection of vectors and their metadata.""" - - config: Config - dimension: int - relevancy: float - - def __init__(self, config: Config) -> None: ... - - @staticmethod - def from_records(config: Config, records: List[Record]) -> Collection: - """Build a collection from the given records. - - Args: - - config: Collection configuration. - - records: Records used to build the collection. - """ - - @staticmethod - def build(config: Config, records: List[Record]) -> Collection: - """Build a collection from the records. - This is an alias of from_records method and shared - the same implementation. - - Args: - - config: Collection configuration. - - records: Records used to build the collection. - """ - - def rebuild(self) -> Collection: - """Rebuilds the collection with the current available records.""" - - def insert(self, record: Record) -> VectorID: - """Inserts a record into the collection. - - Args: - - record: Record to insert. - """ - - def insert_many(self, records: List[Record]) -> List[VectorID]: - """Inserts multiple records into the collection. - - Args: - - records: Records to insert. - """ - - def delete(self, id: VectorID) -> None: - """Deletes a record from the collection. - - Args: - - id: Vector ID to delete. - """ - - def get(self, id: VectorID) -> Record: - """Returns a record from the collection. - - Args: - - id: Vector ID to fetch. - """ - - def list(self) -> Dict[VectorID, Record]: - """Returns a dictionary of records in the collection.""" - - def filter(self, filter: Any) -> Dict[VectorID, Record]: - """Returns a dictionary of records that match the filter. - - Args: - - filter: Metadata value to filter. - - Supported filter types: - - String: Includes the filter string. - - Number (int or float): Equal to the filter number. - - Dictionary: Matches all key-value pairs in the filter dictionary. - """ - - def update(self, id: VectorID, record: Record) -> None: - """Updates a record in the collection. - - Args: - - id: Vector ID to update. - - record: New record. - """ - - def search(self, vector: Vector, n: int) -> List[SearchResult]: - """Searches for the nearest neighbors to - the given vector using HNSW indexing algorithm - - Args: - - vector: Vector to search. - - n: Number of neighbors to return. - """ - - def search_with_filters( - self, - vector: Vector, - n: int, - filters: str - ) -> List[SearchResult]: - """Searches for the nearest neighbors to the given vector - with some SQL-like filters matching the metadata. - - Args: - - vector: Vector to search. - - n: Number of neighbors to return. - - filters: Metadata filters. - """ - - def true_search(self, vector: Vector, n: int) -> List[SearchResult]: - """Searches for the nearest neighbors using brute force. - - Args: - - vector: Vector to search. - - n: Number of neighbors to return. - """ - - def true_search_with_filters( - self, - vector: Vector, - n: int, - filters: str - ) -> List[SearchResult]: - """Searches for the nearest neighbors using linear search - with some SQL-like filters matching the metadata. - - Args: - - vector: Vector to search. - - n: Number of neighbors to return. - - filters: Metadata filters. 
- """ - - def len(self) -> int: - """Returns the number of records in the collection.""" - - def is_empty(self) -> bool: - """Returns True if the collection is empty.""" - - def contains(self, id: VectorID) -> bool: - """Returns True if the vector ID is in the collection.""" - - -class SearchResult: - """The result of a search operation on the collection.""" - - id: int - distance: float - data: Any diff --git a/py/oasysdb/database.pyi b/py/oasysdb/database.pyi deleted file mode 100644 index 5ec2e90b..00000000 --- a/py/oasysdb/database.pyi +++ /dev/null @@ -1,59 +0,0 @@ -# flake8: noqa F821 - -from oasysdb.collection import Collection, Record, Config - - -class Database: - """The persistent storage of vector collections. - - Args: - - path: Path to the database file. - """ - - def __init__(self, path: str,) -> None: ... - - def new(path: str) -> Database: - """Creates a new database at the given path. - This will reset the database if it exists. - - Args: - - path: Path to the database file. - """ - - def get_collection(self, name: str) -> Collection: - """Returns the collection with the given name. - - Args: - - name: Collection name. - """ - - def save_collection(self, name: str, collection: Collection) -> None: - """Saves new or update existing collection to the database. - - Args: - - name: Collection name. - - collection: Vector collection. - """ - - def delete_collection(self, name: str) -> None: - """Deletes the collection from the database. - - Args: - - name: Collection name. - """ - - def flush(self) -> int: - """Flushes dirty IO buffers and calls fsync. - - Returns: - Bytes flushed. - """ - - def async_flush(self) -> int: - """Asynchronously performs the flush operation.""" - - def len(self) -> int: - """Returns the number of collections in the database.""" - - def is_empty(self) -> bool: - """Returns True if the database is empty.""" diff --git a/py/oasysdb/prelude.pyi b/py/oasysdb/prelude.pyi deleted file mode 100644 index 78d063d7..00000000 --- a/py/oasysdb/prelude.pyi +++ /dev/null @@ -1,5 +0,0 @@ -# flake8: noqa F401 F403 - -from oasysdb.collection import * -from oasysdb.database import * -from oasysdb.vector import * diff --git a/py/oasysdb/py.typed b/py/oasysdb/py.typed deleted file mode 100644 index e69de29b..00000000 diff --git a/py/oasysdb/vector.pyi b/py/oasysdb/vector.pyi deleted file mode 100644 index cdb06867..00000000 --- a/py/oasysdb/vector.pyi +++ /dev/null @@ -1,31 +0,0 @@ -# flake8: noqa F821 - -from typing import List - - -class Vector: - """The vector embedding containing float values.""" - - def __init__(self, vector: List[float]) -> None: ... - - def len(self) -> int: - """Returns the length of the vector.""" - - def is_empty(self) -> bool: - """Returns True if the vector is empty.""" - - def to_list(self) -> List[float]: - """Returns the vector as a list of floats.""" - - @staticmethod - def random(dimension: int) -> Vector: - """Generates a random vector of the given dimension.""" - - -class VectorID: - """Identifier of the vector in the collection.""" - - def __init__(self, id: int) -> None: ... 
- - def is_valid(self) -> bool: - """Returns True if the vector ID is valid.""" diff --git a/py/tests/test_collection.py b/py/tests/test_collection.py deleted file mode 100644 index 0c720837..00000000 --- a/py/tests/test_collection.py +++ /dev/null @@ -1,248 +0,0 @@ -from oasysdb.prelude import Config, Record, Collection, Vector, VectorID - -DIMENSION = 128 -LEN = 100 - - -def create_test_collection() -> Collection: - """Creates a collection with random records for testing.""" - records = Record.many_random(dimension=DIMENSION, len=LEN) - config = Config.default() - collection = Collection.from_records(config=config, records=records) - - assert collection.len() == len(records) - return collection - - -def test_create_config(): - default = Config.create_default() - - # Create config based on the default. - config = Config( - ef_construction=128, - ef_search=64, - ml=0.2885, - distance="euclidean" - ) - - assert config.ef_construction == default.ef_construction - assert config.ef_search == default.ef_search - assert config.ml == default.ml - assert config.distance == default.distance - - -def test_create_record(): - vector = [0.1, 0.2, 0.3] - data = {"text": "This is an example."} - record = Record(vector=vector, data=data) - - assert len(record.vector) == len(vector) - assert record.data == data - - -def test_generate_random_record(): - record = Record.random(dimension=DIMENSION) - assert len(record.vector) == DIMENSION - assert isinstance(record.data, int) - - -def test_generate_many_random_records(): - records = Record.many_random(dimension=DIMENSION, len=LEN) - assert len(records) == LEN - - -def test_create_collection(): - config = Config.create_default() - collection = Collection(config=config) - - assert collection.config.ml == config.ml - assert collection.is_empty() - - -def test_build_collection(): - collection = create_test_collection() - assert collection.contains(VectorID(0)) - assert not collection.is_empty() - - -def test_insert_record(): - collection = create_test_collection() - record = Record.random(dimension=128) - collection.insert(record) - - assert collection.len() == LEN + 1 - assert collection.contains(VectorID(LEN)) - - -def test_insert_record_invalid_dimension(): - collection = create_test_collection() - record = Record.random(dimension=100) - - # Insert should raise an exception because the - # vector dimension is invalid. 
- try: - collection.insert(record) - assert False - except Exception as e: - assert "invalid vector dimension" in str(e).lower() - - assert collection.len() == LEN - - -def test_insert_many_records(): - collection = create_test_collection() - records = Record.many_random(dimension=DIMENSION, len=LEN) - collection.insert_many(records) - - assert collection.len() == 2 * LEN - assert all(collection.contains(VectorID(i)) for i in range(LEN, 2 * LEN)) - - -def test_delete_record(): - collection = create_test_collection() - - id = VectorID(0) - collection.delete(id) - - assert not collection.contains(id) - assert collection.len() == LEN - 1 - - -def test_get_record(): - collection = create_test_collection() - - id = VectorID(0) - record = collection.get(id) - - assert record is not None - assert record.data is not None - - -def test_update_record(): - collection = create_test_collection() - - id = VectorID(0) - record = Record.random(dimension=128) - collection.update(id, record) - - assert collection.contains(id) - assert collection.get(id).data == record.data - - -def test_search_record(): - collection = create_test_collection() - collection.relevancy = 4.5 - - vector = Vector.random(dimension=DIMENSION) - n = 10 - - # Search for approximate neighbors and true neighbors. - results = collection.search(vector, n=n) - true_results = collection.true_search(vector, n=n) - - # Make sure the first result of the approximate search - # is somewhere in the true results. - assert results[0].id in [true.id for true in true_results] - - # Check if the result distances are within the relevancy. - assert results[-1].distance <= collection.relevancy - assert true_results[-1].distance <= collection.relevancy - - -def test_set_dimension(): - config = Config.create_default() - collection = Collection(config=config) - - # Set the collection dimension to 100. - collection.dimension = 100 - - # When inserting a record with a different dimension, - # the collection should raise an exception. - try: - record = Record.random(dimension=128) - collection.insert(record) - assert False - except Exception as e: - assert "invalid vector dimension" in str(e).lower() - - -def test_list_records(): - collection = create_test_collection() - records = collection.list() - - assert len(records) == collection.len() - assert all(isinstance(k, VectorID) for k in records.keys()) - assert all(isinstance(v, Record) for v in records.values()) - - -def test_collection_distance_euclidean(): - config = Config.default() - collection = Collection(config=config) - - # Insert records. - k = 5 - records = Record.many_random(dimension=DIMENSION, len=k) - collection.insert_many(records) - - # Search for the record. - query = Vector.random(dimension=DIMENSION) - results = collection.search(query, n=k) - - # Sort result based on distance ascending. - sort = sorted(results, key=lambda x: x.distance) - - for i in range(k): - assert results[i].distance == sort[i].distance - - -def test_collection_distance_cosine(): - config = Config.create_default() - config.distance = "cosine" - collection = Collection(config=config) - - # Insert records. - k = 5 - records = Record.many_random(dimension=DIMENSION, len=k) - collection.insert_many(records) - - # Search for the record. 
- query = Vector.random(dimension=DIMENSION) - results = collection.search(query, n=k) - true_results = collection.true_search(query, n=k) - - for i in range(k): - assert results[i].distance == true_results[i].distance - - -def test_collection_filter_text(): - collection = create_test_collection() - - # Insert records with text data. - data = "OasysDB is awesome!" - vector = Vector.random(dimension=DIMENSION) - record = Record(vector=vector.to_list(), data=data) - id = Collection.insert(collection, record) - - # Search for the record using the text filter. - results = collection.filter("text CONTAINS awesome") - assert results.get(id).data == data - - -def test_collection_filter_object(): - collection = create_test_collection() - - # Sample object data. - data = { - "name": "Justin", - "age": 30, - "siblings": ["Kevin", "Luke"], - } - - # Insert records with object data. - vector = Vector.random(dimension=DIMENSION) - record = Record(vector=vector.to_list(), data=data) - id = Collection.insert(collection, record) - - # Filter records with a dict. - results = collection.filter("object.name = Justin") - assert results.get(id).data == data diff --git a/py/tests/test_database.py b/py/tests/test_database.py deleted file mode 100644 index 4bc8f36a..00000000 --- a/py/tests/test_database.py +++ /dev/null @@ -1,69 +0,0 @@ -import asyncio -from oasysdb.prelude import Record, Collection, Config, Database - - -NAME = "vectors" # Initial collection name. -DIMENSION = 128 -LEN = 100 - - -def create_test_database() -> Database: - """Creates a new test database with an initial collection.""" - - db = Database.new("data/py") - assert db.is_empty() - - # Create a test collection with random records. - records = Record.many_random(dimension=DIMENSION, len=LEN) - config = Config.create_default() - collection = Collection.from_records(config, records) - - # Save the collection to the database. - db.save_collection(name=NAME, collection=collection) - assert not db.is_empty() - - return db - - -def test_open(): - db = Database(path="data/mt") - assert db.is_empty() - - -def test_new(): - db = create_test_database() - assert not db.is_empty() - assert db.len() == 1 - - -def test_get_collection(): - db = create_test_database() - collection = db.get_collection(name=NAME) - assert collection.len() == LEN - - -def test_save_collection(): - db = create_test_database() - - # Create a new collection and save it to the database. 
- config = Config.create_default() - collection = Collection(config=config) - db.save_collection(name="test", collection=collection) - - assert db.len() == 2 - - -def test_delete_collection(): - db = create_test_database() - db.delete_collection(name=NAME) - assert db.is_empty() - - -def test_flush(): - db = create_test_database() - assert db.flush() > 0 - - -def test_async_flush(): - db = create_test_database() - assert asyncio.run(db.async_flush()) > 0 diff --git a/py/tests/test_vector.py b/py/tests/test_vector.py deleted file mode 100644 index 06594ad5..00000000 --- a/py/tests/test_vector.py +++ /dev/null @@ -1,19 +0,0 @@ -from oasysdb.prelude import Vector, VectorID - - -def test_create_vector(): - value = [0.1, 0.2, 0.3] - vector = Vector(value) - assert len(vector) == 3 - - -def test_generate_random_vector(): - dimension = 128 - vector = Vector.random(dimension) - assert len(vector) == dimension - - -def test_create_vector_id(): - id = 1 - vector_id = VectorID(id) - assert vector_id.is_valid() diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index 4be170c6..00000000 --- a/pyproject.toml +++ /dev/null @@ -1,24 +0,0 @@ -[build-system] -requires = ["maturin>=1.4,<2.0"] -build-backend = "maturin" - -[project] -name = "oasysdb" -requires-python = ">=3.8" -dynamic = ["version"] -classifiers = [ - "Intended Audience :: Developers", - "License :: OSI Approved :: Apache Software License", - "Programming Language :: Rust", - "Topic :: Database", - "Topic :: Text Processing :: Indexing", -] - -[project.urls] -repository = "https://github.com/oasysai/oasysdb" -issues = "https://github.com/oasysai/oasysdb/issues" -changelog = "https://github.com/oasysai/oasysdb/blob/main/docs/changelog.md" - -[tool.maturin] -python-source = "py" -features = ["pyo3/extension-module", "py"] diff --git a/readme.md b/readme.md index df7d39b6..e69de29b 100644 --- a/readme.md +++ b/readme.md @@ -1,202 +0,0 @@ -# 📣 Announcement - -Many thanks to everyone who has supported OasysDB in progressing this far. After thorough consideration, I have decided to pivot the project in a new direction. I will be refocusing OasysDB as a more robust vector database solution for production workloads, with ground-up support for hybrid ANN search algorithms. - -This will come with a lot of changes to the project structure, API, and functionality. I will be working on this in the coming weeks and months. I will keep you updated on the progress and the new direction of OasysDB.
~ Edwin - -![OasysDB Use Case](https://i.postimg.cc/k4x4Q55k/banner.png) - -[![GitHub Stars](https://img.shields.io/github/stars/oasysai/oasysdb?style=for-the-badge&logo=github&logoColor=%23000000&labelColor=%23fcd34d&color=%236b7280)](https://github.com/oasysai/oasysdb) -[![Discord](https://img.shields.io/badge/chat-%236b7280?style=for-the-badge&logo=discord&logoColor=%23ffffff&label=discord&labelColor=%237289da)](https://discord.gg/bDhQrkqNP4) - -[![Crates.io](https://img.shields.io/crates/d/oasysdb?style=for-the-badge&logo=rust&logoColor=%23000&label=crates.io&labelColor=%23fdba74&color=%236b7280)](https://crates.io/crates/oasysdb) -[![PyPI](https://img.shields.io/pypi/dm/oasysdb?style=for-the-badge&label=PyPI&logo=python&logoColor=ffffff&labelColor=%230284c7&color=%236b7280)](https://pypi.org/project/oasysdb/) - -[![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg?style=for-the-badge&labelColor=%2314b8a6&color=%236b7280)](https://opensource.org/licenses/Apache-2.0) -[![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.1-4baaaa.svg?style=for-the-badge&labelColor=%2314b8a6&color=%236b7280)](/docs/code_of_conduct.md) - -# 👋 Meet OasysDB - -OasysDB is a flexible and easy-to-use vector database written in Rust. It is designed with simplicity in mind to help you focus on building your AI application without worrying about database setup and configuration. - -With 3 different runtime modes, OasysDB will accompany you throughout your journey from the early stages of development to scaling up your AI application for production workloads. - -- **Embedded**: Run OasysDB directly inside your application. - **Hosted**: Run OasysDB as a standalone server. _Coming soon_ - **Distributed**: Run sharded OasysDB instances. _Coming not so soon_ 😉 - -## Use Cases - -OasysDB is very flexible! You can use it for almost any system related to vector search, such as: - -- Local RAG (Retrieval-Augmented Generation) pipeline with an LLM and embedding model to generate a context-aware output. - Image similarity search engine to find similar images based on their semantic content. [See Python demo](https://colab.research.google.com/drive/15_1hH7jGKzMeQ6IfnScjsc-iJRL5XyL7?usp=sharing). - Real-time product recommendation system to suggest similar products based on product features or user preferences. - **Add your use case here** 😁 - -## Features - -### Core Features - -🔸 **Embedded Database**: Zero setup and no dedicated server or process required. - -🔸 **Optional Persistence**: In-memory vector collections that can be persisted to disk. - -🔸 **Incremental Ops**: Insert, modify, and delete vectors without rebuilding indexes. - -🔸 **Flexible Schema**: Store additional and flexible metadata for each vector record. - -### Technical Features - -🔹 **Fast HNSW**: Efficient and accurate vector search with a state-of-the-art algorithm. - -🔹 **Configurable Metric**: Use Euclidean or Cosine distance depending on your use case. - -🔹 **Parallel Processing**: Multi-threaded & SIMD-optimized vector distance calculation. - -🔹 **Built-in vector ID**: No-headache record management with guaranteed ID uniqueness. - -# 🚀 Quickstart with Rust - -![Rust-Banner.png](https://i.postimg.cc/NMCwFBPd/Rust-Banner.png) - -To get started with OasysDB in Rust, you need to add `oasysdb` to your `Cargo.toml`. You can do so by running the command below, which will add the latest version of OasysDB to your project.
```bash -cargo add oasysdb -``` - -After that, you can use the code snippet below as a reference to get started with OasysDB. In short, use `Collection` to store your vector records or search for similar vectors, and use `Database` to persist a vector collection to disk. - -```rust -use oasysdb::prelude::*; - -// Vector dimension must be uniform. -let dimension = 128; - -// Replace with your own data. -let records = Record::many_random(dimension, 100); - -let mut config = Config::default(); - -// Optionally set the distance function. Default to Euclidean. -config.distance = Distance::Cosine; - -// Create a vector collection. -let collection = Collection::build(&config, &records).unwrap(); - -// Optionally save the collection to persist it. -let mut db = Database::new("data/test").unwrap(); -db.save_collection("vectors", &collection).unwrap(); - -// Search for the nearest neighbors. -let query = Vector::random(dimension); -let result = collection.search(&query, 5).unwrap(); - -for res in result { - let (id, distance) = (res.id, res.distance); - println!("{distance:.5} | ID: {id}"); -} -``` - -## Feature Flags - -OasysDB provides several feature flags to enable or disable certain features. You can do this by adding the feature flags to your project's `Cargo.toml` file. Below are the available feature flags and their descriptions: - -- `json`: Enables easy JSON conversion from and to the metadata type via Serde. This feature is very useful if you have a complex metadata type or if you use APIs that communicate using JSON. - -- `gen`: Enables the vector generator trait and modules to extract vector embeddings from your contents using OpenAI or other embedding models. This feature allows OasysDB to handle vector embedding extraction for you without separate dependencies. - -# 🚀 Quickstart with Python - -![Python-Banner.png](https://i.postimg.cc/rp1qjBZJ/Python-Banner.png) - -OasysDB also provides a Python binding, which allows you to add it directly to your project. You can install the Python library of OasysDB by running the command below: - -```bash -pip install oasysdb -``` - -This command will install the latest version of OasysDB to your Python environment. After you're all set with the installation, you can use the code snippet below as a reference to get started with OasysDB in Python. - -```python -from oasysdb.prelude import * - - -if __name__ == "__main__": - # Open the database. - db = Database("data/example") - - # Replace with your own records. - records = Record.many_random(dimension=128, len=100) - - # Create a vector collection. - config = Config.create_default() - collection = Collection.from_records(config, records) - - # Optionally, persist the collection to the database. - db.save_collection("my_collection", collection) - - # Replace with your own query. - query = Vector.random(128) - - # Search for the nearest neighbors. - result = collection.search(query, n=5) - - # Print the result. - print("Nearest neighbors ID: {}".format(result[0].id)) -``` - -# 🎯 Benchmarks - -OasysDB has a built-in benchmarking suite based on Rust's [Criterion](https://docs.rs/criterion) crate, which we use to measure the performance of the vector database. - -Currently, the benchmarks are focused on the performance of the collection's vector search functionality. We are working on adding more benchmarks to measure the performance of other operations. - -If you are curious and want to run the benchmarks, you can use the command below.
-
-# 🚀 Quickstart with Python
-
-![Python-Banner.png](https://i.postimg.cc/rp1qjBZJ/Python-Banner.png)
-
-OasysDB also provides a Python binding, which allows you to add OasysDB directly to your Python project. You can install the Python library of OasysDB by running the command below:
-
-```bash
-pip install oasysdb
-```
-
-This command will install the latest version of OasysDB into your Python environment. After you're all set with the installation, you can use the code snippet below as a reference to get started with OasysDB in Python.
-
-```python
-from oasysdb.prelude import *
-
-
-if __name__ == "__main__":
-    # Open the database.
-    db = Database("data/example")
-
-    # Replace with your own records.
-    records = Record.many_random(dimension=128, len=100)
-
-    # Create a vector collection.
-    config = Config.create_default()
-    collection = Collection.from_records(config, records)
-
-    # Optionally, persist the collection to the database.
-    db.save_collection("my_collection", collection)
-
-    # Replace with your own query.
-    query = Vector.random(128)
-
-    # Search for the nearest neighbors.
-    result = collection.search(query, n=5)
-
-    # Print the result.
-    print("Nearest neighbors ID: {}".format(result[0].id))
-```
-
-# 🎯 Benchmarks
-
-OasysDB includes a built-in benchmarking suite based on Rust's [Criterion](https://docs.rs/criterion) crate, which we use to measure the performance of the vector database.
-
-Currently, the benchmarks focus on the performance of the collection's vector search functionality. We are working on adding more benchmarks to measure the performance of other operations.
-
-If you are curious, you can run the benchmarks with the command below. If you do run them, please share the results with us 😉
-
-```bash
-cargo bench
-```
-
-## Memory Usage
-
-OasysDB uses HNSW, which is known to be a memory hog compared to other indexing algorithms. We chose it because of its performance, even when storing large datasets of high-dimensional vectors.
-
-If you are curious about the memory usage of OasysDB, you can use the command below to run the memory usage measurement script. You can tweak the parameters in the `examples/measure-memory.rs` file to see how the memory usage changes.
-
-```bash
-cargo run --example measure-memory
-```
-
-## Recall Rate
-
-In vector databases, recall is the percentage of relevant items that are successfully retrieved compared to the true set of relevant items, also known as the ground truth.
-
-To measure the recall rate, you can use the command below to run the recall rate measurement script. You can tweak the parameters in `examples/measure-recall.rs` to see how OasysDB performs under different requirements.
-
-```bash
-cargo run --example measure-recall
-```
-
-Note: This script uses random vector records to measure the recall rate, which might not represent the real-world performance of OasysDB on proper datasets.
-
-# 🤝 Contributing
-
-The easiest way to contribute to this project is to star it and share it with your friends. This will help us grow the community and make the project more visible to others.
-
-If you want to go further and contribute your expertise, we will gladly welcome your code contributions. For more information and guidance about this, please see [contributing.md](/docs/contributing.md).
-
-If you have deep experience in the space but don't have the free time to contribute code, we also welcome advice, suggestions, and feature requests. We are also looking for advisors to help guide the project direction and roadmap.
-
-If you are interested in the project in any way, please join us on [Discord](https://discord.gg/bDhQrkqNP4). Help us grow the community and make OasysDB better 😁
-
-## Code of Conduct
-
-We are committed to creating a welcoming community. Any participant in our project is expected to act respectfully and to follow the [Code of Conduct](/docs/code_of_conduct.md).
-
-## Disclaimer
-
-This project is still in the early stages of development. We are actively working on it, and we expect the API and functionality to change. We do not recommend using this in production yet.
diff --git a/requirements.txt b/requirements.txt
index bc03e0e5..28d36d10 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,2 @@
-pytest==8.0.2
-black==24.3.0
-flake8==7.0.0
-asyncio==3.4.3
-
 # Documentation website.
 mkdocs-material==9.5.26
diff --git a/src/db/database.rs b/src/db/database.rs
deleted file mode 100644
index 4e5fd1fb..00000000
--- a/src/db/database.rs
+++ /dev/null
@@ -1,279 +0,0 @@
-use super::*;
-
-/// The directory where collections are stored in the database.
-const COLLECTIONS_DIR: &str = "collections";
-
-/// The directory to store temporary files.
-const TMP_DIR: &str = "tmp";
-
-/// The database record for the persisted vector collection.
-#[derive(Serialize, Deserialize, Debug)]
-pub struct CollectionRecord {
-    /// Name of the collection.
-    pub name: String,
-    /// File path where the collection is stored.
-    pub path: String,
-    /// Number of vector records in the collection.
-    pub count: usize,
-    /// Timestamp when the collection was created.
-    pub created_at: usize,
-    /// Timestamp when the collection was last updated.
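-    /// Stored as milliseconds since the UNIX epoch.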
- pub updated_at: usize, -} - -/// The database storing vector collections. -#[cfg_attr(feature = "py", pyclass(module = "oasysdb.database"))] -pub struct Database { - collections: Db, - count: usize, - path: String, -} - -/// Python only methods. -#[cfg(feature = "py")] -#[pymethods] -impl Database { - #[staticmethod] - #[pyo3(name = "new")] - fn py_new(path: &str) -> PyResult { - Self::new(path).map_err(|e| e.into()) - } - - #[new] - fn py_open(path: &str) -> PyResult { - Self::open(path).map_err(|e| e.into()) - } - - fn __len__(&self) -> usize { - self.len() - } -} - -// Mixed Rust and Python methods. -#[cfg_attr(feature = "py", pymethods)] -impl Database { - /// Gets a collection from the database. - /// * `name`: Name of the collection. - pub fn get_collection(&self, name: &str) -> Result { - // Retrieve the collection record from the database. - let record: CollectionRecord = match self.collections.get(name)? { - Some(value) => bincode::deserialize(&value)?, - None => return Err(Error::collection_not_found()), - }; - - self.read_from_file(&record.path) - } - - /// Saves new or update existing collection to the database. - /// * `name`: Name of the collection. - /// * `collection`: Vector collection to save. - pub fn save_collection( - &mut self, - name: &str, - collection: &Collection, - ) -> Result<(), Error> { - // This variable is required since some operations require - // the write_to_file method to succeed. - let mut new = false; - - let mut record: CollectionRecord; - let path: String; - - // Check if it's a new collection. - if !self.collections.contains_key(name)? { - new = true; - path = self.create_new_collection_path(name)?; - - // Create a new collection record. - let timestamp = self.get_timestamp(); - record = CollectionRecord { - name: name.to_string(), - path: path.clone(), - count: collection.len(), - created_at: timestamp, - updated_at: timestamp, - }; - } else { - let bytes = self.collections.get(name)?.unwrap().to_vec(); - record = bincode::deserialize(&bytes)?; - path = record.path.clone(); - - // Update the record values. - record.count = collection.len(); - record.updated_at = self.get_timestamp(); - } - - // Write the collection to a file. - self.write_to_file(&path, collection)?; - - // Insert or update the collection record in the database. - let bytes = bincode::serialize(&record)?; - self.collections.insert(name, bytes)?; - - // If it's a new collection, update the count. - if new { - self.count += 1; - } - - Ok(()) - } - - /// Deletes a collection from the database. - /// * `name`: Collection name to delete. - pub fn delete_collection(&mut self, name: &str) -> Result<(), Error> { - let record: CollectionRecord = match self.collections.get(name)? { - Some(value) => bincode::deserialize(&value)?, - None => return Err(Error::collection_not_found()), - }; - - // Delete the collection file first before removing - // the reference from the database. - self.delete_file(&record.path)?; - - self.collections.remove(name)?; - self.count -= 1; - Ok(()) - } - - /// Returns the number of collections in the database. - pub fn len(&self) -> usize { - self.count - } - - /// Returns true if the database is empty. - pub fn is_empty(&self) -> bool { - self.count == 0 - } - - /// Flushes dirty IO buffers and syncs the data to disk. - /// Returns bytes flushed. - pub fn flush(&self) -> Result { - let bytes = self.collections.flush()?; - Ok(bytes) - } - - /// Asynchronously performs flush operation. 
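-    /// Like `flush`, it returns the number of bytes flushed.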
- pub async fn async_flush(&self) -> Result { - let bytes = self.collections.flush_async().await?; - Ok(bytes) - } -} - -impl Database { - /// Re-creates and opens the database at the given path. - /// This method will delete the database if it exists. - /// * `path`: Directory to store the database. - pub fn new(path: &str) -> Result { - // Remove the database dir if it exists. - if Path::new(path).exists() { - fs::remove_dir_all(path)?; - } - - // Setup the directory where collections will be stored. - Self::setup_collections_dir(path)?; - - // Using sled::Config to prevent name collisions - // with collection's Config. - let config = sled::Config::new().path(path); - let collections = config.open()?; - Ok(Self { collections, count: 0, path: path.to_string() }) - } - - /// Opens existing or creates new database. - /// If the database doesn't exist, it will be created. - /// * `path`: Directory to store the database. - pub fn open(path: &str) -> Result { - let collections = sled::open(path)?; - let count = collections.len(); - Self::setup_collections_dir(path)?; - Ok(Self { collections, count, path: path.to_string() }) - } - - /// Serializes and writes the collection to a file. - /// * `path`: File path to write the collection to. - /// * `collection`: Vector collection to write. - fn write_to_file( - &self, - path: &str, - collection: &Collection, - ) -> Result<(), Error> { - // Get the file name from the path. - let filename = Path::new(path).file_name().ok_or(Error { - kind: ErrorKind::IOError, - message: format!("Unable to retrieve file name: {path}"), - })?; - - // Write the collection to a temporary file first. - // This is to prevent data corruption if the process is interrupted. - let temp_path = Path::new(&self.path).join(TMP_DIR).join(filename); - let file = OpenOptions::new() - .create(true) - .write(true) - .truncate(true) - .open(&temp_path)?; - - let writer = BufWriter::new(file); - bincode::serialize_into(writer, collection)?; - - // Rename the temporary file to the original path. - // This operation is atomic and will replace the original file. - fs::rename(&temp_path, path)?; - - Ok(()) - } - - /// Reads and deserializes the collection from a file. - /// * `path`: File path to read the collection from. - fn read_from_file(&self, path: &str) -> Result { - let file = OpenOptions::new().read(true).open(path)?; - let reader = BufReader::new(file); - - // Deserialize the collection. - let collection = bincode::deserialize_from(reader)?; - Ok(collection) - } - - /// Deletes a file at the given path. - fn delete_file(&self, path: &str) -> Result<(), Error> { - fs::remove_file(path)?; - Ok(()) - } - - /// Returns the path where the collection will be stored. - /// * `name`: Name of the collection. - fn create_new_collection_path(&self, name: &str) -> Result { - // Hash the collection name to create a unique filename. - let mut hasher = DefaultHasher::new(); - name.hash(&mut hasher); - let filename = hasher.finish(); - - let path = Path::new(&self.path) - .join(COLLECTIONS_DIR) - .join(filename.to_string()) - .to_str() - .unwrap() - .to_string(); - - Ok(path) - } - - /// Creates the collections directory on the path if it doesn't exist. - fn setup_collections_dir(path: &str) -> Result<(), Error> { - let collections_dir = Path::new(path).join(COLLECTIONS_DIR); - let temp_dir = Path::new(path).join(TMP_DIR); - if !collections_dir.exists() { - fs::create_dir_all(collections_dir)?; - fs::create_dir_all(temp_dir)?; - } - - Ok(()) - } - - /// Returns the UNIX timestamp in milliseconds. 
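-    /// Used to populate the `created_at` and `updated_at` fields
-    /// of `CollectionRecord`.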
- fn get_timestamp(&self) -> usize { - let now = SystemTime::now(); - // We can unwrap safely since UNIX_EPOCH is always valid. - let timestamp = now.duration_since(UNIX_EPOCH).unwrap(); - timestamp.as_millis() as usize - } -} diff --git a/src/db/mod.rs b/src/db/mod.rs deleted file mode 100644 index ca2fcddf..00000000 --- a/src/db/mod.rs +++ /dev/null @@ -1,16 +0,0 @@ -/// The vector database storing collections. -pub mod database; - -use crate::collection::*; -use crate::func::err::{Error, ErrorKind}; -use serde::{Deserialize, Serialize}; -use sled::Db; -use std::collections::hash_map::DefaultHasher; -use std::fs::{self, OpenOptions}; -use std::hash::{Hash, Hasher}; -use std::io::{BufReader, BufWriter}; -use std::path::Path; -use std::time::{SystemTime, UNIX_EPOCH}; - -#[cfg(feature = "py")] -use pyo3::prelude::*; diff --git a/src/func/collection.rs b/src/func/collection.rs deleted file mode 100644 index 18a140d3..00000000 --- a/src/func/collection.rs +++ /dev/null @@ -1,1029 +0,0 @@ -use super::*; - -/// The collection HNSW index configuration. -#[cfg_attr(feature = "py", pyclass(module = "oasysdb.collection", get_all))] -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct Config { - /// Nodes to consider during construction. - pub ef_construction: usize, - /// Nodes to consider during search. - pub ef_search: usize, - /// Layer multiplier. The optimal value is `1/ln(M)`. - pub ml: f32, - /// Distance calculation function. - pub distance: Distance, -} - -// Any modifications to this methods should be reflected in: -// - py/tests/test_collection.py -// - py/oasysdb/collection.pyi -#[cfg(feature = "py")] -#[pymethods] -impl Config { - #[new] - fn py_new( - ef_construction: usize, - ef_search: usize, - ml: f32, - distance: &str, - ) -> Result { - Self::new(ef_construction, ef_search, ml, distance) - } - - #[setter(ef_construction)] - fn py_set_ef_construction(&mut self, ef_construction: usize) { - self.ef_construction = ef_construction; - } - - #[setter(ef_search)] - fn py_set_ef_search(&mut self, ef_search: usize) { - self.ef_search = ef_search; - } - - #[setter(ml)] - fn py_set_ml(&mut self, ml: f32) { - self.ml = ml; - } - - #[setter(distance)] - fn py_set_distance(&mut self, distance: &str) -> Result<(), Error> { - self.set_distance(distance) - } - - #[staticmethod] - fn create_default() -> Self { - Self::default() - } - - #[staticmethod] - #[pyo3(name = "default")] - fn py_default() -> Self { - Self::default() - } - - fn __repr__(&self) -> String { - format!("{:?}", self) - } -} - -impl Config { - /// Creates a new collection config with the given parameters. - pub fn new( - ef_construction: usize, - ef_search: usize, - ml: f32, - distance: &str, - ) -> Result { - let distance = Distance::from(distance)?; - Ok(Self { ef_construction, ef_search, ml, distance }) - } - - /// Sets the distance calculation function. - /// * `distance`: Distance function, e.g. euclidean. - pub fn set_distance(&mut self, distance: &str) -> Result<(), Error> { - self.distance = Distance::from(distance)?; - Ok(()) - } -} - -impl Default for Config { - /// Default configuration for the collection index. - /// * `ef_construction`: 128 - /// * `ef_search`: 64 - /// * `ml`: 0.2885 - /// * `distance`: euclidean - fn default() -> Self { - Self { - ef_construction: 128, - ef_search: 64, - ml: 0.2885, - distance: Distance::Euclidean, - } - } -} - -/// The collection of vector records with HNSW indexing. 
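-/// Records are addressed by auto-assigned `VectorID`s, and the index
-/// layers are kept in sync with the stored vectors on every mutation.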
-#[cfg_attr(feature = "py", pyclass(module = "oasysdb.collection"))] -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct Collection { - /// The collection configuration object. - pub config: Config, - /// The maximum distance to consider a neighbor. - pub relevancy: f32, - // Private fields below. - data: HashMap, - vectors: HashMap, - slots: Vec, - base_layer: Vec, - upper_layers: Vec>, - // Utility fields. - count: usize, - dimension: usize, -} - -impl Index<&VectorID> for Collection { - type Output = Vector; - fn index(&self, index: &VectorID) -> &Self::Output { - &self.vectors[index] - } -} - -// Python only methods. -#[cfg(feature = "py")] -#[pymethods] -impl Collection { - #[new] - fn py_new(config: &Config) -> Self { - Self::new(config) - } - - #[staticmethod] - fn from_records( - config: &Config, - records: Vec, - ) -> Result { - Self::build(config, &records) - } - - #[staticmethod] - #[pyo3(name = "build")] - fn py_build( - config: &Config, - records: Vec, - ) -> Result { - Self::build(config, &records) - } - - #[pyo3(name = "insert_many")] - fn py_insert_many( - &mut self, - records: Vec, - ) -> Result, Error> { - let ids = self.insert_many(&records)?; - Ok(ids) - } - - #[pyo3(name = "filter")] - fn py_filter( - &self, - filters: &str, - ) -> Result, Error> { - let filters = Filters::from(filters); - self.filter(&filters) - } - - #[pyo3(name = "search_with_filters")] - fn py_search_with_filters( - &self, - vector: &Vector, - n: usize, - filters: &str, - ) -> Result, Error> { - let filters = Filters::from(filters); - self.search_with_filters(vector, n, &filters) - } - - #[pyo3(name = "true_search_with_filters")] - fn py_true_search_with_filters( - &self, - vector: &Vector, - n: usize, - filters: &str, - ) -> Result, Error> { - let filters = Filters::from(filters); - self.true_search_with_filters(vector, n, &filters) - } - - #[getter(config)] - fn py_config(&self) -> Config { - self.config.clone() - } - - #[getter(dimension)] - fn py_dimension(&self) -> usize { - self.dimension - } - - #[setter(dimension)] - fn py_set_dimension(&mut self, dimension: usize) -> Result<(), Error> { - self.set_dimension(dimension) - } - - #[getter(relevancy)] - fn py_relevancy(&self) -> f32 { - self.relevancy - } - - #[setter(relevancy)] - fn py_set_relevancy(&mut self, relevancy: f32) { - self.relevancy = relevancy; - } -} - -// This exposes Collection methods to both Python and Rust. -// Any modifications to these methods should be reflected in: -// - py/tests/test_collection.py -// - py/oasysdb/collection.pyi -#[cfg_attr(feature = "py", pymethods)] -impl Collection { - /// Rebuilds the collection index from the current data. - pub fn rebuild(&self) -> Result { - let data_to_record = |(id, data): (&VectorID, &Metadata)| { - let vector = &self.vectors[id]; - Record::new(vector, data) - }; - - let records: Vec = - self.data.par_iter().map(data_to_record).collect(); - - Self::build(&self.config, &records) - } - - /// Inserts a vector record into the collection, and return `VectorID` if success. - /// * `record`: Vector record to insert. - pub fn insert(&mut self, record: &Record) -> Result { - // Ensure the number of records is within the limit. - if self.slots.len() == u32::MAX as usize { - return Err(Error::collection_limit()); - } - - // Ensure the vector dimension matches the collection config. - // If it's the first record, set the dimension. 
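-        // Every subsequent insert must then match this dimension.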
- if self.vectors.is_empty() && self.dimension == 0 { - self.dimension = record.vector.len(); - } else if record.vector.len() != self.dimension { - let len = record.vector.len(); - let err = Error::invalid_dimension(len, self.dimension); - return Err(err); - } - - // Create a new vector ID using the next available slot. - let id: VectorID = self.slots.len().into(); - - // Insert the new vector and data. - self.vectors.insert(id, record.vector.clone()); - self.data.insert(id, record.data.clone()); - - // Add new vector id to the slots. - self.slots.push(id); - - // Update the collection count. - self.count += 1; - - // This operation is last because it depends on - // the updated vectors data. - self.insert_to_layers(&[id]); - - Ok(id) - } - - /// Deletes a vector record from the collection. - /// * `id`: Vector ID to delete. - pub fn delete(&mut self, id: &VectorID) -> Result<(), Error> { - // Ensure the vector ID exists in the collection. - if !self.contains(id) { - return Err(Error::record_not_found()); - } - - self.delete_from_layers(&[*id]); - - // Update the collection data. - self.vectors.remove(id); - self.data.remove(id); - - // Make the slot invalid so it won't be used again. - self.slots[id.0 as usize] = INVALID; - - // Update the collection count. - self.count -= 1; - - Ok(()) - } - - /// Returns vector records in the collection as a HashMap. - pub fn list(&self) -> Result, Error> { - // Early return if the collection is empty. - if self.vectors.is_empty() { - return Ok(HashMap::new()); - } - - // Map the vectors to a hashmap of records. - let mapper = |(id, vector): (&VectorID, &Vector)| { - let data = self.data[id].clone(); - let record = Record::new(vector, &data); - (*id, record) - }; - - let records = self.vectors.par_iter().map(mapper).collect(); - Ok(records) - } - - /// Returns the vector record associated with the ID. - /// * `id`: Vector ID to retrieve. - pub fn get(&self, id: &VectorID) -> Result { - if !self.contains(id) { - return Err(Error::record_not_found()); - } - - let vector = self.vectors[id].clone(); - let data = self.data[id].clone(); - Ok(Record::new(&vector, &data)) - } - - /// Updates a vector record in the collection. - /// * `id`: Vector ID to update. - /// * `record`: New vector record. - pub fn update( - &mut self, - id: &VectorID, - record: &Record, - ) -> Result<(), Error> { - if !self.contains(id) { - return Err(Error::record_not_found()); - } - - // Validate the new vector dimension. - self.validate_dimension(&record.vector)?; - - // Remove the old vector from the index layers. - self.delete_from_layers(&[*id]); - - // Insert the updated vector and data. - self.vectors.insert(*id, record.vector.clone()); - self.data.insert(*id, record.data.clone()); - self.insert_to_layers(&[*id]); - - Ok(()) - } - - /// Searches the collection for the nearest neighbors. - /// * `vector`: Vector to search. - /// * `n`: Number of neighbors to return. - pub fn search( - &self, - vector: &Vector, - n: usize, - ) -> Result, Error> { - // Early return if the collection is empty. - if self.vectors.is_empty() { - return Ok(vec![]); - } - - // Ensure the vector dimension matches the collection dimension. 
- self.validate_dimension(vector)?; - - let entrypoint = { - let slots_iter = self.slots.as_slice().into_par_iter(); - match slots_iter.find_first(|id| id.is_valid()) { - Some(id) => id, - None => { - let kind = ErrorKind::CollectionError; - let message = "Unable to initiate search."; - return Err(Error::new(&kind, message)); - } - } - }; - - self.search_from_layers(vector, entrypoint, &self.vectors, n) - } - - /// Searches the collection for the true nearest neighbors. - /// * `vector`: Vector to search. - /// * `n`: Number of neighbors to return. - pub fn true_search( - &self, - vector: &Vector, - n: usize, - ) -> Result, Error> { - let filters = Filters::NONE; - self.true_search_with_filters(vector, n, &filters) - } - - /// Returns the number of vector records in the collection. - pub fn len(&self) -> usize { - self.count - } - - /// Returns true if the collection is empty. - pub fn is_empty(&self) -> bool { - self.count == 0 - } - - /// Checks if the collection contains a vector ID. - /// * `id`: Vector ID to check. - pub fn contains(&self, id: &VectorID) -> bool { - self.vectors.contains_key(id) - } - - fn __len__(&self) -> usize { - self.len() - } -} - -impl Collection { - /// Creates an empty collection with the given configuration. - pub fn new(config: &Config) -> Self { - Self { - count: 0, - dimension: 0, - relevancy: -1.0, - config: config.clone(), - data: HashMap::new(), - vectors: HashMap::new(), - slots: vec![], - base_layer: vec![], - upper_layers: vec![], - } - } - - /// Builds the collection index from vector records. - /// * `config`: Collection configuration. - /// * `records`: List of vectors to build the index from. - pub fn build(config: &Config, records: &[Record]) -> Result { - if records.is_empty() { - return Ok(Self::new(config)); - } - - // Ensure the number of records is within the limit. - if records.len() >= u32::MAX as usize { - let kind = ErrorKind::CollectionError; - let message = format!( - "The collection record limit is {}. Given: {}", - u32::MAX, - records.len() - ); - - return Err(Error::new(&kind, &message)); - } - - // Ensure that the vector dimension is consistent. - let dimension = records[0].vector.len(); - if records.par_iter().any(|i| i.vector.len() != dimension) { - let kind = ErrorKind::CollectionError; - let message = format!( - "The vector dimension is inconsistent. Expected: {}.", - dimension - ); - - return Err(Error::new(&kind, &message)); - } - - // Find the number of layers. - - let mut len = records.len(); - let mut layers = Vec::new(); - - loop { - let next = (len as f32 * config.ml) as usize; - - if next < M { - break; - } - - layers.push((len - next, len)); - len = next; - } - - layers.push((len, len)); - layers.reverse(); - - let num_layers = layers.len(); - let top_layer = LayerID(num_layers - 1); - - // Give all vectors a random layer and sort the list of nodes - // by descending order for construction. - - // This allows us to copy higher layers to lower layers as - // construction progresses, while preserving randomness in - // each point's layer and insertion order. - - let vectors = records - .par_iter() - .enumerate() - .map(|(i, item)| (i.into(), item.vector.clone())) - .collect::>(); - - // Figure out how many nodes will go on each layer. - // This helps us allocate memory capacity for each - // layer in advance, and also helps enable batch - // insertion of points. 
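-        // Each range pairs a layer ID with the slice of record indices
-        // whose topmost layer it is.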
- - let mut ranges = Vec::with_capacity(top_layer.0); - for (i, (size, cumulative)) in layers.into_iter().enumerate() { - let start = cumulative - size; - let layer_id = LayerID(num_layers - i - 1); - let value = max(start, 1)..cumulative; - ranges.push((layer_id, value)); - } - - // Create index constructor. - - let search_pool = SearchPool::new(vectors.len(), config.distance); - let mut upper_layers = vec![vec![]; top_layer.0]; - let base_layer = vectors - .par_iter() - .map(|_| RwLock::new(BaseNode::default())) - .collect::>(); - - let state = IndexConstruction { - base_layer: &base_layer, - search_pool, - top_layer, - vectors: &vectors, - config, - }; - - // Initialize data for layers. - - for (layer, range) in ranges { - let end = range.end; - - range.into_par_iter().for_each(|i: usize| { - state.insert(&i.into(), &layer, &upper_layers) - }); - - // Copy the base layer state to the upper layer. - if !layer.is_zero() { - (&state.base_layer[..end]) - .into_par_iter() - .map(|zero| UpperNode::from_zero(&zero.read())) - .collect_into_vec(&mut upper_layers[layer.0 - 1]); - } - } - - let data = records - .iter() - .enumerate() - .map(|(i, item)| (i.into(), item.data.clone())) - .collect(); - - // Unwrap the base nodes for the base layer. - let base_iter = base_layer.into_par_iter(); - let base_layer = base_iter.map(|node| node.into_inner()).collect(); - - // Add IDs to the slots. - let slots = (0..vectors.len()).map(|i| i.into()).collect(); - - Ok(Self { - data, - vectors, - base_layer, - upper_layers, - slots, - dimension, - config: config.clone(), - count: records.len(), - relevancy: -1.0, - }) - } - - /// Inserts multiple vector records into the collection. - /// * `records`: List of vector records to insert. - pub fn insert_many( - &mut self, - records: &[Record], - ) -> Result, Error> { - // Make sure the collection is not full after inserting. - if self.slots.len() + records.len() >= u32::MAX as usize { - return Err(Error::collection_limit()); - } - - // Sets the collection dimension if it's the first record. - if self.vectors.is_empty() && self.dimension == 0 { - self.dimension = records[0].vector.len(); - } - - // Validate the vector dimension against the collection. - if records.par_iter().any(|i| i.vector.len() != self.dimension) { - let kind = ErrorKind::CollectionError; - let message = format!( - "The vector dimension is inconsistent. Expected: {}.", - self.dimension - ); - - return Err(Error::new(&kind, &message)); - } - - // Create new vector IDs for the records. - let ids: Vec = { - let first_id = self.slots.len(); - let final_id = self.slots.len() + records.len(); - (first_id..final_id).map(|i| i.into()).collect() - }; - - // Store the new records vector and data. - for (id, record) in ids.iter().zip(records.iter()) { - self.vectors.insert(*id, record.vector.clone()); - self.data.insert(*id, record.data.clone()); - } - - // Add new vector IDs to the slots. - self.slots.extend(ids.clone()); - - // Update the collection count. - self.count += records.len(); - - self.insert_to_layers(&ids); - Ok(ids) - } - - /// Filters the collection metadata to get matching vector records. - /// * `filters`: Filters to apply to the metadata. - pub fn filter( - &self, - filters: &Filters, - ) -> Result, Error> { - // Map the metadata HashMap to vector records HashMap. 
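-        // Vectors are cloned so the returned records own their data.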
- let create_record_map = |(id, data): (&VectorID, &Metadata)| { - let vector = self.vectors[id].clone(); - let record = Record::new(&vector, data); - (*id, record) - }; - - let records = self - .data - .par_iter() - .filter(|(_, data)| filters.match_metadata(data)) - .map(create_record_map) - .collect(); - - Ok(records) - } - - /// Searches the collection for the nearest neighbors with filters. - /// * `vector`: Vector to search. - /// * `n`: Number of neighbors to return. - /// * `filters`: Filters to apply to the metadata. - pub fn search_with_filters( - &self, - vector: &Vector, - n: usize, - filters: &Filters, - ) -> Result, Error> { - // Early return if the collection is empty. - if self.vectors.is_empty() { - return Ok(vec![]); - } - - // Ensure the vector dimension matches the collection dimension. - self.validate_dimension(vector)?; - - // Filter the vectors based on the metadata if filters are provided. - let vectors = if filters == &Filters::NONE { - self.vectors.clone() - } else { - self.data - .par_iter() - .filter(|(_, data)| filters.match_metadata(data)) - .map(|(id, _)| (*id, self.vectors[id].clone())) - .collect() - }; - - // Perform a brute-force search if the number of vectors is small. - if vectors.len() <= self.config.ef_search { - return self.true_search_with_filters(vector, n, filters); - } - - // Collect the vector IDs from the layers. - let vector_ids_in_layer: Vec = - if !self.upper_layers.is_empty() { - let top_layer = LayerID(self.upper_layers.len() - 1); - let nodes = &self.upper_layers[top_layer.0]; - nodes.iter().map(|node| node.0[0]).collect() - } else { - self.base_layer.iter().map(|node| node.0[0]).collect() - }; - - // Collection vector IDs in the filtered vector data. - let filtered_vector_ids: Vec = - vectors.keys().cloned().collect(); - - // Get the intersection of the vector IDs. - let vectors_ids: Vec = filtered_vector_ids - .into_par_iter() - .filter(|id| vector_ids_in_layer.contains(id)) - .collect(); - - let entrypoint = if vectors_ids.is_empty() { - self.select_search_entrypoint() - } else { - vectors_ids[random::() % vectors_ids.len()] - }; - - self.search_from_layers(vector, &entrypoint, &vectors, n) - } - - /// Searches the collection for the true nearest neighbors with filters. - /// * `vector`: Vector to search. - /// * `n`: Number of neighbors to return. - /// * `filters`: Filters to apply to the metadata. - pub fn true_search_with_filters( - &self, - vector: &Vector, - n: usize, - filters: &Filters, - ) -> Result, Error> { - let mut nearest = Vec::with_capacity(self.vectors.len()); - - // Ensure the vector dimension matches the collection dimension. - self.validate_dimension(vector)?; - - // Filter the vectors based on the metadata if filters are provided. - let vectors = if filters == &Filters::NONE { - self.vectors.clone() - } else { - self.data - .par_iter() - .filter(|(_, data)| filters.match_metadata(data)) - .map(|(id, _)| (*id, self.vectors[id].clone())) - .collect() - }; - - // Calculate the distance between the query and each record. - // Then, create a search result for each record. - for (id, vec) in vectors.iter() { - let distance = self.config.distance.calculate(vector, vec); - let data = self.data[id].clone(); - let res = SearchResult { id: id.0, distance, data }; - nearest.push(res); - } - - // Sort the results by distance in ascending order. - // The closest the distance, the better the match. 
- let sort_ascending = |a: &SearchResult, b: &SearchResult| { - a.distance.partial_cmp(&b.distance).unwrap() - }; - - nearest.par_sort_by(sort_ascending); - - // Remove irrelevant results and truncate the list. - let mut res = self.truncate_irrelevant_result(nearest); - res.truncate(n); - Ok(res) - } - - /// Returns the configured vector dimension of the collection. - pub fn dimension(&self) -> usize { - self.dimension - } - - /// Sets the vector dimension of the collection. - /// * `dimension`: New vector dimension. - pub fn set_dimension(&mut self, dimension: usize) -> Result<(), Error> { - // This can only be set if the collection is empty. - if !self.vectors.is_empty() { - let kind = ErrorKind::CollectionError; - let message = "Collection must be empty to set dimension."; - return Err(Error::new(&kind, message)); - } - - self.dimension = dimension; - Ok(()) - } - - /// Sets the min/max relevancy for the search results. - /// * `relevancy`: Relevancy score. - pub fn set_relevancy(&mut self, relevancy: f32) { - self.relevancy = relevancy; - } - - /// Validates a vector dimension against the collection's. - fn validate_dimension(&self, vector: &Vector) -> Result<(), Error> { - let found = vector.len(); - let expected = self.dimension; - - if found != expected { - Err(Error::invalid_dimension(found, expected)) - } else { - Ok(()) - } - } - - /// Inserts vector IDs into the index layers. - fn insert_to_layers(&mut self, ids: &[VectorID]) { - // Add new nodes to the base layer. - for _ in 0..ids.len() { - self.base_layer.push(BaseNode::default()); - } - - let base_layer = self - .base_layer - .par_iter() - .map(|node| RwLock::new(*node)) - .collect::>(); - - let top_layer = match self.upper_layers.is_empty() { - true => LayerID(0), - false => LayerID(self.upper_layers.len()), - }; - - // Create a new index construction state. - let state = IndexConstruction { - top_layer, - base_layer: base_layer.as_slice(), - vectors: &self.vectors, - config: &self.config, - search_pool: SearchPool::new( - self.vectors.len(), - self.config.distance, - ), - }; - - // Insert all vectors into the state. - for id in ids { - state.insert(id, &top_layer, &self.upper_layers); - } - - // Update base layer using the new state. - let iter = state.base_layer.into_par_iter(); - self.base_layer = iter.map(|node| *node.read()).collect(); - } - - /// Removes vector IDs from all index layers. - fn delete_from_layers(&mut self, ids: &[VectorID]) { - // Remove the vectors from the base layer. - for id in ids { - let base_node = &mut self.base_layer[id.0 as usize]; - let index = base_node.par_iter().position_first(|x| *x == *id); - if let Some(index) = index { - base_node.set(index, &INVALID); - } - } - - // Remove the vector from the upper layers. 
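-        // Note that upper_layers is offset by one: LayerID(n) is stored
-        // at upper_layers[n - 1], and LayerID(0) is the base layer.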
- for layer in LayerID(self.upper_layers.len()).descend() { - let upper_layer = match layer.0 > 0 { - true => &mut self.upper_layers[layer.0 - 1], - false => break, - }; - - for id in ids { - let node = &mut upper_layer[id.0 as usize]; - let index = node.0.par_iter().position_first(|x| *x == *id); - if let Some(index) = index { - node.set(index, &INVALID); - } - } - } - } - - fn search_from_layers( - &self, - vector: &Vector, - entrypoint: &VectorID, - vectors: &HashMap, - n: usize, - ) -> Result, Error> { - let mut search = Search::new(0, self.config.distance); - search.visited.resize_capacity(vectors.len()); - search.push(entrypoint, vector, vectors); - - for layer in LayerID(self.upper_layers.len()).descend() { - search.ef = self.config.ef_search; - - if layer.is_zero() { - let layer = self.base_layer.as_slice(); - search.search(layer, vector, vectors, M * 2); - } else { - let layer = self.upper_layers[layer.0 - 1].as_slice(); - search.search(layer, vector, vectors, M); - search.cull(); - } - } - - let map_result = |candidate: Candidate| { - let id = candidate.vector_id.0; - let distance = candidate.distance.0; - let data = self.data[&candidate.vector_id].clone(); - SearchResult { id, distance, data } - }; - - // Truncate the list based on the relevancy score. - let res = search.iter().map(map_result).collect(); - let mut relevant = self.truncate_irrelevant_result(res); - relevant.truncate(n); - Ok(relevant) - } - - /// Truncates the search result based on the relevancy score. - fn truncate_irrelevant_result( - &self, - result: Vec, - ) -> Vec { - // Early return if the relevancy score is not set. - if self.relevancy == -1.0 { - return result; - } - - result - .into_par_iter() - .filter(|r| r.distance <= self.relevancy) - .collect() - } - - /// Selects a random vector ID to start the search. - fn select_search_entrypoint(&self) -> VectorID { - if !self.upper_layers.is_empty() { - let top_layer = LayerID(self.upper_layers.len() - 1); - let nodes = &self.upper_layers[top_layer.0]; - let random_node = nodes[random::() % nodes.len()]; - random_node.0[0] - } else { - let index = random::() % self.base_layer.len(); - let node = &self.base_layer[index]; - node.0[0] - } - } -} - -/// A record containing a vector and its associated data. -#[cfg_attr(feature = "py", pyclass(module = "oasysdb.collection", get_all))] -#[derive(Serialize, Deserialize, Clone, Debug)] -pub struct Record { - /// The vector embedding. - pub vector: Vector, - /// Data associated with the vector. - pub data: Metadata, -} - -// Any modifications to the Python methods should be reflected in: -// - py/tests/test_collection.py -// - py/oasysdb/collection.pyi -#[cfg(feature = "py")] -#[pymethods] -impl Record { - #[new] - fn py_new(vector: Vec, data: &PyAny) -> Self { - let vector = Vector::from(vector); - let data = Metadata::from(data); - Self::new(&vector, &data) - } - - #[setter(vector)] - fn py_set_vector(&mut self, vector: Vec) { - self.vector = Vector::from(vector); - } - - #[setter(data)] - fn py_set_data(&mut self, data: &PyAny) -> Result<(), Error> { - self.data = Metadata::from(data); - Ok(()) - } - - #[staticmethod] - #[pyo3(name = "random")] - fn py_random(dimension: usize) -> Self { - Record::random(dimension) - } - - #[staticmethod] - #[pyo3(name = "many_random")] - fn py_many_random(dimension: usize, len: usize) -> Vec { - Record::many_random(dimension, len) - } - - fn __repr__(&self) -> String { - format!("{:?}", self) - } -} - -impl Record { - /// Creates a new record with a vector and data. 
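-    ///
-    /// A minimal sketch, assuming the prelude re-exports these types:
-    ///
-    /// ```
-    /// use oasysdb::prelude::*;
-    ///
-    /// let vector = Vector::random(128);
-    /// let data = Metadata::Integer(42);
-    /// let record = Record::new(&vector, &data);
-    /// ```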
- pub fn new(vector: &Vector, data: &Metadata) -> Self { - Self { vector: vector.clone(), data: data.clone() } - } - - /// Generates a random record for testing. - /// * `dimension`: Vector dimension. - pub fn random(dimension: usize) -> Self { - let vector = Vector::random(dimension); - let data = random::().into(); - Self::new(&vector, &data) - } - - /// Generates many random records for testing. - /// * `dimension`: Vector dimension. - /// * `len`: Number of records to generate. - pub fn many_random(dimension: usize, len: usize) -> Vec { - (0..len).map(|_| Self::random(dimension)).collect() - } -} - -/// The collection nearest neighbor search result. -#[cfg_attr(feature = "py", pyclass(module = "oasysdb.collection", get_all))] -#[derive(Serialize, Deserialize, Debug, PartialEq)] -pub struct SearchResult { - /// Vector ID. - pub id: u32, - /// Distance between the query to the collection vector. - pub distance: f32, - /// Data associated with the vector. - pub data: Metadata, -} - -#[cfg(feature = "py")] -impl SearchResult { - fn __repr__(&self) -> String { - format!("{:?}", self) - } -} diff --git a/src/func/distance.rs b/src/func/distance.rs deleted file mode 100644 index 95d28d78..00000000 --- a/src/func/distance.rs +++ /dev/null @@ -1,63 +0,0 @@ -use super::*; - -/// The distance function used for similarity calculations. -#[derive(Debug, Serialize, Deserialize, Clone, Copy)] -#[derive(PartialEq, Eq, Hash, PartialOrd, Ord)] -pub enum Distance { - /// Euclidean distance function. - Euclidean, - /// Cosine distance function (1 - Cosine similarity). - Cosine, -} - -impl Distance { - /// Creates a new distance function from a string. - /// Available options: - /// * `euclidean`: Euclidean distance function. - /// * `cosine`: Cosine similarity function. - pub fn from(distance: &str) -> Result { - match distance { - "euclidean" => Ok(Distance::Euclidean), - "cosine" => Ok(Distance::Cosine), - _ => Err(Error::invalid_distance()), - } - } - - /// Calculates the distance between two vectors. - pub fn calculate(&self, a: &Vector, b: &Vector) -> f32 { - assert_eq!(a.0.len(), b.0.len()); - match self { - Distance::Euclidean => Distance::euclidean(a, b), - Distance::Cosine => Distance::cosine(a, b), - } - } - - // List additional distance functions below. - - fn cosine(a: &Vector, b: &Vector) -> f32 { - f32::cosine(&a.0, &b.0).unwrap() as f32 - } - - fn euclidean(a: &Vector, b: &Vector) -> f32 { - let sq = f32::sqeuclidean(&a.0, &b.0).unwrap() as f32; - sq.sqrt() - } -} - -#[cfg(feature = "py")] -impl From<&PyAny> for Distance { - fn from(distance: &PyAny) -> Self { - let distance = distance.str().unwrap().to_string(); - Distance::from(&distance).unwrap() - } -} - -#[cfg(feature = "py")] -impl IntoPy> for Distance { - fn into_py(self, py: Python) -> Py { - match self { - Distance::Euclidean => "euclidean".into_py(py), - Distance::Cosine => "cosine".into_py(py), - } - } -} diff --git a/src/func/err.rs b/src/func/err.rs deleted file mode 100644 index 7bbe4d9b..00000000 --- a/src/func/err.rs +++ /dev/null @@ -1,160 +0,0 @@ -use std::fmt::{Display, Formatter, Result}; - -// Other error types. -use bincode::ErrorKind as BincodeError; -use sled::Error as SledError; -use std::error::Error as StandardError; -use std::io::Error as IOError; - -#[cfg(feature = "py")] -use super::*; - -#[cfg(feature = "py")] -use pyo3::exceptions::PyValueError; - -#[cfg(feature = "gen")] -use reqwest::Error as ReqwestError; - -#[cfg(feature = "json")] -use serde_json::Error as JSONError; - -/// The type of error. 
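-/// Each variant is a broad category that `Error` uses to classify failures.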
-#[allow(missing_docs)] -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum ErrorKind { - StandardError, - IOError, - DatabaseError, - CollectionError, - DistanceError, - SerializationError, - RequestError, -} - -/// A custom error object with error type and message. -#[derive(Debug)] -pub struct Error { - /// Type of error. - pub kind: ErrorKind, - /// Why the error occurred. - pub message: String, -} - -impl Error { - /// Create a new error with the given message. - pub fn new(kind: &ErrorKind, message: &str) -> Self { - Self { kind: *kind, message: message.to_string() } - } - - /// Returns the error message. - pub fn message(&self) -> &str { - &self.message - } - - // Common errors. - - /// Creates error: The collection is not found. - pub fn collection_not_found() -> Self { - let message = "The collection is not found."; - let kind = ErrorKind::DatabaseError; - Error::new(&kind, message) - } - - /// Creates error when the collection record limit is reached. - pub fn collection_limit() -> Self { - let max = u32::MAX; - let brief = "The collection limit is reached."; - let detail = format!("The max number of records is {max}."); - - let message = format!("{brief} {detail}"); - let kind = ErrorKind::CollectionError; - Error::new(&kind, &message) - } - - /// Creates error when vector record is not found. - pub fn record_not_found() -> Self { - let message = "The vector record is not found."; - let kind = ErrorKind::CollectionError; - Error::new(&kind, message) - } - - /// Creates error when getting vector with invalid dimension. - pub fn invalid_dimension(found: usize, expected: usize) -> Self { - let brief = "Invalid vector dimension."; - let detail = format!("Expected {expected}, found {found}."); - - let message = format!("{brief} {detail}"); - let kind = ErrorKind::CollectionError; - Error::new(&kind, &message) - } - - /// Error when the distance function is not supported. - pub fn invalid_distance() -> Self { - let message = "Distance function not supported."; - let kind = ErrorKind::DistanceError; - Error::new(&kind, message) - } -} - -impl Display for Error { - fn fmt(&self, f: &mut Formatter) -> Result { - let kind = &self.kind; - let message = &self.message; - write!(f, "{kind:?}: {message}") - } -} - -// Interoperability with other error types. 
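-// These `From` implementations let internal code use the `?` operator
-// on sled, IO, and serialization results.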
- -impl StandardError for Error {} - -impl From> for Error { - fn from(err: Box) -> Self { - let kind = ErrorKind::StandardError; - Error::new(&kind, &err.to_string()) - } -} - -impl From for Error { - fn from(err: SledError) -> Self { - let kind = ErrorKind::DatabaseError; - Error::new(&kind, &err.to_string()) - } -} - -impl From for Error { - fn from(err: IOError) -> Self { - let kind = ErrorKind::IOError; - Error::new(&kind, &err.to_string()) - } -} - -impl From> for Error { - fn from(err: Box) -> Self { - let kind = ErrorKind::SerializationError; - Error::new(&kind, &err.to_string()) - } -} - -#[cfg(feature = "py")] -impl From for PyErr { - fn from(err: Error) -> Self { - PyErr::new::(err.message) - } -} - -#[cfg(feature = "gen")] -impl From for Error { - fn from(err: ReqwestError) -> Self { - let kind = ErrorKind::RequestError; - Error::new(&kind, &err.to_string()) - } -} - -#[cfg(feature = "json")] -impl From for Error { - fn from(err: JSONError) -> Self { - let kind = ErrorKind::SerializationError; - Error::new(&kind, &err.to_string()) - } -} diff --git a/src/func/filter.rs b/src/func/filter.rs deleted file mode 100644 index 661a2b48..00000000 --- a/src/func/filter.rs +++ /dev/null @@ -1,419 +0,0 @@ -use super::*; - -const AND: &str = " AND "; -const OR: &str = " OR "; - -/// The filters to apply to the collection metadata. -#[derive(Debug, PartialEq)] -pub enum Filters { - /// Results must match all filters. - AND(Vec), - /// Results must match at least one filter. - OR(Vec), - /// No filters to apply. - NONE, -} - -impl Filters { - /// Matches the collection metadata against the filters. - pub fn match_metadata(&self, metadata: &Metadata) -> bool { - let evaluate = |f: &Filter| f.match_metadata(metadata); - match self { - Filters::AND(filters) => filters.iter().all(evaluate), - Filters::OR(filters) => filters.iter().any(evaluate), - Filters::NONE => true, - } - } -} - -impl From<&str> for Filters { - fn from(filters: &str) -> Self { - if filters.is_empty() { - return Filters::NONE; - } - - // Check which join operator is used. - let or_count = filters.matches(OR).count(); - let and_count = filters.matches(AND).count(); - - let join = if or_count > 0 && and_count > 0 { - panic!("Mixing AND and OR join operators is not supported."); - } else if or_count > 0 { - OR - } else { - // If no join operator is found, use AND since it doesn't matter. - AND - }; - - // Split the filters. - let filters = filters.split(join).map(Into::into).collect(); - match join { - OR => Filters::OR(filters), - _ => Filters::AND(filters), - } - } -} - -impl From for Filters { - fn from(filters: String) -> Self { - Filters::from(filters.as_str()) - } -} - -/// The basic filter operator to use to compare with metadata. -#[allow(missing_docs)] -#[derive(Debug, Clone, PartialEq)] -pub enum FilterOperator { - Equal, - NotEqual, - GreaterThan, - GreaterThanOrEqual, - LessThan, - LessThanOrEqual, - Contains, -} - -// String type representing the filter key type. -// This helps us prevent typos and make the code more readable. -const TEXT: &str = "text"; -const INTEGER: &str = "integer"; -const FLOAT: &str = "float"; -const BOOLEAN: &str = "boolean"; -const ARRAY: &str = "array"; -const OBJECT: &str = "object"; - -/// The filter to match against the collection metadata. -#[derive(Debug, PartialEq)] -pub struct Filter { - /// Metadata key to filter. - pub key: String, - /// The filter value to match against. - pub value: Metadata, - /// Filter operator to use for matching. 
- pub operator: FilterOperator, -} - -impl Filter { - /// Creates a new filter instance. - /// * `key`: Key to filter. - /// * `value`: Value to use for filtering. - /// * `operator`: Filter operator. - pub fn new(key: &str, value: &Metadata, operator: &FilterOperator) -> Self { - Self::validate_filter(key, value, operator); - Self { - key: key.to_string(), - value: value.clone(), - operator: operator.clone(), - } - } - - /// Matches the collection metadata against the filter. - pub fn match_metadata(&self, metadata: &Metadata) -> bool { - let key_parts: Vec<&str> = self.key.split('.').collect(); - let key_type = key_parts[0]; - - match key_type { - TEXT => self.match_text(metadata), - INTEGER => self.match_integer(metadata), - FLOAT => self.match_float(metadata), - BOOLEAN => self.match_boolean(metadata), - ARRAY => self.match_array(metadata), - OBJECT => self.match_object(metadata), - // This should never happen because we validate the key type. - _ => panic!("Unsupported filter key type: {key_type}"), - } - } - - fn match_text(&self, metadata: &Metadata) -> bool { - let text = match metadata { - Metadata::Text(text) => text, - _ => return false, - }; - - let filter_text = match &self.value { - Metadata::Text(text) => text, - _ => return false, - }; - - match &self.operator { - FilterOperator::Equal => text == filter_text, - FilterOperator::NotEqual => text != filter_text, - FilterOperator::Contains => text.contains(filter_text), - _ => false, - } - } - - fn match_integer(&self, metadata: &Metadata) -> bool { - let int = match metadata { - Metadata::Integer(int) => int, - _ => return false, - }; - - let filter_int = match &self.value { - Metadata::Integer(int) => int, - _ => return false, - }; - - match &self.operator { - FilterOperator::Equal => int == filter_int, - FilterOperator::NotEqual => int != filter_int, - FilterOperator::GreaterThan => int > filter_int, - FilterOperator::GreaterThanOrEqual => int >= filter_int, - FilterOperator::LessThan => int < filter_int, - FilterOperator::LessThanOrEqual => int <= filter_int, - _ => false, - } - } - - fn match_float(&self, metadata: &Metadata) -> bool { - let float = match metadata { - Metadata::Float(float) => float, - _ => return false, - }; - - let filter_float = match &self.value { - Metadata::Float(float) => float, - _ => return false, - }; - - match &self.operator { - FilterOperator::Equal => float == filter_float, - FilterOperator::NotEqual => float != filter_float, - FilterOperator::GreaterThan => float > filter_float, - FilterOperator::GreaterThanOrEqual => float >= filter_float, - FilterOperator::LessThan => float < filter_float, - FilterOperator::LessThanOrEqual => float <= filter_float, - _ => false, - } - } - - fn match_boolean(&self, metadata: &Metadata) -> bool { - let bool = match metadata { - Metadata::Boolean(bool) => bool, - _ => return false, - }; - - let filter_bool = match &self.value { - Metadata::Boolean(bool) => bool, - _ => return false, - }; - - match &self.operator { - FilterOperator::Equal => bool == filter_bool, - FilterOperator::NotEqual => bool != filter_bool, - _ => false, - } - } - - fn match_array(&self, metadata: &Metadata) -> bool { - let array = match metadata { - Metadata::Array(arr) => arr, - _ => return false, - }; - - match &self.operator { - FilterOperator::Contains => array.contains(&self.value), - _ => self.match_array_value(array), - } - } - - fn match_array_value(&self, array: &[Metadata]) -> bool { - let key_parts: Vec<&str> = self.key.split('.').collect(); - - // This has been validated in 
the `validate_filter` method. - // So, we can safely unwrap it here. - let index = key_parts[1].parse::().unwrap(); - let value = match array.get(index) { - Some(value) => value, - None => return false, - }; - - self.match_subvalue(value) - } - - fn match_object(&self, metadata: &Metadata) -> bool { - let object = match metadata { - Metadata::Object(obj) => obj, - _ => return false, - }; - - let key_parts: Vec<&str> = self.key.split('.').collect(); - let value = match object.get(key_parts[1]) { - Some(value) => value, - None => return false, - }; - - self.match_subvalue(value) - } - - /// Creates a sub-filter to match primitive value of an array or object. - fn match_subvalue(&self, value: &Metadata) -> bool { - // We expect the value we retrieve from the object to be a primitive type. - // So we create a sub-filter to match the value. - let subfilter_key = match value { - Metadata::Text(_) => TEXT, - Metadata::Integer(_) => INTEGER, - Metadata::Float(_) => FLOAT, - Metadata::Boolean(_) => BOOLEAN, - _ => panic!("Unsupported 2nd level array or object to filter."), - }; - - let subfilter = Filter::new(subfilter_key, &self.value, &self.operator); - subfilter.match_metadata(value) - } - - /// Validates the key with the supported value and filter operator. - /// * `key`: Filter key. - /// * `value`: Filter metadata value. - /// * `operator`: Filter operator. - fn validate_filter(key: &str, value: &Metadata, operator: &FilterOperator) { - // Check if the key is valid. - if key.is_empty() { - panic!("Filter key cannot be empty."); - } - - let key_parts: Vec<&str> = key.split('.').collect(); - let key_type = key_parts[0]; - - // Check if the key is valid. - let valid_types = [TEXT, INTEGER, FLOAT, BOOLEAN, ARRAY, OBJECT]; - if !valid_types.contains(&key_type) { - panic!("Invalid filter key type: {key_type}"); - } - - // Check if the key has a sub-key for object type. - if key_type == OBJECT { - if key_parts.len() != 2 { - panic!("Object key must have exactly one sub-key."); - } - - if key_parts[1].is_empty() { - panic!("Object sub-key must be a non-empty string."); - } - } - - // Validate key for array type. - if key_type == ARRAY { - if operator != &FilterOperator::Contains && key_parts.len() != 2 { - panic!("Array filter must provide an index."); - } - - if key_parts.len() == 2 && key_parts[1].parse::().is_err() { - panic!("Array filter index must be a valid integer."); - } - } - - Self::validate_value(key_type, value); - Self::validate_operator(key_type, operator); - } - - // Validates the filter value based on the key type. - fn validate_value(key_type: &str, value: &Metadata) { - // Prevent array and object types for value. - // Because, we should handle it like this: object.key = value - match value { - Metadata::Array(_) | Metadata::Object(_) => { - panic!("Unsupported array or object type as value.") - } - // We handle the primitive types validation below. - _ => {} - } - - // Array and object keys are always valid because we will validate - // the value type when performing the filter. - let always_valid_key_types = [ARRAY, OBJECT]; - if always_valid_key_types.contains(&key_type) { - return; - } - - // Error message for invalid filter value type. - let panic = - || panic!("Invalid filter value of {value:?} for key: {key_type}"); - - // For key types other than array and object, - // we need to validate the value type. 
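-        // Each primitive value variant must line up with its key type.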
- match value { - Metadata::Text(_) => { - if key_type != TEXT { - panic(); - } - } - Metadata::Integer(_) => { - if key_type != INTEGER { - panic(); - } - } - Metadata::Float(_) => { - if key_type != FLOAT { - panic(); - } - } - Metadata::Boolean(_) => { - if key_type != BOOLEAN { - panic(); - } - } - // Array and object values has been handled above. - _ => {} - } - } - - /// Validates the filter operator based on the key type. - fn validate_operator(key_type: &str, operator: &FilterOperator) { - match operator { - // Contains operator is only valid for text, array, and object types. - FilterOperator::Contains => { - let valid_types = [TEXT, ARRAY, OBJECT]; - if !valid_types.contains(&key_type) { - panic!("Invalid CONTAINS operator for key: {key_type}"); - } - } - // Numeric operators are not valid for text and boolean types. - FilterOperator::GreaterThan - | FilterOperator::GreaterThanOrEqual - | FilterOperator::LessThan - | FilterOperator::LessThanOrEqual => { - let invalid_types = [TEXT, BOOLEAN]; - if invalid_types.contains(&key_type) { - panic!("Invalid numeric operator for key type: {key_type}"); - } - } - // Equal and not equal are valid for all types. - _ => {} - } - } -} - -impl From<&str> for Filter { - fn from(filter: &str) -> Self { - if filter.is_empty() { - panic!("Filter string cannot be empty."); - } - - // Split the filter string into EXACTLY 3 parts. - let parts: Vec<&str> = filter.splitn(3, ' ').collect(); - let parts: Vec<&str> = parts.into_iter().map(|p| p.trim()).collect(); - - // Get and validate the filter operator. - let operator = match parts[1] { - "=" => FilterOperator::Equal, - "!=" => FilterOperator::NotEqual, - ">" => FilterOperator::GreaterThan, - ">=" => FilterOperator::GreaterThanOrEqual, - "<" => FilterOperator::LessThan, - "<=" => FilterOperator::LessThanOrEqual, - "CONTAINS" => FilterOperator::Contains, - _ => panic!("Invalid filter operator: {}", parts[1]), - }; - - let key = parts[0].to_string(); - let value = Metadata::from(parts[2]); - Self::new(&key, &value, &operator) - } -} - -impl From for Filter { - fn from(filter: String) -> Self { - Filter::from(filter.as_str()) - } -} diff --git a/src/func/metadata.rs b/src/func/metadata.rs deleted file mode 100644 index 8e62115d..00000000 --- a/src/func/metadata.rs +++ /dev/null @@ -1,259 +0,0 @@ -use super::*; - -#[cfg(feature = "json")] -use serde_json::{Map, Number, Value}; - -/// The metadata associated with a vector record. -#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)] -pub enum Metadata { - /// A piece of text like article title or description. - Text(String), - /// An integer number such as external IDs. - Integer(usize), - /// A float number to represent something like a score. - Float(f32), - /// A boolean value often used for states like active or viewed. - Boolean(bool), - /// An array containing any type of metadata. - Array(Vec), - /// A map of string and metadata pairs. The most common type. - Object(HashMap), -} - -// Implement the conversion from the Rust primitive types to Metadata. 
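-// For example, both `Metadata::from("some text")` and `42_usize.into()`
-// produce valid metadata values.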
- -impl From for Metadata { - fn from(value: usize) -> Self { - Metadata::Integer(value) - } -} - -impl From for Metadata { - fn from(value: f32) -> Self { - Metadata::Float(value) - } -} - -impl From for Metadata { - fn from(value: String) -> Self { - Metadata::from(value.as_str()) - } -} - -impl From<&str> for Metadata { - fn from(value: &str) -> Self { - if let Ok(int) = value.parse::() { - return Metadata::Integer(int); - } - - if let Ok(float) = value.parse::() { - return Metadata::Float(float); - } - - if let Ok(bool) = value.parse::() { - return Metadata::Boolean(bool); - } - - Metadata::Text(value.to_string()) - } -} - -impl From for Metadata { - fn from(value: bool) -> Self { - Metadata::Boolean(value) - } -} - -impl From> for Metadata -where - Metadata: From, -{ - fn from(value: Vec) -> Self { - let arr = value.into_iter().map(|v| v.into()).collect(); - Metadata::Array(arr) - } -} - -impl From> for Metadata -where - Metadata: From, -{ - fn from(value: HashMap) -> Self { - let iter = value.into_iter(); - let obj = iter.map(|(k, v)| (k, v.into())).collect(); - Metadata::Object(obj) - } -} - -impl From> for Metadata -where - Metadata: From, -{ - fn from(value: HashMap<&str, T>) -> Self { - let iter = value.into_iter(); - let obj = iter.map(|(k, v)| (k.into(), v.into())).collect(); - Metadata::Object(obj) - } -} - -// This implementation allows conversion from -// JSON Value type to the Metadata enum. -#[cfg(feature = "json")] -impl From for Metadata { - fn from(value: Value) -> Self { - // Cast JSON number to Metadata float or integer. - let convert_number = |number: Number| { - // Check if the number is float. If not, it's an integer. - if number.is_f64() { - let float = number.as_f64().unwrap(); - Metadata::Float(float as f32) - } else { - let int = number.as_i64().unwrap(); - Metadata::Integer(int as usize) - } - }; - - // Cast JSON array to Metadata array. - let convert_array = |array: Vec| { - let vec = array.into_iter().map(|v| v.into()).collect(); - Metadata::Array(vec) - }; - - // Cast JSON object to Metadata object. - let convert_object = |object: Map| { - let map = object - .into_iter() - .map(|(k, v)| (k, v.into())) - .collect::>(); - Metadata::Object(map) - }; - - match value { - Value::String(text) => Metadata::Text(text), - Value::Number(number) => convert_number(number), - Value::Array(array) => convert_array(array), - Value::Object(object) => convert_object(object), - Value::Bool(bool) => Metadata::Boolean(bool), - _ => panic!("Unsupported JSON type for the metadata."), - } - } -} - -// This implementation allows conversion from -// the native Metadata enum to JSON Value. -#[cfg(feature = "json")] -impl From for Value { - fn from(metadata: Metadata) -> Self { - // Convert Metadata integer to JSON number. - let convert_integer = |int: usize| { - let number = Number::from(int as i64); - Value::Number(number) - }; - - // Convert Metadata float to JSON number. - let convert_float = |float: f32| { - let number = Number::from_f64(float as f64).unwrap(); - Value::Number(number) - }; - - // Convert Metadata array to JSON array. - let convert_array = |arr: Vec| { - let vec = arr.into_iter().map(|v| v.into()).collect(); - Value::Array(vec) - }; - - // Convert Metadata object to JSON object. 
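-        // Nested values are converted recursively through the same impl.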
-        let convert_object = |obj: HashMap<String, Metadata>| {
-            let map = obj
-                .into_iter()
-                .map(|(k, v)| (k, v.into()))
-                .collect::<Map<String, Value>>();
-            Value::Object(map)
-        };
-
-        match metadata {
-            Metadata::Text(text) => Value::String(text),
-            Metadata::Integer(int) => convert_integer(int),
-            Metadata::Float(float) => convert_float(float),
-            Metadata::Boolean(bool) => Value::Bool(bool),
-            Metadata::Array(array) => convert_array(array),
-            Metadata::Object(object) => convert_object(object),
-        }
-    }
-}
-
-// This implementation attempts to convert the
-// Python object into the Metadata enum.
-#[cfg(feature = "py")]
-impl From<&PyAny> for Metadata {
-    fn from(value: &PyAny) -> Self {
-        // Extract string.
-        if let Ok(text) = value.extract::<String>() {
-            return Metadata::Text(text);
-        }
-
-        // Extract integer.
-        if let Ok(int) = value.extract::<usize>() {
-            return Metadata::Integer(int);
-        }
-
-        // Extract float.
-        if let Ok(float) = value.extract::<f32>() {
-            return Metadata::Float(float);
-        }
-
-        // Extract boolean.
-        if let Ok(bool) = value.extract::<bool>() {
-            return Metadata::Boolean(bool);
-        }
-
-        // Extract list.
-        if let Ok(list) = value.extract::<Vec<&PyAny>>() {
-            let arr = list.into_iter().map(|v| v.into()).collect();
-            return Metadata::Array(arr);
-        }
-
-        // Extract dictionary.
-        if let Ok(dict) = value.extract::<HashMap<String, &PyAny>>() {
-            let obj = dict.into_iter().map(|(k, v)| (k, v.into())).collect();
-            return Metadata::Object(obj);
-        }
-
-        // Throw an error if the type is not supported.
-        panic!("Unsupported type for the metadata.");
-    }
-}
-
-// This implementation converts the Metadata
-// enum back to the Python object.
-#[cfg(feature = "py")]
-impl IntoPy<Py<PyAny>> for Metadata {
-    fn into_py(self, py: Python) -> Py<PyAny> {
-        // Convert array of Metadata to Python list.
-        let list_converter = |vec: Vec<Metadata>| {
-            let list = vec
-                .into_iter()
-                .map(|metadata: Metadata| metadata.into_py(py))
-                .collect::<Vec<Py<PyAny>>>();
-            list.into_py(py)
-        };
-
-        // Convert HashMap of Metadata to Python dictionary.
-        let dict_converter = |map: HashMap<String, Metadata>| {
-            let dict = map
-                .into_iter()
-                .map(|(key, value)| (key, value.into_py(py)))
-                .collect::<HashMap<String, Py<PyAny>>>();
-            dict.into_py(py)
-        };
-
-        match self {
-            Metadata::Text(text) => text.into_py(py),
-            Metadata::Integer(int) => int.into_py(py),
-            Metadata::Float(float) => float.into_py(py),
-            Metadata::Boolean(bool) => bool.into_py(py),
-            Metadata::Array(arr) => list_converter(arr),
-            Metadata::Object(obj) => dict_converter(obj),
-        }
-    }
-}
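// Illustrative sketch of the JSON bridge deleted above: with the `json`
// feature enabled, `serde_json::Value` converts into `Metadata` and back,
// losslessly for strings, booleans, arrays, and objects, while numbers are
// narrowed to `usize` or `f32`. A sketch, not part of the deleted file.
#[cfg(all(test, feature = "json"))]
mod metadata_json_sketch {
    use crate::prelude::*;
    use serde_json::{json, Value};

    #[test]
    fn json_round_trip() {
        let value = json!({ "title": "OasysDB", "stars": 10 });
        let metadata = Metadata::from(value.clone());

        // Converting back yields the original JSON object.
        assert_eq!(Value::from(metadata), value);
    }
}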
diff --git a/src/func/mod.rs b/src/func/mod.rs
deleted file mode 100644
index 89f2fed1..00000000
--- a/src/func/mod.rs
+++ /dev/null
@@ -1,43 +0,0 @@
-/// The collection of vectors and their data.
-pub mod collection;
-/// Enum for the collection distance functions.
-pub mod distance;
-/// Error types for the database.
-pub mod err;
-/// Filter operations for the collection metadata.
-pub mod filter;
-/// Types for the metadata.
-pub mod metadata;
-/// Types for the vectors.
-pub mod vector;
-
-// Internal modules.
-mod utils;
-
-use collection::*;
-use distance::*;
-use err::*;
-use filter::*;
-use metadata::*;
-use utils::*;
-use vector::*;
-
-// External dependencies.
-use ordered_float::OrderedFloat;
-use parking_lot::*;
-use rand::random;
-use rayon::prelude::*;
-use serde::{Deserialize, Serialize};
-use serde_big_array::BigArray;
-use simsimd::SpatialSimilarity;
-use std::cmp::*;
-use std::collections::{BinaryHeap, HashMap};
-use std::ops::{Deref, Index};
-
-#[cfg(feature = "py")]
-use pyo3::prelude::*;
-
-// This code is inspired by the HNSW implementation in the
-// Instant Distance library and modified to fit the needs
-// of this project.
-// https://github.com/instant-labs/instant-distance
diff --git a/src/func/utils.rs b/src/func/utils.rs
deleted file mode 100644
index 0407161d..00000000
--- a/src/func/utils.rs
+++ /dev/null
@@ -1,457 +0,0 @@
-use super::*;
-
-pub const INVALID: VectorID = VectorID(u32::MAX);
-
-/// The M value for the HNSW algorithm.
-pub const M: usize = 32;
-
-pub trait Layer {
-    type Slice: Deref<Target = [VectorID]>;
-    fn nearest_iter(&self, vector_id: &VectorID) -> NearestIter<Self::Slice>;
-}
-
-pub struct NearestIter<T> {
-    node: T,
-    current: usize,
-}
-
-impl<T: Deref<Target = [VectorID]>> NearestIter<T> {
-    pub fn new(node: T) -> Self {
-        Self { node, current: 0 }
-    }
-}
-
-impl<T: Deref<Target = [VectorID]>> Iterator for NearestIter<T> {
-    type Item = VectorID;
-    fn next(&mut self) -> Option<Self::Item> {
-        if self.current >= self.node.len() {
-            return None;
-        }
-
-        let item = self.node[self.current];
-        if !item.is_valid() {
-            self.current = self.node.len();
-            return None;
-        }
-
-        self.current += 1;
-        Some(item)
-    }
-}
-
-struct DescendingLayerIter {
-    next: Option<usize>,
-}
-
-impl Iterator for DescendingLayerIter {
-    type Item = LayerID;
-    fn next(&mut self) -> Option<Self::Item> {
-        let current_next = self.next?;
-
-        let next = if current_next == 0 {
-            self.next = None;
-            0
-        } else {
-            self.next = Some(current_next - 1);
-            current_next
-        };
-
-        Some(LayerID(next))
-    }
-}
-
-#[derive(Clone, Copy, Debug)]
-#[derive(Eq, Hash, Ord, PartialEq, PartialOrd)]
-pub struct LayerID(pub usize);
-
-impl LayerID {
-    pub fn descend(&self) -> impl Iterator<Item = LayerID> {
-        DescendingLayerIter { next: Some(self.0) }
-    }
-
-    pub fn is_zero(&self) -> bool {
-        self.0 == 0
-    }
-}
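// Illustrative sketch: `descend` walks from a layer down to layer zero,
// which is how a search starts at the sparsest HNSW layer and moves toward
// the dense base layer. A sketch, not part of the deleted file.
#[cfg(test)]
mod layer_id_sketch {
    use super::*;

    #[test]
    fn descend_reaches_layer_zero() {
        let layers: Vec<LayerID> = LayerID(2).descend().collect();
        assert_eq!(layers, vec![LayerID(2), LayerID(1), LayerID(0)]);
        assert!(layers.last().unwrap().is_zero());
    }
}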
-#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
-pub struct BaseNode(#[serde(with = "BigArray")] pub [VectorID; M * 2]);
-
-impl Default for BaseNode {
-    fn default() -> Self {
-        Self([INVALID; M * 2])
-    }
-}
-
-impl BaseNode {
-    pub fn allocate(&mut self, mut iter: impl Iterator<Item = VectorID>) {
-        for slot in self.0.iter_mut() {
-            if let Some(vector_id) = iter.next() {
-                *slot = vector_id;
-            } else if *slot != INVALID {
-                *slot = INVALID;
-            } else {
-                break;
-            }
-        }
-    }
-
-    /// Inserts a vector ID into the base node at the index.
-    pub fn insert(&mut self, index: usize, vector_id: &VectorID) {
-        // Make sure the index is within the limit.
-        if index >= self.0.len() {
-            return;
-        }
-
-        // Shift the vector IDs to accommodate the new one at the index.
-        if self.0[index].is_valid() {
-            let end = M * 2 - 1;
-            self.0.copy_within(index..end, index + 1);
-        }
-
-        self.set(index, vector_id)
-    }
-
-    /// Sets the vector ID at the index.
-    pub fn set(&mut self, index: usize, vector_id: &VectorID) {
-        self.0[index] = *vector_id;
-    }
-}
-
-impl Index<&VectorID> for [RwLock<BaseNode>] {
-    type Output = RwLock<BaseNode>;
-    fn index(&self, index: &VectorID) -> &Self::Output {
-        &self[index.0 as usize]
-    }
-}
-
-impl Deref for BaseNode {
-    type Target = [VectorID];
-    fn deref(&self) -> &Self::Target {
-        &self.0
-    }
-}
-
-impl<'a> Layer for &'a [BaseNode] {
-    type Slice = &'a [VectorID];
-    fn nearest_iter(&self, vector_id: &VectorID) -> NearestIter<Self::Slice> {
-        NearestIter::new(&self[vector_id.0 as usize])
-    }
-}
-
-impl<'a> Layer for &'a [RwLock<BaseNode>] {
-    type Slice = MappedRwLockReadGuard<'a, [VectorID]>;
-    fn nearest_iter(&self, vector_id: &VectorID) -> NearestIter<Self::Slice> {
-        NearestIter::new(RwLockReadGuard::map(
-            self[vector_id.0 as usize].read(),
-            Deref::deref,
-        ))
-    }
-}
-
-#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
-pub struct UpperNode(#[serde(with = "BigArray")] pub [VectorID; M]);
-
-impl UpperNode {
-    pub fn from_zero(node: &BaseNode) -> Self {
-        let mut nearest = [INVALID; M];
-        nearest.copy_from_slice(&node.0[..M]);
-        Self(nearest)
-    }
-
-    pub fn set(&mut self, index: usize, vector_id: &VectorID) {
-        self.0[index] = *vector_id;
-    }
-}
-
-impl<'a> Layer for &'a [UpperNode] {
-    type Slice = &'a [VectorID];
-    fn nearest_iter(&self, vector_id: &VectorID) -> NearestIter<Self::Slice> {
-        NearestIter::new(&self[vector_id.0 as usize].0)
-    }
-}
-
-#[derive(Clone)]
-pub struct Visited {
-    store: Vec<u8>,
-    generation: u8,
-}
-
-impl Visited {
-    /// Creates a new visited object with the capacity.
-    pub fn with_capacity(capacity: usize) -> Self {
-        Self { store: vec![0; capacity], generation: 1 }
-    }
-
-    pub fn resize_capacity(&mut self, capacity: usize) {
-        if self.store.len() != capacity {
-            self.store.resize(capacity, self.generation - 1);
-        }
-    }
-
-    /// Inserts a vector ID into the visited object.
-    pub fn insert(&mut self, vector_id: &VectorID) -> bool {
-        let slot = match self.store.get_mut(vector_id.0 as usize) {
-            Some(slot) => slot,
-            None => return false,
-        };
-
-        if *slot != self.generation {
-            *slot = self.generation;
-            return true;
-        }
-
-        false
-    }
-
-    /// Inserts multiple vector IDs into the visited object.
-    pub fn extend(&mut self, iter: impl Iterator<Item = VectorID>) {
-        for vector_id in iter {
-            self.insert(&vector_id);
-        }
-    }
-
-    pub fn clear(&mut self) {
-        if self.generation < 249 {
-            self.generation += 1;
-            return;
-        }
-
-        // Capture the length before clearing so the store keeps
-        // its size after the generation counter wraps around.
-        let len = self.store.len();
-        self.store.clear();
-        self.store.resize(len, 0);
-        self.generation = 1;
-    }
-}
-
-/// Candidate for the nearest neighbors.
-#[derive(Clone, Copy, Debug)]
-#[derive(Eq, Ord, PartialEq, PartialOrd)]
-pub struct Candidate {
-    pub distance: OrderedFloat<f32>,
-    pub vector_id: VectorID,
-}
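// Illustrative sketch: `Visited` marks vectors with a generation counter
// instead of rewriting a bitmap, so `clear` is usually a single increment
// and the store is only rewritten once every 249 generations. A sketch,
// not part of the deleted file.
#[cfg(test)]
mod visited_sketch {
    use super::*;

    #[test]
    fn generation_reset() {
        let mut visited = Visited::with_capacity(8);
        let id = VectorID(3);

        // The first insert marks the slot; the second one is a no-op.
        assert!(visited.insert(&id));
        assert!(!visited.insert(&id));

        // Bumping the generation makes every previous mark stale.
        visited.clear();
        assert!(visited.insert(&id));
    }
}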
-#[derive(Clone)]
-pub struct Search {
-    pub ef: usize,
-    pub visited: Visited,
-    candidates: BinaryHeap<Reverse<Candidate>>,
-    nearest: Vec<Candidate>, // Ordered by ascending distance.
-    distance: Distance,
-}
-
-impl Search {
-    pub fn new(capacity: usize, distance: Distance) -> Self {
-        let visited = Visited::with_capacity(capacity);
-        Self { visited, distance, ..Default::default() }
-    }
-
-    /// Searches the nearest neighbors in the graph layer.
-    pub fn search<L: Layer>(
-        &mut self,
-        layer: L,
-        vector: &Vector,
-        vectors: &HashMap<VectorID, Vector>,
-        links: usize,
-    ) {
-        while let Some(Reverse(candidate)) = self.candidates.pop() {
-            if let Some(last) = self.nearest.last() {
-                if candidate.distance > last.distance {
-                    break;
-                }
-            }
-
-            // Get the related vector IDs of the current candidate
-            // and consider them as candidates too.
-            let layer_iter = layer.nearest_iter(&candidate.vector_id);
-            for vector_id in layer_iter.take(links) {
-                if vectors.contains_key(&vector_id) {
-                    self.push(&vector_id, vector, vectors);
-                }
-            }
-
-            self.nearest.truncate(self.ef);
-        }
-    }
-
-    /// Creates and pushes a candidate to the nearest field
-    /// and candidates binary heap fields.
-    pub fn push(
-        &mut self,
-        vector_id: &VectorID,
-        vector: &Vector,
-        vectors: &HashMap<VectorID, Vector>,
-    ) {
-        if !vectors.contains_key(vector_id) || !self.visited.insert(vector_id) {
-            return;
-        }
-
-        // Create a new candidate.
-        let other = &vectors[vector_id];
-        let distance = self.distance.calculate(vector, other);
-        let distance = OrderedFloat(distance);
-        let new = Candidate { distance, vector_id: *vector_id };
-
-        // Make sure the index to insert to is within the EF scope.
-        let index = match self.nearest.binary_search(&new) {
-            Err(index) if index < self.ef => index,
-            Err(_) => return,
-            Ok(_) => unreachable!(),
-        };
-
-        self.nearest.insert(index, new);
-        self.candidates.push(Reverse(new));
-    }
-
-    /// Lowers the search to the next lower layer.
-    pub fn cull(&mut self) {
-        self.candidates.clear();
-        self.visited.clear();
-
-        for &candidate in self.nearest.iter() {
-            self.candidates.push(Reverse(candidate));
-        }
-
-        let candidates = self.nearest.iter().map(|c| c.vector_id);
-        self.visited.extend(candidates);
-    }
-
-    /// Resets the search object data.
-    pub fn reset(&mut self) {
-        self.visited.clear();
-        self.candidates.clear();
-        self.nearest.clear();
-    }
-
-    /// Selects the nearest neighbors.
-    pub fn select_simple(&mut self) -> &[Candidate] {
-        &self.nearest
-    }
-
-    pub fn iter(&self) -> impl ExactSizeIterator<Item = Candidate> + '_ {
-        self.nearest.iter().copied()
-    }
-}
-
-impl Default for Search {
-    fn default() -> Self {
-        Self {
-            visited: Visited::with_capacity(0),
-            candidates: BinaryHeap::new(),
-            nearest: Vec::new(),
-            ef: 64,
-            distance: Distance::Euclidean,
-        }
-    }
-}
-
-pub struct SearchPool {
-    pool: Mutex<Vec<(Search, Search)>>,
-    distance: Distance,
-    len: usize,
-}
-
-impl SearchPool {
-    pub fn new(len: usize, distance: Distance) -> Self {
-        let pool = Mutex::new(Vec::new());
-        Self { pool, len, distance }
-    }
-
-    /// Returns the last searches from the pool.
-    pub fn pop(&self) -> (Search, Search) {
-        let search = Search::new(self.len, self.distance);
-        match self.pool.lock().pop() {
-            Some(result) => result,
-            None => (search.clone(), search),
-        }
-    }
-
-    /// Pushes the searches to the pool.
-    pub fn push(&self, item: &(Search, Search)) {
-        self.pool.lock().push(item.clone());
-    }
-}
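// Illustrative sketch: the pool recycles `Search` scratch space across
// threads during index construction, avoiding a fresh allocation for every
// inserted vector. A sketch, not part of the deleted file.
#[cfg(test)]
mod search_pool_sketch {
    use super::*;

    #[test]
    fn pop_then_recycle() {
        let pool = SearchPool::new(64, Distance::Euclidean);

        // An empty pool hands out freshly constructed searches.
        let pair = pool.pop();

        // Returning the pair makes it available to the next caller.
        pool.push(&pair);
        let _recycled = pool.pop();
    }
}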
-pub struct IndexConstruction<'a> {
-    pub search_pool: SearchPool,
-    pub top_layer: LayerID,
-    pub base_layer: &'a [RwLock<BaseNode>],
-    pub vectors: &'a HashMap<VectorID, Vector>,
-    pub config: &'a Config,
-}
-
-impl<'a> IndexConstruction<'a> {
-    /// Inserts a vector ID into a layer.
-    /// * `vector_id`: Vector ID to insert.
-    /// * `layer`: Layer to insert into.
-    /// * `layers`: Upper layers.
-    pub fn insert(
-        &self,
-        vector_id: &VectorID,
-        layer: &LayerID,
-        layers: &[Vec<UpperNode>],
-    ) {
-        let vector = &self.vectors[vector_id];
-        let dist = self.config.distance;
-
-        let (mut search, mut insertion) = self.search_pool.pop();
-        insertion.ef = self.config.ef_construction;
-
-        // Find the first valid vector ID to push.
-        let validator = |i: u32| self.vectors.get(&i.into()).is_some();
-        let valid_id = (0..u32::MAX)
-            .into_par_iter()
-            .find_first(|i| validator(*i))
-            .unwrap_or(0);
-
-        search.reset();
-        search.push(&valid_id.into(), vector, self.vectors);
-
-        for current_layer in self.top_layer.descend() {
-            search.ef = self.config.ef_construction;
-
-            // Find the nearest neighbor candidates.
-            if current_layer > *layer {
-                let layer = layers[current_layer.0 - 1].as_slice();
-                search.search(layer, vector, self.vectors, M);
-                search.cull();
-            } else {
-                search.search(self.base_layer, vector, self.vectors, M * 2);
-                break;
-            }
-        }
-
-        // Select the nearest neighbors to the given vector.
-        let candidates = {
-            let candidates = search.select_simple();
-            &candidates[..Ord::min(M, candidates.len())]
-        };
-
-        for (i, candidate) in candidates.iter().enumerate() {
-            let candidate_id = candidate.vector_id;
-            let current = &self.vectors[&candidate_id];
-            let distance = candidate.distance;
-
-            // Function to sort the vectors by distance.
-            let ordering = |id: &VectorID| {
-                if !id.is_valid() {
-                    return Ordering::Greater;
-                }
-
-                let other = &self.vectors[id];
-                distance.cmp(&dist.calculate(current, other).into())
-            };
-
-            // Find the correct index to insert at to keep the order.
-            let index = self.base_layer[&candidate_id]
-                .read()
-                .binary_search_by(ordering)
-                .unwrap_or_else(|error| error);
-
-            self.base_layer[&candidate_id].write().insert(index, vector_id);
-            self.base_layer[vector_id].write().set(i, vector_id);
-        }
-
-        self.search_pool.push(&(search, insertion));
-    }
-}
diff --git a/src/func/vector.rs b/src/func/vector.rs
deleted file mode 100644
index a48b02ec..00000000
--- a/src/func/vector.rs
+++ /dev/null
@@ -1,164 +0,0 @@
-use super::*;
-
-/// The ID of a vector record.
-#[cfg_attr(feature = "py", pyclass(module = "oasysdb.vector"))]
-#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
-#[derive(Eq, PartialEq, PartialOrd, Ord, Hash)]
-pub struct VectorID(pub u32);
-
-#[cfg(feature = "py")]
-#[pymethods]
-impl VectorID {
-    #[new]
-    fn py_new(id: u32) -> Self {
-        id.into()
-    }
-
-    fn __repr__(&self) -> String {
-        format!("{self:?}")
-    }
-
-    fn __str__(&self) -> String {
-        format!("{self:?}")
-    }
-
-    fn __eq__(&self, other: &Self) -> bool {
-        self.0 == other.0
-    }
-
-    fn __hash__(&self) -> usize {
-        self.0 as usize
-    }
-}
-
-#[cfg_attr(feature = "py", pymethods)]
-impl VectorID {
-    /// True if this vector ID is valid.
-    pub fn is_valid(&self) -> bool {
-        self.0 != u32::MAX
-    }
-
-    /// Returns the vector ID as u32 type.
-    pub fn to_u32(&self) -> u32 {
-        self.0
-    }
-
-    /// Returns the vector ID as usize type.
-    pub fn to_usize(&self) -> usize {
-        self.0 as usize
-    }
-}
-
-impl From<u32> for VectorID {
-    fn from(id: u32) -> Self {
-        VectorID(id)
-    }
-}
-
-impl From<usize> for VectorID {
-    fn from(id: usize) -> Self {
-        VectorID(id as u32)
-    }
-}
-
-impl From<VectorID> for u32 {
-    fn from(v: VectorID) -> Self {
-        v.0
-    }
-}
-
-impl From<VectorID> for usize {
-    fn from(v: VectorID) -> Self {
-        v.0 as usize
-    }
-}
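// Illustrative sketch: vector IDs are plain `u32` indexes, and the maximum
// value is reserved as the invalid sentinel that terminates neighbor lists.
// A sketch, not part of the deleted file.
#[cfg(test)]
mod vector_id_sketch {
    use super::*;

    #[test]
    fn conversions_and_validity() {
        let id = VectorID::from(7usize);
        assert_eq!(u32::from(id), 7);
        assert!(id.is_valid());

        // u32::MAX marks an empty slot in a node's neighbor list.
        assert!(!VectorID(u32::MAX).is_valid());
    }
}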
-/// The vector embedding of float numbers.
-#[cfg_attr(feature = "py", pyclass(module = "oasysdb.vector"))]
-#[derive(Serialize, Deserialize, Clone, Debug)]
-#[derive(PartialEq, PartialOrd)]
-pub struct Vector(pub Vec<f32>);
-
-// Methods available only to Python.
-#[cfg(feature = "py")]
-#[pymethods]
-impl Vector {
-    #[new]
-    fn py_new(vector: Vec<f32>) -> Self {
-        vector.into()
-    }
-
-    fn to_list(&self) -> Vec<f32> {
-        self.0.clone()
-    }
-
-    #[staticmethod]
-    #[pyo3(name = "random")]
-    fn py_random(dimension: usize) -> Self {
-        Vector::random(dimension)
-    }
-
-    fn __repr__(&self) -> String {
-        format!("{:?}", self)
-    }
-
-    fn __len__(&self) -> usize {
-        self.len()
-    }
-}
-
-// Methods available to both Python and Rust.
-// If this implementation is modified, make sure to modify:
-// - py/tests/test_vector.py
-// - py/oasysdb/vector.pyi
-#[cfg_attr(feature = "py", pymethods)]
-impl Vector {
-    /// Returns the dimension of the vector.
-    pub fn len(&self) -> usize {
-        self.0.len()
-    }
-
-    /// Returns true if the vector is empty.
-    pub fn is_empty(&self) -> bool {
-        self.0.is_empty()
-    }
-}
-
-impl Vector {
-    /// Generates a random vector for testing.
-    /// * `dimension`: Vector dimension.
-    pub fn random(dimension: usize) -> Self {
-        let mut vec = vec![0.0; dimension];
-
-        for float in vec.iter_mut() {
-            *float = random::<f32>();
-        }
-
-        vec.into()
-    }
-}
-
-impl Index<&VectorID> for [Vector] {
-    type Output = Vector;
-    fn index(&self, index: &VectorID) -> &Self::Output {
-        &self[index.0 as usize]
-    }
-}
-
-impl From<Vec<f32>> for Vector {
-    fn from(vec: Vec<f32>) -> Self {
-        Vector(vec)
-    }
-}
-
-impl From<&Vec<f32>> for Vector {
-    fn from(vec: &Vec<f32>) -> Self {
-        Vector(vec.clone())
-    }
-}
-
-impl From<Vector> for Vec<f32> {
-    fn from(vector: Vector) -> Self {
-        vector.0
-    }
-}
diff --git a/src/lib.rs b/src/lib.rs
deleted file mode 100644
index 693b6b54..00000000
--- a/src/lib.rs
+++ /dev/null
@@ -1,94 +0,0 @@
-#![warn(missing_docs)]
-#![warn(unused_qualifications)]
-#![doc = include_str!("../readme.md")]
-#![doc(html_favicon_url = "https://i.postimg.cc/W3T230zk/favicon.png")]
-#![doc(html_logo_url = "https://i.postimg.cc/Vv0HPVwB/logo.png")]
-
-#[cfg(test)]
-mod tests;
-
-mod db;
-mod func;
-
-/// Embedding models to generate vectors.
-#[cfg(feature = "gen")]
-pub mod vectorgen;
-
-/// Convenience re-exports for the public APIs.
-pub mod prelude; - -pub use db::database; -pub use func::collection; -pub use func::distance; -pub use func::err; -pub use func::filter; -pub use func::metadata; -pub use func::vector; - -#[cfg(feature = "py")] -use pyo3::prelude::*; - -#[cfg(feature = "py")] -type Module = fn(Python<'_>, &PyModule) -> PyResult<()>; - -#[cfg(feature = "py")] -#[pymodule] -fn oasysdb(py: Python, m: &PyModule) -> PyResult<()> { - let sys = py.import("sys")?; - let modules = sys.getattr("modules")?; - - let mods: Vec<(&str, Module)> = vec![ - ("collection", collection_modules), - ("vector", vector_modules), - ("database", database_modules), - ("prelude", prelude_modules), - ]; - - for (name, module) in mods { - let full_name = format!("oasysdb.{}", name); - let pymod = PyModule::new(py, &full_name)?; - module(py, pymod)?; - m.add(name, pymod)?; - modules.set_item(full_name, pymod)?; - } - - Ok(()) -} - -#[cfg(feature = "py")] -#[pymodule] -fn collection_modules(_py: Python, m: &PyModule) -> PyResult<()> { - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - Ok(()) -} - -#[cfg(feature = "py")] -#[pymodule] -fn vector_modules(_py: Python, m: &PyModule) -> PyResult<()> { - m.add_class::()?; - m.add_class::()?; - Ok(()) -} - -#[cfg(feature = "py")] -#[pymodule] -fn database_modules(_py: Python, m: &PyModule) -> PyResult<()> { - m.add_class::()?; - Ok(()) -} - -#[cfg(feature = "py")] -#[pymodule] -fn prelude_modules(_py: Python, m: &PyModule) -> PyResult<()> { - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - Ok(()) -} diff --git a/src/main.rs b/src/main.rs index f328e4d9..57906394 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1 +1,4 @@ +#![warn(missing_docs)] +#![warn(unused_qualifications)] + fn main() {} diff --git a/src/prelude/mod.rs b/src/prelude/mod.rs deleted file mode 100644 index f3611a13..00000000 --- a/src/prelude/mod.rs +++ /dev/null @@ -1,7 +0,0 @@ -pub use crate::database::*; -pub use crate::func::collection::*; -pub use crate::func::distance::*; -pub use crate::func::err::*; -pub use crate::func::filter::*; -pub use crate::func::metadata::*; -pub use crate::func::vector::*; diff --git a/src/tests/mod.rs b/src/tests/mod.rs deleted file mode 100644 index 3d7d68fd..00000000 --- a/src/tests/mod.rs +++ /dev/null @@ -1,46 +0,0 @@ -mod test_collection; -mod test_database; -mod test_distance; -mod test_metadata; - -// This test requires the JSON feature to be enabled to make our life -// easier allowing conversion from JSON Value type to the Metadata enum. -#[cfg(feature = "json")] -mod test_filter; - -#[cfg(feature = "gen")] -mod test_vectorgen; - -use crate::prelude::*; -use rayon::prelude::*; -use std::collections::HashMap; - -const DIMENSION: usize = 128; -const LEN: usize = 100; - -/// The test database initial collection name. -const NAME: &str = "vectors"; - -fn create_test_database() -> Database { - let mut db = Database::new("data/rs").unwrap(); - let collection = create_collection(); - db.save_collection(NAME, &collection).unwrap(); - db -} - -fn create_collection() -> Collection { - let all_records = Record::many_random(DIMENSION, LEN); - - // Split the records into two halves. - // The first half is used to build the collection. - // The second half is used to insert. 
- let mid = LEN / 2; - let first_half = &all_records[0..mid]; - let second_half = &all_records[mid..LEN]; - - let config = Config::default(); - let mut collection = Collection::build(&config, first_half).unwrap(); - - collection.insert_many(second_half).unwrap(); - collection -} diff --git a/src/tests/test_collection.rs b/src/tests/test_collection.rs deleted file mode 100644 index 9499098c..00000000 --- a/src/tests/test_collection.rs +++ /dev/null @@ -1,171 +0,0 @@ -use super::*; - -#[test] -fn new_with_distance() { - let mut config = Config::default(); - config.distance = Distance::Cosine; - let mut collection = Collection::new(&config); - collection.insert(&Record::random(DIMENSION)).unwrap(); -} - -#[test] -fn build_large() { - let len = 10000; - let records = Record::many_random(DIMENSION, len); - let config = Config::default(); - let collection = Collection::build(&config, &records).unwrap(); - assert_eq!(collection.len(), len); -} - -#[test] -fn insert() { - let mut collection = create_collection(); - - // Create a new record to insert. - let new_record = Record::random(DIMENSION); - collection.insert(&new_record).unwrap(); - - // Assert the new record is in the collection. - let id = VectorID::from(LEN); - assert_eq!(collection.len(), LEN + 1); - assert_eq!(collection.get(&id).unwrap().data, new_record.data); -} - -#[test] -fn insert_invalid_dimension() { - let mut collection = create_collection(); - - // Create a new record with an invalid dimension. - let new_record = Record::random(DIMENSION + 1); - - // Assert the new record is not inserted. - assert_eq!(collection.dimension(), DIMENSION); - assert_eq!(collection.insert(&new_record).is_err(), true); -} - -#[test] -fn insert_data_type_object() { - let mut collection = create_collection(); - - // Create a new record with a data of type HashMap. - let vector = Vector::random(DIMENSION); - let data = HashMap::from([("key", "value")]); - let new_record = Record::new(&vector, &data.clone().into()); - - collection.insert(&new_record).unwrap(); - - // Assert the new data is in the collection. - let id = VectorID::from(LEN); - assert_eq!(collection.len(), LEN + 1); - assert_eq!(collection.get(&id).unwrap().data, data.into()); -} - -#[test] -fn insert_many() { - let mut collection = create_collection(); - - // Create records to insert. - let new_records = Record::many_random(DIMENSION, LEN); - let ids = collection.insert_many(&new_records).unwrap(); - - // Assert the new records are in the collection. - assert_eq!(collection.len(), 2 * LEN); - assert_eq!(ids.len(), LEN); - assert_eq!(ids[0], VectorID(LEN as u32)); -} - -#[test] -fn delete() { - let mut collection = create_collection(); - - // Delete a record from the collection. - let id = VectorID(0); - collection.delete(&id).unwrap(); - assert_eq!(collection.len(), LEN - 1); -} - -#[test] -fn update() { - let mut collection = create_collection(); - - // New record to update. - let id = VectorID(5); - let record = Record::random(DIMENSION); - collection.update(&id, &record).unwrap(); - - assert_eq!(collection.len(), LEN); - assert_eq!(collection.get(&id).unwrap().data, record.data); -} - -#[test] -fn search() { - let len = 1000; - let config = Config::default(); - let records = Record::many_random(DIMENSION, len); - - // Build the collection with a minimum relevancy. - let mut collection = Collection::build(&config, &records).unwrap(); - collection.relevancy = 4.5; - - // Generate a random query vector. 
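// Note (illustrative, not part of the deleted test): the approximate
// `search` below is validated against the exhaustive `true_search`, and
// with `relevancy` set to 4.5 both result sets are expected to contain
// only neighbors within that maximum distance.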
- let query = Vector::random(DIMENSION); - - // Get the approximate and true nearest neighbors. - let result = collection.search(&query, 5).unwrap(); - let truth = collection.true_search(&query, 10).unwrap(); - - assert_eq!(result.len(), 5); - - // The search is not always exact, so we check if - // the distance is within the true distances. - let distances: Vec = truth.par_iter().map(|i| i.distance).collect(); - assert_eq!(distances.contains(&result[0].distance), true); - - // Search results should be within the relevancy. - let last_result = result.last().unwrap(); - let last_truth = truth.last().unwrap(); - assert!(last_result.distance <= collection.relevancy); - assert!(last_truth.distance <= collection.relevancy); -} - -#[test] -fn get() { - let records = Record::many_random(DIMENSION, LEN); - let config = Config::default(); - let collection = Collection::build(&config, &records).unwrap(); - - // Get a record from the collection. - let index: usize = 5; - let id = VectorID::from(index); - let record = collection.get(&id).unwrap(); - - assert_eq!(record.data, records[index].data); - assert_eq!(record.vector, records[index].vector); -} - -#[test] -fn list() { - let collection = create_collection(); - let list = collection.list().unwrap(); - assert_eq!(list.len(), LEN); - assert_eq!(list.len(), collection.len()); -} - -#[test] -fn config_with_distance() { - let ef = 10; - let ml = 1.0; - for dist in vec!["cosine", "euclidean"] { - Config::new(ef, ef, ml, dist).unwrap(); - } -} - -#[test] -#[should_panic(expected = "Distance function not supported.")] -fn config_with_distance_panic() { - let ef = 10; - let ml = 1.0; - for dist in vec!["l2", "test"] { - Config::new(ef, ef, ml, dist).unwrap(); - } -} diff --git a/src/tests/test_database.rs b/src/tests/test_database.rs deleted file mode 100644 index 614c4b49..00000000 --- a/src/tests/test_database.rs +++ /dev/null @@ -1,67 +0,0 @@ -use super::*; -use futures::executor; - -#[test] -fn new() { - let db = Database::new("data/rs").unwrap(); - assert_eq!(db.len(), 0); -} - -#[test] -fn get_collection() { - let db = create_test_database(); - let collection = db.get_collection(NAME).unwrap(); - assert_eq!(collection.len(), LEN); -} - -#[test] -fn save_collection_new() { - let mut db = Database::new("data/rs").unwrap(); - let len = db.len(); - - // Create a collection from scratch. - let config = Config::default(); - let mut collection = Collection::new(&config); - - // Insert a random record. - let record = Record::random(DIMENSION); - collection.insert(&record).unwrap(); - - db.save_collection("new", &collection).unwrap(); - assert_eq!(collection.len(), 1); - assert_eq!(db.len(), len + 1); -} - -#[test] -fn save_collection_update() { - let mut db = create_test_database(); - - // Update the collection. 
- let mut collection = db.get_collection(NAME).unwrap(); - collection.insert(&Record::random(DIMENSION)).unwrap(); - - db.save_collection(NAME, &collection).unwrap(); - assert_eq!(collection.len(), LEN + 1); - assert_eq!(db.len(), 1); -} - -#[test] -fn delete_collection() { - let mut db = create_test_database(); - db.delete_collection(NAME).unwrap(); - assert_eq!(db.len(), 0); -} - -#[test] -fn flush() { - let db = create_test_database(); - let bytes = db.flush().unwrap(); - assert!(bytes > 0); -} - -#[test] -fn async_flush() { - let db = create_test_database(); - let bytes = executor::block_on(db.async_flush()).unwrap(); - assert!(bytes > 0); -} diff --git a/src/tests/test_distance.rs b/src/tests/test_distance.rs deleted file mode 100644 index 398c77af..00000000 --- a/src/tests/test_distance.rs +++ /dev/null @@ -1,17 +0,0 @@ -use super::*; - -#[test] -fn distance_calculation() { - let a = Vector::from(vec![1.0, 3.0, 5.0]); - let b = Vector::from(vec![2.0, 4.0, 6.0]); - - let euclidean = Distance::Euclidean.calculate(&a, &b); - let cosine = Distance::Cosine.calculate(&a, &b); - - assert_eq!(euclidean, 1.7320508); - - // When utilizing SIMD, the cosine distance is approximated. - // So we just need to make sure the result is within a certain range. - let diff = cosine - 0.00614136; - assert!(diff < 0.01); -} diff --git a/src/tests/test_filter.rs b/src/tests/test_filter.rs deleted file mode 100644 index 92f80ce4..00000000 --- a/src/tests/test_filter.rs +++ /dev/null @@ -1,151 +0,0 @@ -use crate::prelude::*; -use serde_json::json; - -const DIMENSION: usize = 128; - -fn create_collection_multitype_metadata() -> Collection { - let config = Config::default(); - let mut collection = Collection::new(&config); - - let vectors = vec![Vector::random(DIMENSION); 10]; - - // Text metadata. - let data = "This is awesome!"; - let record = Record::new(&vectors[0], &data.into()); - collection.insert(&record).unwrap(); - - // Integer metadata. - let data = 10; - let record = Record::new(&vectors[2], &data.into()); - collection.insert(&record).unwrap(); - - // Float metadata. - let data = 20.0; - let record = Record::new(&vectors[3], &data.into()); - collection.insert(&record).unwrap(); - - // Boolean metadata. - let data = true; - let record = Record::new(&vectors[4], &data.into()); - collection.insert(&record).unwrap(); - - // Array metadata. - let data = vec![10, 20, 30]; - let record = Record::new(&vectors[5], &data.into()); - collection.insert(&record).unwrap(); - - // Object metadata. 
- let data = json!({ - "key": "value", - "number": 10, - }); - - let record = Record::new(&vectors[6], &data.into()); - collection.insert(&record).unwrap(); - - collection -} - -#[test] -#[should_panic] -fn text_gt_filter() { - let operator = FilterOperator::GreaterThan; - Filter::new("text", &json!("value").into(), &operator); -} - -#[test] -fn float_lt_filter() { - let operator = FilterOperator::LessThan; - let filter = Filter::new("float", &json!(10.5).into(), &operator); - let filter_from_str = Filter::from("float < 10.5"); - assert_eq!(filter, filter_from_str); -} - -#[test] -fn boolean_neq_filter() { - let operator = FilterOperator::NotEqual; - let filter = Filter::new("boolean", &json!(true).into(), &operator); - let filter_from_str = Filter::from("boolean != true"); - assert_eq!(filter, filter_from_str); -} - -#[test] -fn object_gteq_filter() { - let operator = FilterOperator::GreaterThanOrEqual; - let filter = Filter::new("object.id", &json!(10).into(), &operator); - let filter_from_str = Filter::from("object.id >= 10"); - assert_eq!(filter, filter_from_str); -} - -#[test] -#[should_panic] -fn object_as_value_filter() { - let operator = FilterOperator::GreaterThan; - let value = json!({ "key": "value" }).into(); - Filter::new("object", &value, &operator); -} - -#[test] -fn and_filters() { - let filters = Filters::AND(vec![ - Filter::new("text", &json!("value").into(), &FilterOperator::Equal), - Filter::new("integer", &json!(10).into(), &FilterOperator::GreaterThan), - ]); - - let filters_from_str = Filters::from("text = value AND integer > 10"); - assert_eq!(filters, filters_from_str); -} - -#[test] -fn collection_text_integer_or_filters() { - let collection = create_collection_multitype_metadata(); - let filters = Filters::from("text CONTAINS awesome OR integer > 5"); - let result = collection.filter(&filters).unwrap(); - assert_eq!(result.len(), 2); -} - -#[test] -fn collection_array_filter() { - let collection = create_collection_multitype_metadata(); - let filters = Filters::from("array CONTAINS 20"); - let result = collection.filter(&filters).unwrap(); - assert_eq!(result.len(), 1); - - let filters = Filters::from("array.0 >= 10"); - let result = collection.filter(&filters).unwrap(); - assert_eq!(result.len(), 1); -} - -#[test] -fn collection_object_filter() { - let collection = create_collection_multitype_metadata(); - let filters = Filters::from("object.key CONTAINS val OR object.number > 5"); - let result = collection.filter(&filters).unwrap(); - assert_eq!(result.len(), 1); - - // This should return an empty result since there is no way to store both - // array and object at the same level at the same time. - let filters = Filters::from("object.number = 10 AND array.0 <= 10"); - let result = collection.filter(&filters).unwrap(); - assert_eq!(result.len(), 0); -} - -#[test] -fn collection_object_search_with_filters() { - let collection = create_collection_multitype_metadata(); - - // Search the collection with filters. - let filters = Filters::from("object.number < 25"); - let vector = Vector::random(DIMENSION); - let result = collection.search_with_filters(&vector, 1, &filters).unwrap(); - - // This must match the data we created in - // create_collection_multitype_metadata function above. 
- let expected_data = json!({ - "key": "value", - "number": 10, - }); - - assert_eq!(result.len(), 1); - assert_eq!(result[0].data, expected_data.into()); -} diff --git a/src/tests/test_metadata.rs b/src/tests/test_metadata.rs deleted file mode 100644 index 0bd09455..00000000 --- a/src/tests/test_metadata.rs +++ /dev/null @@ -1,49 +0,0 @@ -#[allow(unused_imports)] -use super::*; - -#[cfg(feature = "json")] -use serde_json::{json, Value}; - -#[cfg(feature = "json")] -#[test] -fn json_value_to_metadata() { - let map = HashMap::from([("key", "value")]); - let value = json!(map); - - let metadata = Metadata::from(map); - let metadata_from_value = Metadata::from(value); - - assert_eq!(metadata, metadata_from_value); -} - -#[cfg(feature = "json")] -#[test] -fn metadata_to_json_value() { - let map = HashMap::from([("key", "value")]); - let value = json!(map); - - let metadata = Metadata::from(map); - let value_from_metadata = Value::from(metadata); - - assert_eq!(value, value_from_metadata); -} - -#[cfg(feature = "json")] -#[test] -fn insert_data_type_json() { - let mut collection = create_collection(); - - let data = json!({ - "number": 1, - "boolean": true, - "string": "text", - }); - - // Create a new record with JSON data. - let vector = Vector::random(DIMENSION); - let new_record = Record::new(&vector, &data.clone().into()); - let id = collection.insert(&new_record).unwrap(); - - let metadata = Metadata::from(data); - assert_eq!(collection.get(&id).unwrap().data, metadata); -} diff --git a/src/tests/test_vectorgen.rs b/src/tests/test_vectorgen.rs deleted file mode 100644 index 6de45374..00000000 --- a/src/tests/test_vectorgen.rs +++ /dev/null @@ -1,30 +0,0 @@ -use crate::vectorgen::*; -use dotenv::dotenv; -use std::env; - -/// Setup the test environment. -fn setup_environment() { - dotenv().ok(); -} - -/// Get the environment variable by the key. -fn getenv(key: &str) -> String { - let message = format!("Environment variable not found: {key}"); - env::var(key).expect(&message) -} - -fn model_openai() -> OpenAI { - let api_key = getenv("OPENAI_API_KEY"); - let model = "text-embedding-3-small"; - OpenAI::new(&api_key, model) -} - -#[test] -fn openai_create_vector() { - setup_environment(); - let model = model_openai(); - - let content = "OasysDB is awesome!"; - let vector = model.create_vector(content).unwrap(); - assert_eq!(vector.len(), 1536); -} diff --git a/src/vectorgen/mod.rs b/src/vectorgen/mod.rs deleted file mode 100644 index ff246942..00000000 --- a/src/vectorgen/mod.rs +++ /dev/null @@ -1,29 +0,0 @@ -use crate::prelude::*; -use reqwest::blocking::Client; -use serde_json::{json, Value}; - -mod openai; - -// Re-export the model implementations below. -pub use openai::OpenAI; - -/// Trait for embedding models to easily generate vectors. -pub trait EmbeddingModel { - /// Returns the model ID: `provider-name/model-name` - /// - `provider-name`: Model provider like openai, google, etc. - /// - `model-name`: Model name like gpt-3, bert, etc. - fn id(&self) -> &str; - - /// Creates a vector embedding from the given content. - /// - `content`: Text or content URL to generate the vector. - fn create_vector(&self, content: &str) -> Result; - - /// Creates a vector record from content and data. - /// - `content`: Text or content URL to generate the vector. - /// - `data`: Metadata to associate with the vector. 
- fn create_record( - &self, - content: &str, - data: &Metadata, - ) -> Result; -} diff --git a/src/vectorgen/openai.rs b/src/vectorgen/openai.rs deleted file mode 100644 index c182ab22..00000000 --- a/src/vectorgen/openai.rs +++ /dev/null @@ -1,91 +0,0 @@ -use super::*; - -/// Embedding models provided by OpenAI. -pub struct OpenAI { - /// OpenAI API key. - pub api_key: String, - /// Embedding model name. - pub model: String, - endpoint: String, -} - -impl EmbeddingModel for OpenAI { - fn id(&self) -> &str { - let id = format!("openai/{}", self.model); - Box::leak(id.into_boxed_str()) - } - - fn create_vector(&self, content: &str) -> Result { - self.create_vector(content) - } - - fn create_record( - &self, - content: &str, - data: &Metadata, - ) -> Result { - let vector = self.create_vector(content)?; - let record = Record::new(&vector, data); - Ok(record) - } -} - -impl OpenAI { - /// Creates a new OpenAI embedding model instance. - pub fn new(api_key: &str, model: &str) -> Self { - let valid_models = [ - "text-embedding-3-large", - "text-embedding-3-small", - "text-embedding-ada-002", - ]; - - // Validate the model input. - if !valid_models.contains(&model) { - panic!("Unsupported embedding model: {model}"); - } - - let endpoint = "https://api.openai.com/v1/embeddings"; - - Self { - api_key: api_key.to_string(), - model: model.to_string(), - endpoint: endpoint.to_string(), - } - } - - fn create_vector(&self, content: &str) -> Result { - let bearer = format!("Bearer {}", self.api_key); - - // Create the request body for the API. - // https://platform.openai.com/docs/api-reference/embeddings/create - let body = json!({ - "input": content, - "model": self.model, - }); - - let client = Client::new(); - let response = client - .post(&self.endpoint) - .header("authorization", bearer) - .json(&body) - .send()?; - - // Get the JSON response from the API. - let json: Value = response.json()?; - let embedding = &json["data"][0]["embedding"]; - let vector: Vec = serde_json::from_value(embedding.clone())?; - - Ok(Vector::from(vector)) - } - - /// Set custom endpoint for the OpenAI API. - pub fn with_endpoint(&mut self, endpoint: &str) -> &mut Self { - // Validate the endpoint URL. - if !endpoint.starts_with("https://api.openai.com") { - panic!("Invalid OpenAI API endpoint: {endpoint}"); - } - - self.endpoint = endpoint.to_string(); - self - } -} From 64f7dc8b145ad2f86b187af6fba6c585922015ad Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Fri, 21 Jun 2024 18:37:11 -0500 Subject: [PATCH 02/88] chore: update cargo files --- Cargo.lock | 1970 +--------------------------------------------------- Cargo.toml | 60 +- 2 files changed, 5 insertions(+), 2025 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8c1501c6..7cad23ed 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,1974 +2,6 @@ # It is not intended for manual editing. 
version = 3 -[[package]] -name = "addr2line" -version = "0.21.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" -dependencies = [ - "gimli", -] - -[[package]] -name = "adler" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" - -[[package]] -name = "aho-corasick" -version = "1.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" -dependencies = [ - "memchr", -] - -[[package]] -name = "anes" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" - -[[package]] -name = "anstyle" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2faccea4cc4ab4a667ce676a30e8ec13922a692c99bb8f5b11f1502c72e04220" - -[[package]] -name = "autocfg" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" - -[[package]] -name = "backtrace" -version = "0.3.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" -dependencies = [ - "addr2line", - "cc", - "cfg-if", - "libc", - "miniz_oxide", - "object", - "rustc-demangle", -] - -[[package]] -name = "base64" -version = "0.22.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" - -[[package]] -name = "bincode" -version = "1.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" -dependencies = [ - "serde", -] - -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - -[[package]] -name = "bitflags" -version = "2.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" - -[[package]] -name = "bumpalo" -version = "3.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" - -[[package]] -name = "byteorder" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" - -[[package]] -name = "bytes" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" - -[[package]] -name = "cast" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" - -[[package]] -name = "cc" -version = "1.0.83" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" -dependencies = [ - "libc", -] - -[[package]] -name = "cfg-if" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" - -[[package]] -name = "ciborium" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" -dependencies = [ - "ciborium-io", - "ciborium-ll", - "serde", -] - -[[package]] -name = "ciborium-io" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" - -[[package]] -name = "ciborium-ll" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" -dependencies = [ - "ciborium-io", - "half", -] - -[[package]] -name = "clap" -version = "4.4.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e578d6ec4194633722ccf9544794b71b1385c3c027efe0c55db226fc880865c" -dependencies = [ - "clap_builder", -] - -[[package]] -name = "clap_builder" -version = "4.4.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4df4df40ec50c46000231c914968278b1eb05098cf8f1b3a518a95030e71d1c7" -dependencies = [ - "anstyle", - "clap_lex", -] - -[[package]] -name = "clap_lex" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1" - -[[package]] -name = "core-foundation" -version = "0.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" -dependencies = [ - "core-foundation-sys", - "libc", -] - -[[package]] -name = "core-foundation-sys" -version = "0.8.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" - -[[package]] -name = "crc32fast" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "criterion" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" -dependencies = [ - "anes", - "cast", - "ciborium", - "clap", - "criterion-plot", - "is-terminal", - "itertools", - "num-traits", - "once_cell", - "oorandom", - "plotters", - "rayon", - "regex", - "serde", - "serde_derive", - "serde_json", - "tinytemplate", - "walkdir", -] - -[[package]] -name = "criterion-plot" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" -dependencies = [ - "cast", - "itertools", -] - -[[package]] -name = "crossbeam-deque" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" -dependencies = [ - "crossbeam-epoch", - "crossbeam-utils", -] - -[[package]] -name = "crossbeam-epoch" -version = "0.9.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" -dependencies = [ - "crossbeam-utils", -] - -[[package]] -name = "crossbeam-utils" -version = "0.8.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" - -[[package]] -name = "crunchy" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" - -[[package]] -name = "dotenv" -version = "0.15.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77c90badedccf4105eca100756a0b1289e191f6fcbdadd3cee1d2f614f97da8f" - -[[package]] -name = "either" -version = "1.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" - -[[package]] -name = "encoding_rs" -version = "0.8.34" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "equivalent" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" - -[[package]] -name = "errno" -version = "0.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" -dependencies = [ - "libc", - "windows-sys 0.52.0", -] - -[[package]] -name = "fastrand" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" - -[[package]] -name = "fnv" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" - -[[package]] -name = "foreign-types" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" -dependencies = [ - "foreign-types-shared", -] - -[[package]] -name = "foreign-types-shared" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" - -[[package]] -name = "form_urlencoded" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" -dependencies = [ - "percent-encoding", -] - -[[package]] -name = "fs2" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" -dependencies = [ - "libc", - "winapi", -] - -[[package]] -name = "futures" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" -dependencies = [ - "futures-channel", - "futures-core", - "futures-executor", - "futures-io", - "futures-sink", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-channel" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" -dependencies = [ - "futures-core", - "futures-sink", -] - -[[package]] -name = "futures-core" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" - -[[package]] -name = "futures-executor" -version = "0.3.30" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" -dependencies = [ - "futures-core", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-io" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" - -[[package]] -name = "futures-macro" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "futures-sink" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" - -[[package]] -name = "futures-task" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" - -[[package]] -name = "futures-util" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" -dependencies = [ - "futures-channel", - "futures-core", - "futures-io", - "futures-macro", - "futures-sink", - "futures-task", - "memchr", - "pin-project-lite", - "pin-utils", - "slab", -] - -[[package]] -name = "fxhash" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" -dependencies = [ - "byteorder", -] - -[[package]] -name = "getrandom" -version = "0.2.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" -dependencies = [ - "cfg-if", - "libc", - "wasi", -] - -[[package]] -name = "gimli" -version = "0.28.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" - -[[package]] -name = "h2" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "816ec7294445779408f36fe57bc5b7fc1cf59664059096c65f905c1c61f58069" -dependencies = [ - "bytes", - "fnv", - "futures-core", - "futures-sink", - "futures-util", - "http", - "indexmap", - "slab", - "tokio", - "tokio-util", - "tracing", -] - -[[package]] -name = "half" -version = "2.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc52e53916c08643f1b56ec082790d1e86a32e58dc5268f897f313fbae7b4872" -dependencies = [ - "cfg-if", - "crunchy", -] - -[[package]] -name = "hashbrown" -version = "0.14.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" - -[[package]] -name = "heck" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" - -[[package]] -name = "hermit-abi" -version = "0.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d3d0e0f38255e7fa3cf31335b3a56f05febd18025f4db5ef7a0cfb4f8da651f" - -[[package]] -name = "http" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258" 
-dependencies = [ - "bytes", - "fnv", - "itoa", -] - -[[package]] -name = "http-body" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643" -dependencies = [ - "bytes", - "http", -] - -[[package]] -name = "http-body-util" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0475f8b2ac86659c21b64320d5d653f9efe42acd2a4e560073ec61a155a34f1d" -dependencies = [ - "bytes", - "futures-core", - "http", - "http-body", - "pin-project-lite", -] - -[[package]] -name = "httparse" -version = "1.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" - -[[package]] -name = "hyper" -version = "1.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe575dd17d0862a9a33781c8c4696a55c320909004a67a00fb286ba8b1bc496d" -dependencies = [ - "bytes", - "futures-channel", - "futures-util", - "h2", - "http", - "http-body", - "httparse", - "itoa", - "pin-project-lite", - "smallvec", - "tokio", - "want", -] - -[[package]] -name = "hyper-tls" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" -dependencies = [ - "bytes", - "http-body-util", - "hyper", - "hyper-util", - "native-tls", - "tokio", - "tokio-native-tls", - "tower-service", -] - -[[package]] -name = "hyper-util" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca38ef113da30126bbff9cd1705f9273e15d45498615d138b0c20279ac7a76aa" -dependencies = [ - "bytes", - "futures-channel", - "futures-util", - "http", - "http-body", - "hyper", - "pin-project-lite", - "socket2", - "tokio", - "tower", - "tower-service", - "tracing", -] - -[[package]] -name = "idna" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" -dependencies = [ - "unicode-bidi", - "unicode-normalization", -] - -[[package]] -name = "indexmap" -version = "2.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" -dependencies = [ - "equivalent", - "hashbrown", -] - -[[package]] -name = "indoc" -version = "2.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e186cfbae8084e513daff4240b4797e342f988cecda4fb6c939150f96315fd8" - -[[package]] -name = "instant" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "inventory" -version = "0.3.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f958d3d68f4167080a18141e10381e7634563984a537f2a49a30fd8e53ac5767" - -[[package]] -name = "ipnet" -version = "2.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" - -[[package]] -name = "is-terminal" -version = "0.4.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0bad00257d07be169d870ab665980b06cdb366d792ad690bf2e76876dc503455" -dependencies = [ - "hermit-abi", - "rustix", - "windows-sys 0.52.0", -] - -[[package]] -name = "itertools" 
-version = "0.10.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" -dependencies = [ - "either", -] - -[[package]] -name = "itoa" -version = "1.0.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" - -[[package]] -name = "jemalloc-ctl" -version = "0.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cffc705424a344c054e135d12ee591402f4539245e8bbd64e6c9eaa9458b63c" -dependencies = [ - "jemalloc-sys", - "libc", - "paste", -] - -[[package]] -name = "jemalloc-sys" -version = "0.5.4+5.3.0-patched" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac6c1946e1cea1788cbfde01c993b52a10e2da07f4bac608228d1bed20bfebf2" -dependencies = [ - "cc", - "libc", -] - -[[package]] -name = "jemallocator" -version = "0.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0de374a9f8e63150e6f5e8a60cc14c668226d7a347d8aee1a45766e3c4dd3bc" -dependencies = [ - "jemalloc-sys", - "libc", -] - -[[package]] -name = "js-sys" -version = "0.3.67" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a1d36f1235bc969acba30b7f5990b864423a6068a10f7c90ae8f0112e3a59d1" -dependencies = [ - "wasm-bindgen", -] - -[[package]] -name = "lazy_static" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" - -[[package]] -name = "libc" -version = "0.2.153" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" - -[[package]] -name = "linux-raw-sys" -version = "0.4.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" - -[[package]] -name = "lock_api" -version = "0.4.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" -dependencies = [ - "autocfg", - "scopeguard", -] - -[[package]] -name = "log" -version = "0.4.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" - -[[package]] -name = "memchr" -version = "2.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" - -[[package]] -name = "memoffset" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" -dependencies = [ - "autocfg", -] - -[[package]] -name = "mime" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" - -[[package]] -name = "miniz_oxide" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" -dependencies = [ - "adler", -] - -[[package]] -name = "mio" -version = "0.8.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" -dependencies = [ - "libc", - "wasi", - "windows-sys 0.48.0", 
-] - -[[package]] -name = "native-tls" -version = "0.2.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" -dependencies = [ - "lazy_static", - "libc", - "log", - "openssl", - "openssl-probe", - "openssl-sys", - "schannel", - "security-framework", - "security-framework-sys", - "tempfile", -] - -[[package]] -name = "num-traits" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c" -dependencies = [ - "autocfg", -] - -[[package]] -name = "num_cpus" -version = "1.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" -dependencies = [ - "hermit-abi", - "libc", -] - [[package]] name = "oasysdb" -version = "0.6.1" -dependencies = [ - "bincode", - "criterion", - "dotenv", - "futures", - "jemalloc-ctl", - "jemallocator", - "ordered-float", - "parking_lot 0.12.1", - "pyo3", - "rand", - "rayon", - "reqwest", - "serde", - "serde-big-array", - "serde_json", - "simsimd", - "sled", -] - -[[package]] -name = "object" -version = "0.32.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" -dependencies = [ - "memchr", -] - -[[package]] -name = "once_cell" -version = "1.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" - -[[package]] -name = "oorandom" -version = "11.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" - -[[package]] -name = "openssl" -version = "0.10.64" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95a0481286a310808298130d22dd1fef0fa571e05a8f44ec801801e84b216b1f" -dependencies = [ - "bitflags 2.4.2", - "cfg-if", - "foreign-types", - "libc", - "once_cell", - "openssl-macros", - "openssl-sys", -] - -[[package]] -name = "openssl-macros" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "openssl-probe" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" - -[[package]] -name = "openssl-sys" -version = "0.9.102" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c597637d56fbc83893a35eb0dd04b2b8e7a50c91e64e9493e398b5df4fb45fa2" -dependencies = [ - "cc", - "libc", - "pkg-config", - "vcpkg", -] - -[[package]] -name = "ordered-float" -version = "4.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a76df7075c7d4d01fdcb46c912dd17fba5b60c78ea480b475f2b6ab6f666584e" -dependencies = [ - "num-traits", -] - -[[package]] -name = "parking_lot" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" -dependencies = [ - "instant", - "lock_api", - "parking_lot_core 0.8.6", -] - -[[package]] -name = "parking_lot" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" -dependencies = [ - "lock_api", - "parking_lot_core 0.9.9", -] - -[[package]] -name = "parking_lot_core" -version = "0.8.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60a2cfe6f0ad2bfc16aefa463b497d5c7a5ecd44a23efa72aa342d90177356dc" -dependencies = [ - "cfg-if", - "instant", - "libc", - "redox_syscall 0.2.16", - "smallvec", - "winapi", -] - -[[package]] -name = "parking_lot_core" -version = "0.9.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" -dependencies = [ - "cfg-if", - "libc", - "redox_syscall 0.4.1", - "smallvec", - "windows-targets 0.48.5", -] - -[[package]] -name = "paste" -version = "1.0.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" - -[[package]] -name = "percent-encoding" -version = "2.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" - -[[package]] -name = "pin-project" -version = "1.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6bf43b791c5b9e34c3d182969b4abb522f9343702850a2e57f460d00d09b4b3" -dependencies = [ - "pin-project-internal", -] - -[[package]] -name = "pin-project-internal" -version = "1.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "pin-project-lite" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" - -[[package]] -name = "pin-utils" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" - -[[package]] -name = "pkg-config" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" - -[[package]] -name = "plotters" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2c224ba00d7cadd4d5c660deaf2098e5e80e07846537c51f9cfa4be50c1fd45" -dependencies = [ - "num-traits", - "plotters-backend", - "plotters-svg", - "wasm-bindgen", - "web-sys", -] - -[[package]] -name = "plotters-backend" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e76628b4d3a7581389a35d5b6e2139607ad7c75b17aed325f210aa91f4a9609" - -[[package]] -name = "plotters-svg" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38f6d39893cca0701371e3c27294f09797214b86f1fb951b89ade8ec04e2abab" -dependencies = [ - "plotters-backend", -] - -[[package]] -name = "portable-atomic" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0" - -[[package]] -name = "ppv-lite86" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" - -[[package]] -name = "proc-macro2" -version = "1.0.81" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d1597b0c024618f09a9c3b8655b7e430397a36d23fdafec26d6965e9eec3eba" -dependencies = [ - "unicode-ident", -] - -[[package]] -name = "pyo3" -version = "0.21.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5e00b96a521718e08e03b1a622f01c8a8deb50719335de3f60b3b3950f069d8" -dependencies = [ - "cfg-if", - "indoc", - "inventory", - "libc", - "memoffset", - "parking_lot 0.12.1", - "portable-atomic", - "pyo3-build-config", - "pyo3-ffi", - "pyo3-macros", - "unindent", -] - -[[package]] -name = "pyo3-build-config" -version = "0.21.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7883df5835fafdad87c0d888b266c8ec0f4c9ca48a5bed6bbb592e8dedee1b50" -dependencies = [ - "once_cell", - "target-lexicon", -] - -[[package]] -name = "pyo3-ffi" -version = "0.21.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01be5843dc60b916ab4dad1dca6d20b9b4e6ddc8e15f50c47fe6d85f1fb97403" -dependencies = [ - "libc", - "pyo3-build-config", -] - -[[package]] -name = "pyo3-macros" -version = "0.21.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77b34069fc0682e11b31dbd10321cbf94808394c56fd996796ce45217dfac53c" -dependencies = [ - "proc-macro2", - "pyo3-macros-backend", - "quote", - "syn", -] - -[[package]] -name = "pyo3-macros-backend" -version = "0.21.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08260721f32db5e1a5beae69a55553f56b99bd0e1c3e6e0a5e8851a9d0f5a85c" -dependencies = [ - "heck", - "proc-macro2", - "pyo3-build-config", - "quote", - "syn", -] - -[[package]] -name = "quote" -version = "1.0.36" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" -dependencies = [ - "proc-macro2", -] - -[[package]] -name = "rand" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" -dependencies = [ - "libc", - "rand_chacha", - "rand_core", -] - -[[package]] -name = "rand_chacha" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" -dependencies = [ - "ppv-lite86", - "rand_core", -] - -[[package]] -name = "rand_core" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" -dependencies = [ - "getrandom", -] - -[[package]] -name = "rayon" -version = "1.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c27db03db7734835b3f53954b534c91069375ce6ccaa2e065441e07d9b6cdb1" -dependencies = [ - "either", - "rayon-core", -] - -[[package]] -name = "rayon-core" -version = "1.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ce3fb6ad83f861aac485e76e1985cd109d9a3713802152be56c3b1f0e0658ed" -dependencies = [ - "crossbeam-deque", - "crossbeam-utils", -] - -[[package]] -name = "redox_syscall" -version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" -dependencies = [ - "bitflags 1.3.2", -] - -[[package]] -name = "redox_syscall" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" -dependencies = [ - "bitflags 1.3.2", -] - -[[package]] -name = "regex" -version = "1.10.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" -dependencies = [ - "aho-corasick", - "memchr", - "regex-automata", - "regex-syntax", -] - -[[package]] -name = "regex-automata" -version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bb987efffd3c6d0d8f5f89510bb458559eab11e4f869acb20bf845e016259cd" -dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", -] - -[[package]] -name = "regex-syntax" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" - -[[package]] -name = "reqwest" -version = "0.12.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "566cafdd92868e0939d3fb961bd0dc25fcfaaed179291093b3d43e6b3150ea10" -dependencies = [ - "base64", - "bytes", - "encoding_rs", - "futures-channel", - "futures-core", - "futures-util", - "h2", - "http", - "http-body", - "http-body-util", - "hyper", - "hyper-tls", - "hyper-util", - "ipnet", - "js-sys", - "log", - "mime", - "native-tls", - "once_cell", - "percent-encoding", - "pin-project-lite", - "rustls-pemfile", - "serde", - "serde_json", - "serde_urlencoded", - "sync_wrapper", - "system-configuration", - "tokio", - "tokio-native-tls", - "tower-service", - "url", - "wasm-bindgen", - "wasm-bindgen-futures", - "web-sys", - "winreg", -] - -[[package]] -name = "rustc-demangle" -version = "0.1.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" - -[[package]] -name = "rustix" -version = "0.38.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" -dependencies = [ - "bitflags 2.4.2", - "errno", - "libc", - "linux-raw-sys", - "windows-sys 0.52.0", -] - -[[package]] -name = "rustls-pemfile" -version = "2.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29993a25686778eb88d4189742cd713c9bce943bc54251a33509dc63cbacf73d" -dependencies = [ - "base64", - "rustls-pki-types", -] - -[[package]] -name = "rustls-pki-types" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "beb461507cee2c2ff151784c52762cf4d9ff6a61f3e80968600ed24fa837fa54" - -[[package]] -name = "ryu" -version = "1.0.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c" - -[[package]] -name = "same-file" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" -dependencies = [ - "winapi-util", -] - -[[package]] -name = "schannel" -version = "0.1.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" -dependencies = [ - "windows-sys 0.52.0", -] - -[[package]] -name = "scopeguard" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" - -[[package]] -name = "security-framework" -version = "2.10.0" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "770452e37cad93e0a50d5abc3990d2bc351c36d0328f86cefec2f2fb206eaef6" -dependencies = [ - "bitflags 1.3.2", - "core-foundation", - "core-foundation-sys", - "libc", - "security-framework-sys", -] - -[[package]] -name = "security-framework-sys" -version = "2.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "317936bbbd05227752583946b9e66d7ce3b489f84e11a94a510b4437fef407d7" -dependencies = [ - "core-foundation-sys", - "libc", -] - -[[package]] -name = "serde" -version = "1.0.198" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9846a40c979031340571da2545a4e5b7c4163bdae79b301d5f86d03979451fcc" -dependencies = [ - "serde_derive", -] - -[[package]] -name = "serde-big-array" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11fc7cc2c76d73e0f27ee52abbd64eec84d46f370c88371120433196934e4b7f" -dependencies = [ - "serde", -] - -[[package]] -name = "serde_derive" -version = "1.0.198" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e88edab869b01783ba905e7d0153f9fc1a6505a96e4ad3018011eedb838566d9" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "serde_json" -version = "1.0.116" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e17db7126d17feb94eb3fad46bf1a96b034e8aacbc2e775fe81505f8b0b2813" -dependencies = [ - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "serde_urlencoded" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" -dependencies = [ - "form_urlencoded", - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "simsimd" -version = "4.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc9b7427cabeed25b18b43cc7d7ec466d8d1953a13ed56c46dc414c99ca4754e" -dependencies = [ - "cc", -] - -[[package]] -name = "slab" -version = "0.4.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" -dependencies = [ - "autocfg", -] - -[[package]] -name = "sled" -version = "0.34.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f96b4737c2ce5987354855aed3797279def4ebf734436c6aa4552cf8e169935" -dependencies = [ - "crc32fast", - "crossbeam-epoch", - "crossbeam-utils", - "fs2", - "fxhash", - "libc", - "log", - "parking_lot 0.11.2", -] - -[[package]] -name = "smallvec" -version = "1.13.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" - -[[package]] -name = "socket2" -version = "0.5.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" -dependencies = [ - "libc", - "windows-sys 0.52.0", -] - -[[package]] -name = "syn" -version = "2.0.60" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "909518bc7b1c9b779f1bbf07f2929d35af9f0f37e47c6e9ef7f9dddc1e1821f3" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "sync_wrapper" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" - -[[package]] -name = 
"system-configuration" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7" -dependencies = [ - "bitflags 1.3.2", - "core-foundation", - "system-configuration-sys", -] - -[[package]] -name = "system-configuration-sys" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a75fb188eb626b924683e3b95e3a48e63551fcfb51949de2f06a9d91dbee93c9" -dependencies = [ - "core-foundation-sys", - "libc", -] - -[[package]] -name = "target-lexicon" -version = "0.12.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69758bda2e78f098e4ccb393021a0963bb3442eac05f135c30f61b7370bbafae" - -[[package]] -name = "tempfile" -version = "3.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" -dependencies = [ - "cfg-if", - "fastrand", - "rustix", - "windows-sys 0.52.0", -] - -[[package]] -name = "tinytemplate" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" -dependencies = [ - "serde", - "serde_json", -] - -[[package]] -name = "tinyvec" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" -dependencies = [ - "tinyvec_macros", -] - -[[package]] -name = "tinyvec_macros" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" - -[[package]] -name = "tokio" -version = "1.37.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787" -dependencies = [ - "backtrace", - "bytes", - "libc", - "mio", - "num_cpus", - "pin-project-lite", - "socket2", - "windows-sys 0.48.0", -] - -[[package]] -name = "tokio-native-tls" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" -dependencies = [ - "native-tls", - "tokio", -] - -[[package]] -name = "tokio-util" -version = "0.7.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cf6b47b3771c49ac75ad09a6162f53ad4b8088b76ac60e8ec1455b31a189fe1" -dependencies = [ - "bytes", - "futures-core", - "futures-sink", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "tower" -version = "0.4.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" -dependencies = [ - "futures-core", - "futures-util", - "pin-project", - "pin-project-lite", - "tokio", - "tower-layer", - "tower-service", - "tracing", -] - -[[package]] -name = "tower-layer" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0" - -[[package]] -name = "tower-service" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" - -[[package]] -name = "tracing" -version = "0.1.40" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" -dependencies = [ - "log", - "pin-project-lite", - "tracing-core", -] - -[[package]] -name = "tracing-core" -version = "0.1.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" -dependencies = [ - "once_cell", -] - -[[package]] -name = "try-lock" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" - -[[package]] -name = "unicode-bidi" -version = "0.3.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" - -[[package]] -name = "unicode-ident" -version = "1.0.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" - -[[package]] -name = "unicode-normalization" -version = "0.1.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" -dependencies = [ - "tinyvec", -] - -[[package]] -name = "unindent" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" - -[[package]] -name = "url" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" -dependencies = [ - "form_urlencoded", - "idna", - "percent-encoding", -] - -[[package]] -name = "vcpkg" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" - -[[package]] -name = "walkdir" -version = "2.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d71d857dc86794ca4c280d616f7da00d2dbfd8cd788846559a6813e6aa4b54ee" -dependencies = [ - "same-file", - "winapi-util", -] - -[[package]] -name = "want" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" -dependencies = [ - "try-lock", -] - -[[package]] -name = "wasi" -version = "0.11.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" - -[[package]] -name = "wasm-bindgen" -version = "0.2.90" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1223296a201415c7fad14792dbefaace9bd52b62d33453ade1c5b5f07555406" -dependencies = [ - "cfg-if", - "wasm-bindgen-macro", -] - -[[package]] -name = "wasm-bindgen-backend" -version = "0.2.90" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcdc935b63408d58a32f8cc9738a0bffd8f05cc7c002086c6ef20b7312ad9dcd" -dependencies = [ - "bumpalo", - "log", - "once_cell", - "proc-macro2", - "quote", - "syn", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-futures" -version = "0.4.40" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bde2032aeb86bdfaecc8b261eef3cba735cc426c1f3a3416d1e0791be95fc461" -dependencies = [ - "cfg-if", - "js-sys", - "wasm-bindgen", - "web-sys", -] - -[[package]] -name = "wasm-bindgen-macro" -version = "0.2.90" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e4c238561b2d428924c49815533a8b9121c664599558a5d9ec51f8a1740a999" -dependencies = [ - "quote", - "wasm-bindgen-macro-support", -] - -[[package]] -name = "wasm-bindgen-macro-support" -version = "0.2.90" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bae1abb6806dc1ad9e560ed242107c0f6c84335f1749dd4e8ddb012ebd5e25a7" -dependencies = [ - "proc-macro2", - "quote", - "syn", - "wasm-bindgen-backend", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-shared" -version = "0.2.90" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d91413b1c31d7539ba5ef2451af3f0b833a005eb27a631cec32bc0635a8602b" - -[[package]] -name = "web-sys" -version = "0.3.67" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58cd2333b6e0be7a39605f0e255892fd7418a682d8da8fe042fe25128794d2ed" -dependencies = [ - "js-sys", - "wasm-bindgen", -] - -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - -[[package]] -name = "winapi-util" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" -dependencies = [ - "winapi", -] - -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - -[[package]] -name = "windows-sys" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" -dependencies = [ - "windows-targets 0.48.5", -] - -[[package]] -name = "windows-sys" -version = "0.52.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" -dependencies = [ - "windows-targets 0.52.0", -] - -[[package]] -name = "windows-targets" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" -dependencies = [ - "windows_aarch64_gnullvm 0.48.5", - "windows_aarch64_msvc 0.48.5", - "windows_i686_gnu 0.48.5", - "windows_i686_msvc 0.48.5", - "windows_x86_64_gnu 0.48.5", - "windows_x86_64_gnullvm 0.48.5", - "windows_x86_64_msvc 0.48.5", -] - -[[package]] -name = "windows-targets" -version = "0.52.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" -dependencies = [ - "windows_aarch64_gnullvm 0.52.0", - "windows_aarch64_msvc 0.52.0", - "windows_i686_gnu 0.52.0", - "windows_i686_msvc 0.52.0", - "windows_x86_64_gnu 0.52.0", - "windows_x86_64_gnullvm 0.52.0", - "windows_x86_64_msvc 0.52.0", -] - -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" - -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.52.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" - -[[package]] -name = "windows_aarch64_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" - -[[package]] -name = "windows_aarch64_msvc" -version = "0.52.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" - -[[package]] -name = "windows_i686_gnu" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" - -[[package]] -name = "windows_i686_gnu" -version = "0.52.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" - -[[package]] -name = "windows_i686_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" - -[[package]] -name = "windows_i686_msvc" -version = "0.52.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" - -[[package]] -name = "windows_x86_64_gnu" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" - -[[package]] -name = "windows_x86_64_gnu" -version = "0.52.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" - -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" - -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.52.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" - -[[package]] -name = "windows_x86_64_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" - -[[package]] -name = "windows_x86_64_msvc" -version = "0.52.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" - -[[package]] -name = "winreg" -version = "0.52.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a277a57398d4bfa075df44f501a17cfdf8542d224f0d36095a2adc7aee4ef0a5" -dependencies = [ - "cfg-if", - "windows-sys 0.48.0", -] +version = "0.7.0" diff --git a/Cargo.toml b/Cargo.toml index b8faf43e..cf2d0171 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,77 +1,25 @@ [package] name = "oasysdb" -version = "0.6.1" +version = "0.7.0" edition = "2021" license = "Apache-2.0" readme = "readme.md" # Information. authors = ["Edwin Kys", "Oasys"] -description = "Fast & flexible embedded vector database with incremental HNSW indexing." +description = "Fast & scalable vector store with ground-up hybrid ANN search support." 
homepage = "https://docs.oasysdb.com" repository = "https://github.com/oasysai/oasysdb" # Metadata. -keywords = ["embedded", "vector", "database", "hnsw", "ann"] -categories = ["database", "algorithms", "embedded"] - -# For PyO3 and Maturin. -[lib] -name = "oasysdb" -crate-type = ["lib", "cdylib"] +keywords = ["vector", "database", "anns", "search", "simd"] +categories = ["database", "algorithms", "data-structures"] [dependencies] -sled = "0.34.7" - -# Utilities. -rayon = "1.8.0" -parking_lot = "0.12.1" -ordered-float = "4.2.0" -rand = "0.8.5" -simsimd = "4.3.1" - -# Serialization. -serde = { version = "1.0.193", features = ["derive"] } -serde-big-array = "0.5.1" -bincode = "1.3.3" - -# Interoperability. -serde_json = { version = "1.0.116", optional = true } - -# Other optionals. -[dependencies.reqwest] -version = "0.12.4" -features = ["blocking", "json"] -optional = true - -# Python bindings tool. -[dependencies.pyo3] -version = "0.21.2" -features = ["experimental-async", "gil-refs", "multiple-pymethods"] -optional = true [dev-dependencies] -criterion = { version = "0.5.1", features = ["html_reports"] } -dotenv = "0.15.0" - -# Memory management. -jemallocator = "0.5.4" -jemalloc-ctl = "0.5.4" - -# Async handling. -futures = "0.3.30" - -[features] -gen = ["dep:reqwest", "json"] -json = ["dep:serde_json"] -py = ["dep:pyo3"] [profile.release] lto = true opt-level = "z" codegen-units = 1 - -[[bench]] -name = "benchmark" -path = "bench/main.rs" -harness = false From f36564f64fddf54ee55887ce0384745a0bffeecc Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Fri, 21 Jun 2024 18:42:03 -0500 Subject: [PATCH 03/88] fix: remove missing docs warning --- src/main.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main.rs b/src/main.rs index 57906394..7ae1c301 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,4 +1,3 @@ -#![warn(missing_docs)] #![warn(unused_qualifications)] fn main() {} From 092afe5587842dbc465b7a73313284bcec6acc7a Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Fri, 21 Jun 2024 18:59:23 -0500 Subject: [PATCH 04/88] feat: update contributing docs --- docs/contributing.md | 45 +++++++------------------------------------- 1 file changed, 7 insertions(+), 38 deletions(-) diff --git a/docs/contributing.md b/docs/contributing.md index ff5f518a..9261bb0e 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -31,48 +31,19 @@ We try to prioritize features and bug fixes that are on our roadmap or requested For features, we try to prioritize features that are backed by real-world use cases. If you have a use case for a feature, please include it in the issue. We'd love to hear about it! -# Getting started +## Getting started -Getting started with OasysDB development is easy. +Getting started with OasysDB development is pretty straightforward. -You will need to have Rust installed. We recommend using [rustup](https://www.rust-lang.org/tools/install) to install Rust. We also recommend having rust-analyzer installed for your code editor. +First, you will need to have Rust installed on your machine. We recommend using [rustup](https://www.rust-lang.org/tools/install) to install Rust. We also recommend having rust-analyzer installed for your code editor for a better development experience. -After that, you need to install Maturin, which is a Python library used by OasysDB for building and publishing its Python packages. You can install Maturin using the following command: - -```bash -pip install maturin -``` - -After setting up Maturin, fork the repository and clone it to your local machine. 
Then, in the root directory of the project, you need to set up and activate a Python virtual environment for the project with the dependencies listed in `requirements.txt`.
-
-Depending on the features you want to work on, you may need to create a `.env` file in the root directory of the repository. The `.env` file should contain the variables listed in the `.env.example` file with the appropriate values.
-
-Once everything is set, you can run the following commands in the root directory of the repository:
-
-```bash
-# Run Rust tests, add feature flags as needed.
-cargo test
-
-# Install OasysDB as a Python package.
-maturin dev
-
-# Run Python tests.
-pytest
-```
-
-These commands will run the tests to make sure that everything is working as expected before you start working on your changes.
-
-```bash
-cargo bench
-```
-
-This command will run the benchmarks to measure the performance of the vector database. This is useful to make sure that your changes don't introduce any significant performance regressions.
+TODO: Complete the getting started guide.
 
 ## Style guide
 
-We use mostly the default linting and style guide for Rust except for some linting changes listed in rustfmt.toml file. For more information, see the [Rust Style Guide](https://doc.rust-lang.org/beta/style-guide/index.html).
+We mostly use the default linting and style guide for Rust except for some linting changes listed in the rustfmt.toml file. For more information about the code style, see the [Rust Style Guide](https://doc.rust-lang.org/beta/style-guide/index.html).
 
-For commit messages, we use the [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/) format. This allows us to maintain consistency and readability in our commit messages.
+For commit messages, we use the [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/) format. This allows us to maintain consistency and readability in our Git commit history.
 
 When commenting your code, please try your best to write comments that are clear and concise with proper English sentence capitalization and punctuation. This will help us and the community understand your code better and keep the codebase maintainable.
 
@@ -80,7 +51,7 @@ When commenting your code, please try your best to write comments that are clear
 
 Once you have made your changes, you can submit a pull request. We will review your pull request and provide feedback. If your pull request is accepted, we will merge it into the main branch.
 
-For organization purposes, we ask that you use the following format for your pull request title in lowercase:
+For organization purposes, we ask that you use the [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/) format for your pull request title in lowercase:
 
 ```
 <type>: <description>
 ```
@@ -93,8 +64,6 @@ feat: add support ...
 fix: fix issue ...
 ```
 
-This is similar to the format used in [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/).
-
 ## Conclusion
 
 Thank you for taking the time to read this documentation. We look forward to your contributions! Another way to support this project is to star this project, share it with your circles, and join us on [Discord](https://discord.gg/bDhQrkqNP4).
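The next patch introduces a crate-wide error type in src/types/error.rs along with a collection stub. As orientation only, here is a minimal sketch of the shape such a custom error type usually takes in Rust; the names (`Error`, `ErrorCode`, `message`) and the `std::io::Error` conversion are illustrative assumptions, not the actual OasysDB definitions.

```rust
use std::error::Error as StdError;
use std::fmt;

/// Broad failure categories; hypothetical variants for illustration.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ErrorCode {
    NotFound,
    InvalidArgument,
    Internal,
}

/// Crate-wide error carrying a category and a human-readable message.
#[derive(Debug)]
pub struct Error {
    pub code: ErrorCode,
    pub message: String,
}

impl Error {
    pub fn new(code: ErrorCode, message: impl Into<String>) -> Self {
        Self { code, message: message.into() }
    }
}

impl fmt::Display for Error {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{:?}: {}", self.code, self.message)
    }
}

impl StdError for Error {}

// A conversion from std::io::Error keeps the `?` operator ergonomic
// when persistence code bubbles failures up as the crate error.
impl From<std::io::Error> for Error {
    fn from(err: std::io::Error) -> Self {
        Self::new(ErrorCode::Internal, err.to_string())
    }
}

fn main() {
    let err = Error::new(ErrorCode::NotFound, "collection not found");
    println!("{err}");
}
```

Centralizing failures in one type like this keeps `Result<T, Error>` uniform across modules and lets callers match on an error category instead of parsing message strings.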
From eeddd7739cf9f499ca12770f16a25dd12463417a Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Sat, 22 Jun 2024 08:42:20 -0500 Subject: [PATCH 05/88] feat: add custom error type and collection --- Cargo.lock | 928 +++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 1 + src/db/collection.rs | 19 + src/db/mod.rs | 3 + src/main.rs | 3 + src/types/error.rs | 49 +++ src/types/mod.rs | 1 + 7 files changed, 1004 insertions(+) create mode 100644 src/db/collection.rs create mode 100644 src/db/mod.rs create mode 100644 src/types/error.rs create mode 100644 src/types/mod.rs diff --git a/Cargo.lock b/Cargo.lock index 7cad23ed..bfbbae2e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,934 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "ahash" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" +dependencies = [ + "cfg-if", + "const-random", + "getrandom", + "once_cell", + "version_check", + "zerocopy", +] + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "arrow" +version = "52.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ae9728f104939be6d8d9b368a354b4929b0569160ea1641f0721b55a861ce38" +dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-csv", + "arrow-data", + "arrow-ipc", + "arrow-json", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", +] + +[[package]] +name = "arrow-arith" +version = "52.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7029a5b3efbeafbf4a12d12dc16b8f9e9bff20a410b8c25c5d28acc089e1043" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "num", +] + +[[package]] +name = "arrow-array" +version = "52.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d33238427c60271710695f17742f45b1a5dc5bcfc5c15331c25ddfe7abf70d97" +dependencies = [ + "ahash", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "hashbrown", + "num", +] + +[[package]] +name = "arrow-buffer" +version = "52.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe9b95e825ae838efaf77e366c00d3fc8cca78134c9db497d6bda425f2e7b7c1" +dependencies = [ + "bytes", + "half", + "num", +] + +[[package]] +name = "arrow-cast" +version = "52.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87cf8385a9d5b5fcde771661dd07652b79b9139fea66193eda6a88664400ccab" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "atoi", + "base64", + "chrono", + "half", + "lexical-core", + "num", + "ryu", +] + +[[package]] +name = "arrow-csv" +version = "52.0.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "cea5068bef430a86690059665e40034625ec323ffa4dd21972048eebb0127adc" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "chrono", + "csv", + "csv-core", + "lazy_static", + "lexical-core", + "regex", +] + +[[package]] +name = "arrow-data" +version = "52.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb29be98f987bcf217b070512bb7afba2f65180858bca462edf4a39d84a23e10" +dependencies = [ + "arrow-buffer", + "arrow-schema", + "half", + "num", +] + +[[package]] +name = "arrow-ipc" +version = "52.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffc68f6523970aa6f7ce1dc9a33a7d9284cfb9af77d4ad3e617dbe5d79cc6ec8" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "flatbuffers", +] + +[[package]] +name = "arrow-json" +version = "52.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2041380f94bd6437ab648e6c2085a045e45a0c44f91a1b9a4fe3fed3d379bfb1" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "indexmap", + "lexical-core", + "num", + "serde", + "serde_json", +] + +[[package]] +name = "arrow-ord" +version = "52.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcb56ed1547004e12203652f12fe12e824161ff9d1e5cf2a7dc4ff02ba94f413" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "half", + "num", +] + +[[package]] +name = "arrow-row" +version = "52.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "575b42f1fc588f2da6977b94a5ca565459f5ab07b60545e17243fb9a7ed6d43e" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "half", + "hashbrown", +] + +[[package]] +name = "arrow-schema" +version = "52.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32aae6a60458a2389c0da89c9de0b7932427776127da1a738e2efc21d32f3393" + +[[package]] +name = "arrow-select" +version = "52.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de36abaef8767b4220d7b4a8c2fe5ffc78b47db81b03d77e2136091c3ba39102" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "num", +] + +[[package]] +name = "arrow-string" +version = "52.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e435ada8409bcafc910bc3e0077f532a4daa20e99060a496685c0e3e53cc2597" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "memchr", + "num", + "regex", + "regex-syntax", +] + +[[package]] +name = "atoi" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" +dependencies = [ + "num-traits", +] + +[[package]] +name = "autocfg" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" + +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bumpalo" +version = "3.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" + +[[package]] +name = "bytes" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" + +[[package]] +name = "cc" +version = "1.0.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96c51067fd44124faa7f870b4b1c969379ad32b2ba805aa959430ceaa384f695" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "chrono" +version = "0.4.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401" +dependencies = [ + "android-tzdata", + "iana-time-zone", + "num-traits", + "windows-targets", +] + +[[package]] +name = "const-random" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom", + "once_cell", + "tiny-keccak", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" + +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + +[[package]] +name = "csv" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" +dependencies = [ + "memchr", +] + +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + +[[package]] +name = "flatbuffers" +version = "24.3.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8add37afff2d4ffa83bc748a70b4b1370984f6980768554182424ef71447c35f" +dependencies = [ + "bitflags", + "rustc_version", +] + +[[package]] +name = "getrandom" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "half" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" +dependencies = [ 
+ "cfg-if", + "crunchy", + "num-traits", +] + +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + +[[package]] +name = "iana-time-zone" +version = "0.1.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "indexmap" +version = "2.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "itoa" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" + +[[package]] +name = "js-sys" +version = "0.3.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "lexical-core" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2cde5de06e8d4c2faabc400238f9ae1c74d5412d03a7bd067645ccbc47070e46" +dependencies = [ + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", +] + +[[package]] +name = "lexical-parse-float" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683b3a5ebd0130b8fb52ba0bdc718cc56815b6a097e28ae5a6997d0ad17dc05f" +dependencies = [ + "lexical-parse-integer", + "lexical-util", + "static_assertions", +] + +[[package]] +name = "lexical-parse-integer" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d0994485ed0c312f6d965766754ea177d07f9c00c9b82a5ee62ed5b47945ee9" +dependencies = [ + "lexical-util", + "static_assertions", +] + +[[package]] +name = "lexical-util" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5255b9ff16ff898710eb9eb63cb39248ea8a5bb036bea8085b1a767ff6c4e3fc" +dependencies = [ + "static_assertions", +] + +[[package]] +name = "lexical-write-float" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accabaa1c4581f05a3923d1b4cfd124c329352288b7b9da09e766b0668116862" +dependencies = [ + "lexical-util", + "lexical-write-integer", + "static_assertions", +] + +[[package]] +name = "lexical-write-integer" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1b6f3d1f4422866b68192d62f77bc5c700bee84f3069f2469d7bc8c77852446" +dependencies = [ + "lexical-util", + "static_assertions", +] + +[[package]] +name = "libc" +version = "0.2.155" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" + +[[package]] +name = "libm" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" + +[[package]] +name = "log" +version = "0.4.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "num" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c165a9ab64cf766f73521c0dd2cfdff64f488b8f0b3e621face3462d3db536d7" +dependencies = [ + "num-integer", + "num-traits", +] + +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", + "libm", +] + [[package]] name = "oasysdb" version = "0.7.0" +dependencies = [ + "arrow", +] + +[[package]] +name = "once_cell" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" + +[[package]] +name = "proc-macro2" +version = "1.0.86" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "regex" +version = "1.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f" +dependencies = [ + "aho-corasick", + "memchr", + 
"regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" + +[[package]] +name = "rustc_version" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +dependencies = [ + "semver", +] + +[[package]] +name = "ryu" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" + +[[package]] +name = "semver" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" + +[[package]] +name = "serde" +version = "1.0.203" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.203" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "455182ea6142b14f93f4bc5320a2b31c1f266b66a4a5c858b013302a5d8cbfc3" +dependencies = [ + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "syn" +version = "2.0.67" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff8655ed1d86f3af4ee3fd3263786bc14245ad17c4c7e85ba7187fb3ae028c90" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "wasm-bindgen" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.92" +source 
= "registry+https://github.com/rust-lang/crates.io-index" +checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" + +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" + +[[package]] +name = "zerocopy" +version = "0.7.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "ae87e3fcd617500e5d106f0380cf7b77f3c6092aae37191433159dda23cfb087" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15e934569e47891f7d9411f1a451d947a60e000ab3bd24fbb970f000387d1b3b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/Cargo.toml b/Cargo.toml index cf2d0171..633c472b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,7 @@ keywords = ["vector", "database", "anns", "search", "simd"] categories = ["database", "algorithms", "data-structures"] [dependencies] +arrow = "52.0.0" [dev-dependencies] diff --git a/src/db/collection.rs b/src/db/collection.rs new file mode 100644 index 00000000..ae2a974f --- /dev/null +++ b/src/db/collection.rs @@ -0,0 +1,19 @@ +use super::*; +use arrow::array::RecordBatch; +use arrow::datatypes::{Field, Schema}; +use std::sync::{Arc, RwLock}; + +pub type ArcLock = Arc>; + +pub struct Collection { + schema: ArcLock, + data: ArcLock>, +} + +impl Collection { + pub fn new() -> Self { + let schema = Arc::new(RwLock::new(Schema::empty())); + let data = Arc::new(RwLock::new(vec![])); + Self { schema, data } + } +} diff --git a/src/db/mod.rs b/src/db/mod.rs new file mode 100644 index 00000000..4e205cca --- /dev/null +++ b/src/db/mod.rs @@ -0,0 +1,3 @@ +use crate::types::error::{Error, ErrorCode}; + +mod collection; diff --git a/src/main.rs b/src/main.rs index 7ae1c301..6f61b5c7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,6 @@ #![warn(unused_qualifications)] +mod db; +mod types; + fn main() {} diff --git a/src/types/error.rs b/src/types/error.rs new file mode 100644 index 00000000..b25ad837 --- /dev/null +++ b/src/types/error.rs @@ -0,0 +1,49 @@ +use std::fmt::{Display, Formatter, Result}; + +// Other error types. +use std::error::Error as StandardError; +use std::sync::PoisonError; + +#[derive(Debug, PartialEq, Eq)] +pub enum ErrorCode { + StandardError, + ConcurrencyError, +} + +#[derive(Debug)] +pub struct Error { + pub code: ErrorCode, + pub message: String, +} + +impl Error { + pub fn new(code: &ErrorCode, message: &str) -> Self { + Self { code: *code, message: message.to_string() } + } +} + +impl Display for Error { + fn fmt(&self, f: &mut Formatter) -> Result { + let code = &self.code; + let message = &self.message; + write!(f, "{code:?}: {message}") + } +} + +// Implement other interoperability to other error types. 
From 02ef480b0bf106976d2c8dc475b9580de159c2c4 Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Sat, 22 Jun 2024 14:03:01 -0500
Subject: [PATCH 06/88] build: add rayon

---
 Cargo.lock | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 Cargo.toml |  1 +
 2 files changed, 53 insertions(+)

diff --git a/Cargo.lock b/Cargo.lock
index bfbbae2e..2765b700 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -344,6 +344,31 @@ version = "0.8.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f"
 
+[[package]]
+name = "crossbeam-deque"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d"
+dependencies = [
+ "crossbeam-epoch",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-epoch"
+version = "0.9.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
+dependencies = [
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-utils"
+version = "0.8.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80"
+
 [[package]]
 name = "crunchy"
 version = "0.2.2"
@@ -371,6 +396,12 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "either"
+version = "1.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b"
+
 [[package]]
 name = "equivalent"
 version = "1.0.1"
@@ -636,6 +667,7 @@ name = "oasysdb"
 version = "0.7.0"
 dependencies = [
  "arrow",
+ "rayon",
 ]
 
 [[package]]
@@ -662,6 +694,26 @@ dependencies = [
  "proc-macro2",
 ]
 
+[[package]]
+name = "rayon"
+version = "1.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
+dependencies = [
+ "either",
+ "rayon-core",
+]
+
+[[package]]
+name = "rayon-core"
+version = "1.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
+dependencies = [
+ "crossbeam-deque",
+ "crossbeam-utils",
+]
+
 [[package]]
 name = "regex"
 version = "1.10.5"
diff --git a/Cargo.toml b/Cargo.toml
index 633c472b..e5522bb3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -17,6 +17,7 @@ categories = ["database", "algorithms", "data-structures"]
 
 [dependencies]
 arrow = "52.0.0"
+rayon = "1.10.0"
 
 [dev-dependencies]
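Rayon enters the tree here but isn't exercised until the collection code later in the series. Its core primitive is worth a quick illustration: `par_iter` splits a slice across a work-stealing thread pool while keeping the same combinator API as standard iterators. A minimal sketch, not taken from the patches; the function is hypothetical:

```rust
use rayon::prelude::*;

// Hypothetical: map over a batch on all available cores. The closure runs on
// parallel chunks, and `collect` reassembles the results in original order.
fn squares(values: &[u64]) -> Vec<u64> {
    values.par_iter().map(|v| v * v).collect()
}
```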
From dc35fe1ba7a61c4b55da855b86d1e23e614cb3c6 Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Sat, 22 Jun 2024 14:06:29 -0500
Subject: [PATCH 07/88] feat: add arrow error compat

---
 src/types/error.rs | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/types/error.rs b/src/types/error.rs
index b25ad837..efeede95 100644
--- a/src/types/error.rs
+++ b/src/types/error.rs
@@ -1,13 +1,15 @@
 use std::fmt::{Display, Formatter, Result};
 
 // Other error types.
+use arrow::error::ArrowError;
 use std::error::Error as StandardError;
 use std::sync::PoisonError;
 
-#[derive(Debug, PartialEq, Eq)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum ErrorCode {
-    StandardError,
-    ConcurrencyError,
+    Standard,
+    Concurrency,
+    Arrow,
 }
 
 #[derive(Debug)]
@@ -36,14 +38,21 @@ impl StandardError for Error {}
 
 impl From<Box<dyn StandardError>> for Error {
     fn from(err: Box<dyn StandardError>) -> Self {
-        let code = ErrorCode::StandardError;
+        let code = ErrorCode::Standard;
         Error::new(&code, &err.to_string())
     }
 }
 
 impl<T> From<PoisonError<T>> for Error {
     fn from(err: PoisonError<T>) -> Self {
-        let code = ErrorCode::ConcurrencyError;
+        let code = ErrorCode::Concurrency;
+        Error::new(&code, &err.to_string())
+    }
+}
+
+impl From<ArrowError> for Error {
+    fn from(err: ArrowError) -> Self {
+        let code = ErrorCode::Arrow;
         Error::new(&code, &err.to_string())
     }
 }
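With `From<ArrowError>` in place, fallible Arrow calls can be chained with `?` inside any function that returns the crate's `Error`, which is exactly the pattern the collection code adopts two patches below. A hedged sketch, assuming the error type above; the helper is hypothetical:

```rust
use arrow::datatypes::Schema;
use crate::types::error::Error;

// Hypothetical: Schema::try_merge returns Result<Schema, ArrowError>, and
// `?` rewraps a failure as Error { code: ErrorCode::Arrow, .. }.
fn merge(a: Schema, b: Schema) -> Result<Schema, Error> {
    Ok(Schema::try_merge(vec![a, b])?)
}
```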
From 0d1a01499f67eb4cdd54ee68db4017b4f8ac84e4 Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Sat, 22 Jun 2024 14:26:34 -0500
Subject: [PATCH 08/88] ci: improve ci/cd pipeline

---
 .github/workflows/publish-docs.yml |  4 +-
 .github/workflows/server-ci.yml    | 61 ++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/server-ci.yml

diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml
index 5ce54b08..5c0b32ae 100644
--- a/.github/workflows/publish-docs.yml
+++ b/.github/workflows/publish-docs.yml
@@ -1,4 +1,4 @@
-name: Publish documentation website
+name: Publish documentation
 
 on:
   workflow_dispatch:
@@ -41,7 +41,7 @@ jobs:
     runs-on: ubuntu-latest
     needs: build-docs
     environment:
-      name: documentation
+      name: Docs
       url: ${{ steps.deployment.outputs.page_url }}
     steps:
       - name: Checkout
diff --git a/.github/workflows/server-ci.yml b/.github/workflows/server-ci.yml
new file mode 100644
index 00000000..910a7856
--- /dev/null
+++ b/.github/workflows/server-ci.yml
@@ -0,0 +1,61 @@
+name: CI checks for the server
+
+on:
+  workflow_dispatch:
+
+  pull_request:
+    paths-ignore:
+      - "docs/**"
+      - "clients/**"
+
+  push:
+    branches:
+      - main
+    paths-ignore:
+      - "docs/**"
+      - "clients/**"
+
+jobs:
+  rustfmt-format:
+    name: Check code formatting
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout the code
+        uses: actions/checkout@v4
+
+      - name: Install Rust toolchain
+        uses: dtolnay/rust-toolchain@stable
+        with:
+          components: rustfmt
+
+      - name: Run cargo fmt with check
+        run: cargo fmt -- --check
+
+  clippy-lint:
+    name: Lint code with Clippy
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout the code
+        uses: actions/checkout@v4
+
+      - name: Install Rust toolchain
+        uses: dtolnay/rust-toolchain@stable
+        with:
+          components: clippy
+
+      - name: Run cargo clippy
+        run: cargo clippy -- -D warnings
+
+  run-rust-tests:
+    name: Run Rust tests
+    needs: clippy-lint
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout the code
+        uses: actions/checkout@v4
+
+      - name: Install Rust toolchain
+        uses: dtolnay/rust-toolchain@stable
+
+      - name: Run cargo test
+        run: cargo test --all-features -- --test-threads 1

From 1a5ce4b87a622a651f7864328b0f9e6fe1a63051 Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Sat, 22 Jun 2024 14:28:24 -0500
Subject: [PATCH 09/88] feat: add initial collection test

---
 src/db/collection.rs         | 52 +++++++++++++++++++++++++++++-------
 src/db/mod.rs                |  5 ++--
 src/main.rs                  |  5 ++++
 src/tests/mod.rs             |  3 +++
 src/tests/test_collection.rs | 22 +++++++++++++++
 5 files changed, 76 insertions(+), 11 deletions(-)
 create mode 100644 src/tests/mod.rs
 create mode 100644 src/tests/test_collection.rs

diff --git a/src/db/collection.rs b/src/db/collection.rs
index ae2a974f..92001f26 100644
--- a/src/db/collection.rs
+++ b/src/db/collection.rs
@@ -1,19 +1,53 @@
 use super::*;
 use arrow::array::RecordBatch;
-use arrow::datatypes::{Field, Schema};
-use std::sync::{Arc, RwLock};
-
-pub type ArcLock<T> = Arc<RwLock<T>>;
+use arrow::datatypes::{Fields, Schema};
+use std::sync::{Arc, RwLock as Lock};
 
 pub struct Collection {
-    schema: ArcLock<Schema>,
-    data: ArcLock<Vec<RecordBatch>>,
+    schema: Lock<Schema>,
+    data: Lock<Vec<RecordBatch>>,
+    count: Lock<usize>,
 }
 
 impl Collection {
     pub fn new() -> Self {
-        let schema = Arc::new(RwLock::new(Schema::empty()));
-        let data = Arc::new(RwLock::new(vec![]));
-        Self { schema, data }
+        let schema = Lock::new(Schema::empty());
+        let data = Lock::new(vec![]);
+        let count = Lock::new(0);
+        Self { schema, data, count }
+    }
+
+    pub fn add_fields(&self, fields: impl Into<Fields>) -> Result<(), Error> {
+        // Create a new schema with the new field.
+        let mut schema = self.schema.write()?;
+        let schemas = vec![schema.clone(), Schema::new(fields)];
+        let new_schema = Schema::try_merge(schemas)?;
+
+        // Migrate the data to the new schema.
+        let migrate_data = |batch: &RecordBatch| {
+            let schema = Arc::new(new_schema.clone());
+
+            // We can unwrap here because the new schema is guaranteed
+            // to be a superset of the old schema.
+            batch.clone().with_schema(schema).unwrap()
+        };
+
+        let mut data = self.data.write()?;
+        let migrated_data = data.par_iter().map(migrate_data).collect();
+
+        // Update the schema and data.
+        *schema = new_schema;
+        *data = migrated_data;
+
+        Ok(())
+    }
+
+    pub fn count(&self) -> usize {
+        *self.count.read().unwrap()
+    }
+
+    pub fn schema(&self) -> Result<Schema, Error> {
+        let schema = self.schema.read()?;
+        Ok(schema.clone())
     }
 }
diff --git a/src/db/mod.rs b/src/db/mod.rs
index 4e205cca..ed37b173 100644
--- a/src/db/mod.rs
+++ b/src/db/mod.rs
@@ -1,3 +1,4 @@
-use crate::types::error::{Error, ErrorCode};
+use crate::types::error::Error;
+use rayon::prelude::*;
 
-mod collection;
+pub mod collection;
diff --git a/src/main.rs b/src/main.rs
index 6f61b5c7..06fc07ca 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,6 +1,11 @@
 #![warn(unused_qualifications)]
+// TODO: Remove this line when the code is ready
+#![allow(dead_code)]
 
 mod db;
 mod types;
 
+#[cfg(test)]
+mod tests;
+
 fn main() {}
diff --git a/src/tests/mod.rs b/src/tests/mod.rs
new file mode 100644
index 00000000..01d0a8bd
--- /dev/null
+++ b/src/tests/mod.rs
@@ -0,0 +1,3 @@
+use crate::types::error::Error;
+
+mod test_collection;
diff --git a/src/tests/test_collection.rs b/src/tests/test_collection.rs
new file mode 100644
index 00000000..681533c0
--- /dev/null
+++ b/src/tests/test_collection.rs
@@ -0,0 +1,22 @@
+use super::*;
+use crate::db::collection::Collection;
+use arrow::datatypes::{DataType, Field};
+
+#[test]
+fn test_collection_new() {
+    let collection = Collection::new();
+    assert_eq!(collection.count(), 0);
+}
+
+#[test]
+fn test_collection_add_field() -> Result<(), Error> {
+    let collection = Collection::new();
+    let field = Field::new("id", DataType::Utf8, false);
+    collection.add_fields(vec![field])?;
+
+    let schema = collection.schema()?;
+    assert_eq!(schema.fields().len(), 1);
+    assert_eq!(schema.field(0).name(), "id");
+
+    Ok(())
+}

From 2929489680cbdfe42cf8de3fbd3b8f268f279d54 Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Sat, 22 Jun 2024 14:44:07 -0500
Subject: [PATCH 10/88] feat: add third-party lib resources in contributing

---
 docs/contributing.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/contributing.md
b/docs/contributing.md index 9261bb0e..bfbe9e79 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -37,6 +37,11 @@ Getting started with OasysDB development is pretty straightforward. First, you will need to have Rust installed on your machine. We recommend using [rustup](https://www.rust-lang.org/tools/install) to install Rust. We also recommend having rust-analyzer installed for your code editor for a better development experience. +OasysDB utilizes many third-party crates to provide its functionality. These are some of the most important ones and the resources you can use to learn more about them: + +- [**Apache Arrow**](https://arrow.apache.org): Arrow is a cross-language development platform for in-memory columnar data format for efficient analytic operations. +- [**Rayon**](https://github.com/rayon-rs/rayon): Rayon is a data parallelism library for Rust that provides a simple and efficient API for parallelizing computation. + TODO: Complete the getting started guide. ## Style guide From 32c9f83a5b511c62246bd6b623bf84392d2a28b0 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Sat, 22 Jun 2024 15:03:59 -0500 Subject: [PATCH 11/88] feat: add tonic rpc framework --- Cargo.lock | 886 ++++++++++++++++++++++++++++++++++++++++++- Cargo.toml | 11 + build.rs | 4 + docs/contributing.md | 1 + protos/oasysdb.proto | 2 + 5 files changed, 889 insertions(+), 15 deletions(-) create mode 100644 build.rs create mode 100644 protos/oasysdb.proto diff --git a/Cargo.lock b/Cargo.lock index 2765b700..48f395d2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,21 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "addr2line" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e4503c46a5c0c7844e948c9a4d6acd9f50cccb4de1c48eb9e291ea17470c678" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + [[package]] name = "ahash" version = "0.8.11" @@ -40,6 +55,12 @@ dependencies = [ "libc", ] +[[package]] +name = "anyhow" +version = "1.0.86" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da" + [[package]] name = "arrow" version = "52.0.0" @@ -88,7 +109,7 @@ dependencies = [ "arrow-schema", "chrono", "half", - "hashbrown", + "hashbrown 0.14.5", "num", ] @@ -115,7 +136,7 @@ dependencies = [ "arrow-schema", "arrow-select", "atoi", - "base64", + "base64 0.22.1", "chrono", "half", "lexical-core", @@ -181,7 +202,7 @@ dependencies = [ "arrow-schema", "chrono", "half", - "indexmap", + "indexmap 2.2.6", "lexical-core", "num", "serde", @@ -215,7 +236,7 @@ dependencies = [ "arrow-data", "arrow-schema", "half", - "hashbrown", + "hashbrown 0.14.5", ] [[package]] @@ -255,6 +276,39 @@ dependencies = [ "regex-syntax", ] +[[package]] +name = "async-stream" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd56dd203fef61ac097dd65721a419ddccb106b2d2b70ba60a6b529f03961a51" +dependencies = [ + "async-stream-impl", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = 
"async-trait" +version = "0.1.80" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6fa2087f2753a7da8cc1c0dbfcf89579dd57458e36769de5ac750b4671737ca" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "atoi" version = "2.0.0" @@ -270,6 +324,72 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" +[[package]] +name = "axum" +version = "0.6.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b829e4e32b91e643de6eafe82b1d90675f5874230191a4ffbc1b336dec4d6bf" +dependencies = [ + "async-trait", + "axum-core", + "bitflags 1.3.2", + "bytes", + "futures-util", + "http", + "http-body", + "hyper", + "itoa", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "rustversion", + "serde", + "sync_wrapper", + "tower", + "tower-layer", + "tower-service", +] + +[[package]] +name = "axum-core" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "759fa577a247914fd3f7f76d62972792636412fbfd634cd452f6a385a74d2d2c" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http", + "http-body", + "mime", + "rustversion", + "tower-layer", + "tower-service", +] + +[[package]] +name = "backtrace" +version = "0.3.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cc23269a4f8976d0a4d2e7109211a419fe30e8d88d677cd60b6bc79c5732e0a" +dependencies = [ + "addr2line", + "cc", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + +[[package]] +name = "base64" +version = "0.21.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" + [[package]] name = "base64" version = "0.22.1" @@ -282,6 +402,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bitflags" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" + [[package]] name = "bumpalo" version = "3.16.0" @@ -315,7 +441,7 @@ dependencies = [ "android-tzdata", "iana-time-zone", "num-traits", - "windows-targets", + "windows-targets 0.52.5", ] [[package]] @@ -408,16 +534,83 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" +[[package]] +name = "errno" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "fastrand" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" + +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + [[package]] name = "flatbuffers" version = "24.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8add37afff2d4ffa83bc748a70b4b1370984f6980768554182424ef71447c35f" dependencies = [ - "bitflags", + "bitflags 1.3.2", 
"rustc_version", ] +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "futures-channel" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +dependencies = [ + "futures-core", +] + +[[package]] +name = "futures-core" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" + +[[package]] +name = "futures-sink" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" + +[[package]] +name = "futures-task" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" + +[[package]] +name = "futures-util" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" +dependencies = [ + "futures-core", + "futures-task", + "pin-project-lite", + "pin-utils", +] + [[package]] name = "getrandom" version = "0.2.15" @@ -429,6 +622,31 @@ dependencies = [ "wasi", ] +[[package]] +name = "gimli" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd" + +[[package]] +name = "h2" +version = "0.3.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http", + "indexmap 2.2.6", + "slab", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "half" version = "2.4.1" @@ -440,12 +658,100 @@ dependencies = [ "num-traits", ] +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + [[package]] name = "hashbrown" version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "hermit-abi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" + +[[package]] +name = "http" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" +dependencies = [ + "bytes", + "http", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9" + +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + +[[package]] +name = "hyper" +version = "0.14.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f361cde2f109281a220d4307746cdfd5ee3f410da58a70377762396775634b33" +dependencies = [ + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "socket2", + "tokio", + "tower-service", + "tracing", + "want", +] + +[[package]] +name = "hyper-timeout" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" +dependencies = [ + "hyper", + "pin-project-lite", + "tokio", + "tokio-io-timeout", +] + [[package]] name = "iana-time-zone" version = "0.1.60" @@ -469,6 +775,16 @@ dependencies = [ "cc", ] +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown 0.12.3", +] + [[package]] name = "indexmap" version = "2.2.6" @@ -476,7 +792,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" dependencies = [ "equivalent", - "hashbrown", + "hashbrown 0.14.5", +] + +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", ] [[package]] @@ -576,18 +901,62 @@ version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" +[[package]] +name = "linux-raw-sys" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" + [[package]] name = "log" version = "0.4.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" +[[package]] +name = "matchit" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" + [[package]] name = "memchr" version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "miniz_oxide" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08" +dependencies = [ + "adler", +] + +[[package]] +name = "mio" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" +dependencies = [ + "libc", + "wasi", + "windows-sys 0.48.0", +] + +[[package]] +name = 
"multimap" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "defc4c55412d89136f966bbb339008b474350e5e6e78d2714439c386b3137a03" + [[package]] name = "num" version = "0.4.3" @@ -662,12 +1031,35 @@ dependencies = [ "libm", ] +[[package]] +name = "num_cpus" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +dependencies = [ + "hermit-abi", + "libc", +] + [[package]] name = "oasysdb" version = "0.7.0" dependencies = [ "arrow", + "prost", "rayon", + "tokio", + "tonic", + "tonic-build", +] + +[[package]] +name = "object" +version = "0.36.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "576dfe1fc8f9df304abb159d767a29d0476f7750fbf8aa7ad07816004a207434" +dependencies = [ + "memchr", ] [[package]] @@ -676,6 +1068,70 @@ version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + +[[package]] +name = "petgraph" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" +dependencies = [ + "fixedbitset", + "indexmap 2.2.6", +] + +[[package]] +name = "pin-project" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6bf43b791c5b9e34c3d182969b4abb522f9343702850a2e57f460d00d09b4b3" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "ppv-lite86" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" + +[[package]] +name = "prettyplease" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f12335488a2f3b0a83b14edad48dca9879ce89b2edd10e80237e4e852dd645e" +dependencies = [ + "proc-macro2", + "syn", +] + [[package]] name = "proc-macro2" version = "1.0.86" @@ -685,6 +1141,59 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "prost" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "deb1435c188b76130da55f17a466d252ff7b1418b2ad3e037d127b94e3411f29" +dependencies = [ + "bytes", + "prost-derive", +] + +[[package]] +name = "prost-build" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4" +dependencies = [ + "bytes", + "heck", + "itertools", + "log", + "multimap", + 
"once_cell", + "petgraph", + "prettyplease", + "prost", + "prost-types", + "regex", + "syn", + "tempfile", +] + +[[package]] +name = "prost-derive" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81bddcdb20abf9501610992b6759a4c888aef7d1a7247ef75e2404275ac24af1" +dependencies = [ + "anyhow", + "itertools", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "prost-types" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9091c90b0a32608e984ff2fa4091273cbdd755d54935c51d520887f4a1dbd5b0" +dependencies = [ + "prost", +] + [[package]] name = "quote" version = "1.0.36" @@ -694,6 +1203,36 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + [[package]] name = "rayon" version = "1.10.0" @@ -743,6 +1282,12 @@ version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" +[[package]] +name = "rustc-demangle" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" + [[package]] name = "rustc_version" version = "0.4.0" @@ -752,6 +1297,25 @@ dependencies = [ "semver", ] +[[package]] +name = "rustix" +version = "0.38.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" +dependencies = [ + "bitflags 2.5.0", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.52.0", +] + +[[package]] +name = "rustversion" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6" + [[package]] name = "ryu" version = "1.0.18" @@ -795,6 +1359,25 @@ dependencies = [ "serde", ] +[[package]] +name = "slab" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" +dependencies = [ + "autocfg", +] + +[[package]] +name = "socket2" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + [[package]] name = "static_assertions" version = "1.1.0" @@ -812,6 +1395,24 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "sync_wrapper" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" + +[[package]] +name = "tempfile" +version = "3.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" +dependencies = [ + "cfg-if", + "fastrand", + "rustix", + "windows-sys 0.52.0", +] + [[package]] name = "tiny-keccak" version = "2.0.2" @@ -821,6 +1422,177 @@ dependencies = [ "crunchy", ] +[[package]] +name = "tokio" +version = "1.38.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a" +dependencies = [ + "backtrace", + "bytes", + "libc", + "mio", + "num_cpus", + "pin-project-lite", + "socket2", + "tokio-macros", + "windows-sys 0.48.0", +] + +[[package]] +name = "tokio-io-timeout" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30b74022ada614a1b4834de765f9bb43877f910cc8ce4be40e89042c9223a8bf" +dependencies = [ + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-macros" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio-stream" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "267ac89e0bec6e691e5813911606935d77c476ff49024f98abcea3e7b15e37af" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.7.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cf6b47b3771c49ac75ad09a6162f53ad4b8088b76ac60e8ec1455b31a189fe1" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tonic" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76c4eb7a4e9ef9d4763600161f12f5070b92a578e1b634db88a6887844c91a13" +dependencies = [ + "async-stream", + "async-trait", + "axum", + "base64 0.21.7", + "bytes", + "h2", + "http", + "http-body", + "hyper", + "hyper-timeout", + "percent-encoding", + "pin-project", + "prost", + "tokio", + "tokio-stream", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic-build" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4ef6dd70a610078cb4e338a0f79d06bc759ff1b22d2120c2ff02ae264ba9c2" +dependencies = [ + "prettyplease", + "proc-macro2", + "prost-build", + "quote", + "syn", +] + +[[package]] +name = "tower" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" +dependencies = [ + "futures-core", + "futures-util", + "indexmap 1.9.3", + "pin-project", + "pin-project-lite", + "rand", + "slab", + "tokio", + "tokio-util", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower-layer" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0" + +[[package]] +name = "tower-service" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" + +[[package]] +name = "tracing" +version = "0.1.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +dependencies = [ + "pin-project-lite", + 
"tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing-core" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +dependencies = [ + "once_cell", +] + +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + [[package]] name = "unicode-ident" version = "1.0.12" @@ -833,6 +1605,15 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -899,7 +1680,40 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" dependencies = [ - "windows-targets", + "windows-targets 0.52.5", +] + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.5", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", ] [[package]] @@ -908,28 +1722,46 @@ version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", + "windows_aarch64_gnullvm 0.52.5", + "windows_aarch64_msvc 0.52.5", + "windows_i686_gnu 0.52.5", "windows_i686_gnullvm", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_i686_msvc 0.52.5", + "windows_x86_64_gnu 0.52.5", + "windows_x86_64_gnullvm 0.52.5", + "windows_x86_64_msvc 0.52.5", ] +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + [[package]] name = "windows_aarch64_gnullvm" version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" 
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
+
 [[package]]
 name = "windows_aarch64_msvc"
 version = "0.52.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6"
 
+[[package]]
+name = "windows_i686_gnu"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
+
 [[package]]
 name = "windows_i686_gnu"
 version = "0.52.5"
@@ -942,24 +1774,48 @@ version = "0.52.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9"
 
+[[package]]
+name = "windows_i686_msvc"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
+
 [[package]]
 name = "windows_i686_msvc"
 version = "0.52.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf"
 
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
+
 [[package]]
 name = "windows_x86_64_gnu"
 version = "0.52.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9"
 
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
+
 [[package]]
 name = "windows_x86_64_gnullvm"
 version = "0.52.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596"
 
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.48.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
+
 [[package]]
 name = "windows_x86_64_msvc"
 version = "0.52.5"
diff --git a/Cargo.toml b/Cargo.toml
index e5522bb3..70a73ef3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,10 +15,21 @@ repository = "https://github.com/oasysai/oasysdb"
 keywords = ["vector", "database", "anns", "search", "simd"]
 categories = ["database", "algorithms", "data-structures"]
 
+[dependencies.tokio]
+version = "1.38.0"
+features = ["macros", "rt-multi-thread"]
+
 [dependencies]
 arrow = "52.0.0"
 rayon = "1.10.0"
 
+# gRPC stuff.
+prost = "0.12.6"
+tonic = "0.11.0"
+
+[build-dependencies]
+tonic-build = "0.11.0"
+
 [dev-dependencies]
 
 [profile.release]
diff --git a/build.rs b/build.rs
new file mode 100644
index 00000000..559a2de0
--- /dev/null
+++ b/build.rs
@@ -0,0 +1,4 @@
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    tonic_build::compile_protos("protos/oasysdb.proto")?;
+    Ok(())
+}
diff --git a/docs/contributing.md b/docs/contributing.md
index bfbe9e79..8c16ec57 100644
--- a/docs/contributing.md
+++ b/docs/contributing.md
@@ -41,6 +41,7 @@ OasysDB utilizes many third-party crates to provide its functionality. These are
 
 - [**Apache Arrow**](https://arrow.apache.org): Arrow is a cross-language development platform for in-memory columnar data format for efficient analytic operations.
 - [**Rayon**](https://github.com/rayon-rs/rayon): Rayon is a data parallelism library for Rust that provides a simple and efficient API for parallelizing computation.
+- [**Tonic**](https://github.com/hyperium/tonic): Tonic is a gRPC over HTTP/2 implementation focused on high performance and flexibility built on top of the Tokio asynchronous runtime.
 
 TODO: Complete the getting started guide.
diff --git a/protos/oasysdb.proto b/protos/oasysdb.proto
new file mode 100644
index 00000000..ac1c1cfa
--- /dev/null
+++ b/protos/oasysdb.proto
@@ -0,0 +1,2 @@
+syntax = "proto3";
+package oasysdb;
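This patch only wires the build: `build.rs` invokes `tonic_build` at compile time, which emits a Rust module per `.proto` package into `OUT_DIR`. A sketch of how that generated module is typically pulled in, assuming the `oasysdb` package declared above (the next patch does this for real with a `database` package instead):

```rust
// Hypothetical consumer of the generated code. For a proto package named
// "oasysdb", include_proto! expands to roughly:
//   include!(concat!(env!("OUT_DIR"), "/oasysdb.rs"));
pub mod proto {
    tonic::include_proto!("oasysdb");
}
```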
From 5db179b41419232e7d8679aedfd821c46b511de2 Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Sat, 22 Jun 2024 17:22:54 -0500
Subject: [PATCH 12/88] feat(wip): add database grpc

---
 build.rs              |  2 +-
 protos/database.proto | 13 +++++++++++++
 protos/oasysdb.proto  |  2 --
 src/db/database.rs    | 15 +++++++++++++++
 src/db/mod.rs         |  2 ++
 src/main.rs           | 22 ++++++++++++++++++--
 src/proto.rs          |  1 +
 7 files changed, 52 insertions(+), 5 deletions(-)
 create mode 100644 protos/database.proto
 delete mode 100644 protos/oasysdb.proto
 create mode 100644 src/db/database.rs
 create mode 100644 src/proto.rs

diff --git a/build.rs b/build.rs
index 559a2de0..5e0f7ef3 100644
--- a/build.rs
+++ b/build.rs
@@ -1,4 +1,4 @@
 fn main() -> Result<(), Box<dyn std::error::Error>> {
-    tonic_build::compile_protos("protos/oasysdb.proto")?;
+    tonic_build::compile_protos("protos/database.proto")?;
     Ok(())
 }
diff --git a/protos/database.proto b/protos/database.proto
new file mode 100644
index 00000000..1fdfe312
--- /dev/null
+++ b/protos/database.proto
@@ -0,0 +1,13 @@
+syntax = "proto3";
+package database;
+
+import "google/protobuf/empty.proto";
+
+service Database {
+  rpc CreateCollection(CreateCollectionRequest) returns (google.protobuf.Empty);
+}
+
+// region CreateCollection
+  message CreateCollectionRequest {
+    string name = 1;
+  }
diff --git a/protos/oasysdb.proto b/protos/oasysdb.proto
deleted file mode 100644
index ac1c1cfa..00000000
--- a/protos/oasysdb.proto
+++ /dev/null
@@ -1,2 +0,0 @@
-syntax = "proto3";
-package oasysdb;
diff --git a/src/db/database.rs b/src/db/database.rs
new file mode 100644
index 00000000..c3338a9e
--- /dev/null
+++ b/src/db/database.rs
@@ -0,0 +1,15 @@
+use super::*;
+use crate::proto::database_server::Database as ProtoDatabase;
+use crate::proto::CreateCollectionRequest;
+
+pub struct Database {}
+
+#[tonic::async_trait]
+impl ProtoDatabase for Database {
+    async fn create_collection(
+        &self,
+        request: Request<CreateCollectionRequest>,
+    ) -> Result<Response<()>, Status> {
+        unimplemented!();
+    }
+}
diff --git a/src/db/mod.rs b/src/db/mod.rs
index ed37b173..99e38d5f 100644
--- a/src/db/mod.rs
+++ b/src/db/mod.rs
@@ -1,4 +1,6 @@
 use crate::types::error::Error;
 use rayon::prelude::*;
+use tonic::{Request, Response, Status};
 
 pub mod collection;
+pub mod database;
diff --git a/src/main.rs b/src/main.rs
index 06fc07ca..d9a98b11 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,11 +1,29 @@
-#![warn(unused_qualifications)]
 // TODO: Remove this line when the code is ready
 #![allow(dead_code)]
 
 mod db;
+mod proto;
 mod types;
 
 #[cfg(test)]
 mod tests;
 
-fn main() {}
+use db::database::Database;
+use proto::database_server::DatabaseServer;
+use tonic::transport::Server;
+
+const HOST: &str = "0.0.0.0";
+const PORT: u16 = 2525;
+
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let addr = format!("{HOST}:{PORT}").parse()?;
+    let database = Database {};
+
+    Server::builder()
+        .add_service(DatabaseServer::new(database))
+        .serve(addr)
+        .await?;
+
+    Ok(())
+}
diff --git a/src/proto.rs b/src/proto.rs
new file mode 100644
index 00000000..3b00fad1
--- /dev/null
+++ b/src/proto.rs
@@ -0,0 +1 @@
+tonic::include_proto!("database");
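For context, tonic generates a client module (`database_client`) from the same proto as the `database_server` module used above, so calling the new service could look like the sketch below. This is a hedged illustration, not part of the patches: the server still answers `unimplemented!()`, and prost maps `google.protobuf.Empty` to `()` by default.

```rust
use proto::database_client::DatabaseClient;
use proto::CreateCollectionRequest;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // 2525 matches the PORT constant in src/main.rs above.
    let mut client = DatabaseClient::connect("http://127.0.0.1:2525").await?;
    let request = CreateCollectionRequest { name: "my-collection".into() };
    client.create_collection(request).await?; // Ok(Response<()>) on success.
    Ok(())
}
```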
From 2deba851edf8accccc7a339d51d5c5d7adbd7b9b Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Sat, 22 Jun 2024 23:48:00 -0500
Subject: [PATCH 13/88] ci: install protobuf compiler in steps

---
 .github/workflows/server-ci.yml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/.github/workflows/server-ci.yml b/.github/workflows/server-ci.yml
index 910a7856..b1b84cae 100644
--- a/.github/workflows/server-ci.yml
+++ b/.github/workflows/server-ci.yml
@@ -43,6 +43,11 @@ jobs:
         with:
           components: clippy
 
+      - name: Install Protobuf compiler
+        run: |
+          sudo apt update && sudo apt upgrade -y
+          sudo apt install -y protobuf-compiler libprotobuf-dev
+
       - name: Run cargo clippy
         run: cargo clippy -- -D warnings
 
@@ -57,5 +62,10 @@ jobs:
       - name: Install Rust toolchain
         uses: dtolnay/rust-toolchain@stable
 
+      - name: Install Protobuf compiler
+        run: |
+          sudo apt update && sudo apt upgrade -y
+          sudo apt install -y protobuf-compiler libprotobuf-dev
+
       - name: Run cargo test
         run: cargo test --all-features -- --test-threads 1

From 0ac21ae148d07f4559f92998936374d54f96ec0b Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Sun, 23 Jun 2024 13:42:37 -0500
Subject: [PATCH 14/88] feat: add blog to docs

---
 docs/blog/.authors.yml                 |  5 +++
 docs/blog/index.md                     |  3 ++
 docs/blog/posts/overhauling_oasysdb.md | 42 ++++++++++++++++++++++++++
 mkdocs.yml                             | 11 +++++++
 4 files changed, 61 insertions(+)
 create mode 100644 docs/blog/.authors.yml
 create mode 100644 docs/blog/index.md
 create mode 100644 docs/blog/posts/overhauling_oasysdb.md

diff --git a/docs/blog/.authors.yml b/docs/blog/.authors.yml
new file mode 100644
index 00000000..d3c6b881
--- /dev/null
+++ b/docs/blog/.authors.yml
@@ -0,0 +1,5 @@
+authors:
+  edwinkys:
+    name: Edwin Kys
+    description: Author of OasysDB
+    avatar: https://avatars.githubusercontent.com/u/51223060?v=4
diff --git a/docs/blog/index.md b/docs/blog/index.md
new file mode 100644
index 00000000..c517ef04
--- /dev/null
+++ b/docs/blog/index.md
@@ -0,0 +1,3 @@
+# Latest Posts
+
+Bite-sized blog posts about generative AI, machine learning, and more.
diff --git a/docs/blog/posts/overhauling_oasysdb.md b/docs/blog/posts/overhauling_oasysdb.md
new file mode 100644
index 00000000..963427e7
--- /dev/null
+++ b/docs/blog/posts/overhauling_oasysdb.md
@@ -0,0 +1,42 @@
+---
+date: 2024-06-22
+
+authors:
+  - edwinkys
+
+categories:
+  - Log
+---
+
+# DevLog #1: OasysDB Overhaul
+
+OasysDB is a project that I started in January of this year, and honestly, it has been an incredible learning experience. Through it, I've gained extensive experience with databases, machine learning algorithms, and low-level programming concepts. But with that knowledge came the realization that the current design of OasysDB is not enough for production use.
+
+<!-- more -->
+
+After careful consideration, I've decided to rewrite OasysDB from the ground up. The new version will be designed to incorporate all the essential features needed for a production-ready vector database system.
+
+This includes, but is not limited to:
+
+- Transitioning from an embedded to a client-server model for better scalability and isolation.
+- Designing an efficient storage engine tailored for analytical production workloads.
+- Implementing concurrent query processing to improve throughput and reduce latency. +- Utilizing advanced vector indexing algorithms for enhanced recall performance, especially in hybrid search scenarios. +- Incorporating an industry-standard query planner and optimizer to enhance query performance. +- Enhancing documentation and testing to ensure the system's robustness and reliability. + +Here's a high-level overview of the new architecture: + +![OasysDB Architecture](https://i.postimg.cc/QdVVSs3M/Infrastructure.png) + +## Progress Update + +Today, I started working on the new version of OasysDB. I've established the project structure, implemented the foundational data structures for the collection and storage engine, and set up the initial framework for client-server communication. + +I will be posting regular updates (once or twice a week) on my progress, which may include in-depth explorations of the system's technical aspects. If you want to follow along with the development process, you can find the project on GitHub: [OasysDB](https://github.com/oasysai/oasysdb). + +## Conclusion + +I'm really excited about the potential of the new OasysDB and the challenges that lie ahead. I believe this overhaul will lead to a robust and scalable vector database system perfect for a wide range of AI applications. + +If you're into databases and AI, I encourage you to follow along with the development process as I share my insights, challenges, and victories in this DevLog series. If you have experience in this field, your feedback and suggestions would be greatly appreciated. diff --git a/mkdocs.yml b/mkdocs.yml index 1c9b675c..44738310 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -67,6 +67,9 @@ nav: - Migrations: - 0.4.5 to 0.5.0: migrations/0.4.5_to_0.5.0.md + - Blog: + - blog/index.md + markdown_extensions: - attr_list - md_in_html @@ -84,6 +87,14 @@ markdown_extensions: - toc: permalink: "#" +plugins: + - blog: + post_readtime: true + post_excerpt: required + authors: true + categories_allowed: + - Log + exclude_docs: | pull_request_template.md security.md From 21a315a47ebb08ea4dfe866f1d990bd1c94a8cd5 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Sun, 23 Jun 2024 23:06:24 -0500 Subject: [PATCH 15/88] feat: add ioerror interop --- src/types/error.rs | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/types/error.rs b/src/types/error.rs index efeede95..55ad7780 100644 --- a/src/types/error.rs +++ b/src/types/error.rs @@ -3,13 +3,16 @@ use std::fmt::{Display, Formatter, Result}; // Other error types. 
use arrow::error::ArrowError; use std::error::Error as StandardError; +use std::io::Error as IOError; use std::sync::PoisonError; +#[allow(clippy::enum_variant_names)] #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ErrorCode { - Standard, - Concurrency, - Arrow, + StandardError, + FileError, + ConcurrencyError, + ArrowError, } #[derive(Debug)] @@ -38,21 +41,28 @@ impl StandardError for Error {} impl From<Box<dyn StandardError>> for Error { fn from(err: Box<dyn StandardError>) -> Self { - let code = ErrorCode::Standard; + let code = ErrorCode::StandardError; Error::new(&code, &err.to_string()) } } impl<T> From<PoisonError<T>> for Error { fn from(err: PoisonError<T>) -> Self { - let code = ErrorCode::Concurrency; + let code = ErrorCode::ConcurrencyError; Error::new(&code, &err.to_string()) } } +impl From<IOError> for Error { + fn from(err: IOError) -> Self { + let code = ErrorCode::FileError; + Error::new(&code, &err.to_string()) + } +} + impl From<ArrowError> for Error { fn from(err: ArrowError) -> Self { - let code = ErrorCode::Arrow; + let code = ErrorCode::ArrowError; Error::new(&code, &err.to_string()) } } From 8cb48861a897a6a567e43c064ab341101386d430 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Sun, 23 Jun 2024 23:08:38 -0500 Subject: [PATCH 16/88] feat: add bincode error compatibility --- src/types/error.rs | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/types/error.rs b/src/types/error.rs index 55ad7780..e2e61973 100644 --- a/src/types/error.rs +++ b/src/types/error.rs @@ -2,6 +2,7 @@ use std::fmt::{Display, Formatter, Result}; // Other error types. use arrow::error::ArrowError; +use bincode::ErrorKind as BincodeError; use std::error::Error as StandardError; use std::io::Error as IOError; use std::sync::PoisonError; @@ -9,10 +10,11 @@ use std::sync::PoisonError; #[allow(clippy::enum_variant_names)] #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ErrorCode { - StandardError, - FileError, - ConcurrencyError, - ArrowError, + ArrowError, + ConcurrencyError, + FileError, + SerializationError, + StandardError, } #[derive(Debug)] @@ -66,3 +68,10 @@ impl From<ArrowError> for Error { Error::new(&code, &err.to_string()) } } + +impl From<Box<BincodeError>> for Error { + fn from(err: Box<BincodeError>) -> Self { + let code = ErrorCode::SerializationError; + Error::new(&code, &err.to_string()) + } +} From ca6e53f28aed1db46563592a9d689304525ad542 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Sun, 23 Jun 2024 23:08:57 -0500 Subject: [PATCH 17/88] build: add binary serialization crates --- Cargo.lock | 11 +++++++++++ Cargo.toml | 6 ++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 48f395d2..18abd562 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -396,6 +396,15 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -1046,8 +1055,10 @@ name = "oasysdb" version = "0.7.0" dependencies = [ "arrow", + "bincode", "prost", "rayon", + "serde", "tokio", "tonic", "tonic-build", ] diff --git a/Cargo.toml b/Cargo.toml index 70a73ef3..9023a0ad 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,11 +27,13 @@ rayon = "1.10.0" prost = "0.12.6" tonic = "0.11.0" +# Serialization.
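+# serde supplies the Serialize/Deserialize traits, while bincode
+# provides the compact binary encoding used for the state files.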
+serde = "1.0.203" +bincode = "1.3.3" + [build-dependencies] tonic-build = "0.11.0" -[dev-dependencies] - [profile.release] lto = true opt-level = "z" From ed94cf7e038d51bbe9e423be3a1d4c7b73c122f8 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Mon, 24 Jun 2024 13:54:03 -0500 Subject: [PATCH 18/88] feat: implement initial database state --- Cargo.toml | 2 +- src/db/database.rs | 79 +++++++++++++++++++++++++++++++++++++++++++++- src/db/mod.rs | 5 +++ src/main.rs | 5 ++- 4 files changed, 88 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 9023a0ad..2ed44b00 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,7 +28,7 @@ prost = "0.12.6" tonic = "0.11.0" # Serialization. -serde = "1.0.203" +serde = { version = "1.0.203", features = ["derive"] } bincode = "1.3.3" [build-dependencies] diff --git a/src/db/database.rs b/src/db/database.rs index c3338a9e..2f658d07 100644 --- a/src/db/database.rs +++ b/src/db/database.rs @@ -1,8 +1,85 @@ use super::*; use crate::proto::database_server::Database as ProtoDatabase; use crate::proto::CreateCollectionRequest; +use serde::de::DeserializeOwned; -pub struct Database {} +// Database sub-directory structure. +const COLLECTIONS_DIR: &str = "collections"; +const INDICES_DIR: &str = "indices"; +const TMP_DIR: &str = "tmp"; +const SUBDIRS: [&str; 3] = [COLLECTIONS_DIR, INDICES_DIR, TMP_DIR]; + +// This is where the serialized database states are stored. +const STATE_FILE: &str = "state"; + +// Type aliases for improved readability. +type CollectionName = String; +type CollectionPath = PathBuf; + +#[derive(Debug, Serialize, Deserialize)] +pub struct DatabaseState { + collection_refs: HashMap, +} + +impl Default for DatabaseState { + fn default() -> Self { + Self { collection_refs: HashMap::new() } + } +} + +pub struct Database { + directory: PathBuf, + state: DatabaseState, +} + +impl Database { + pub fn open(directory: PathBuf) -> Result { + if !directory.try_exists()? { + Self::initialize_directory(&directory)?; + } + + let state = DatabaseState::default(); + let mut db = Self { directory, state }; + + db.restore_states()?; + Ok(db) + } + + fn initialize_directory(directory: &PathBuf) -> Result<(), Error> { + // Create the parent directory of the database. + fs::create_dir_all(directory)?; + + // Create the subdirectories for the database. + for subdir in SUBDIRS { + let subdir_path = directory.join(subdir); + fs::create_dir(&subdir_path)?; + } + + Ok(()) + } + + fn restore_states(&mut self) -> Result<(), Error> { + let state_file = self.directory.join(STATE_FILE); + + // If there are no state file, return early. + // This is not an error, as the database may be new. + if !state_file.try_exists()? { + return Ok(()); + } + + // Restore the database states. 
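+ // The file holds a bincode-encoded snapshot of the DatabaseState struct.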
+ self.state = Self::deserialize_binary_file(&state_file)?; + Ok(()) + } + + fn deserialize_binary_file( + path: &PathBuf, + ) -> Result { + let file = OpenOptions::new().read(true).open(path)?; + let reader = BufReader::new(file); + bincode::deserialize_from(reader).map_err(Into::into) + } +} #[tonic::async_trait] impl ProtoDatabase for Database { diff --git a/src/db/mod.rs b/src/db/mod.rs index 99e38d5f..bed5ce69 100644 --- a/src/db/mod.rs +++ b/src/db/mod.rs @@ -1,5 +1,10 @@ use crate::types::error::Error; use rayon::prelude::*; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::fs::{self, OpenOptions}; +use std::io::BufReader; +use std::path::PathBuf; use tonic::{Request, Response, Status}; pub mod collection; diff --git a/src/main.rs b/src/main.rs index d9a98b11..aff0e0d0 100644 --- a/src/main.rs +++ b/src/main.rs @@ -10,6 +10,7 @@ mod tests; use db::database::Database; use proto::database_server::DatabaseServer; +use std::path::PathBuf; use tonic::transport::Server; const HOST: &str = "0.0.0.0"; @@ -18,7 +19,9 @@ const PORT: u16 = 2525; #[tokio::main] async fn main() -> Result<(), Box> { let addr = format!("{HOST}:{PORT}").parse()?; - let database = Database {}; + + let path = PathBuf::from("/tmp/oasysdb"); + let database = Database::open(path)?; Server::builder() .add_service(DatabaseServer::new(database)) From 188551ae819dbf599212606084b9e586e2bac4c1 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Mon, 24 Jun 2024 16:08:10 -0500 Subject: [PATCH 19/88] feat: add database create collection rpc --- src/db/collection.rs | 1 - src/db/database.rs | 95 ++++++++++++++++++++++++++++++++++---- src/db/mod.rs | 5 +- src/main.rs | 3 -- src/tests/mod.rs | 21 +++++++++ src/tests/test_database.rs | 12 +++++ src/types/error.rs | 16 ++++++- 7 files changed, 136 insertions(+), 17 deletions(-) create mode 100644 src/tests/test_database.rs diff --git a/src/db/collection.rs b/src/db/collection.rs index 92001f26..5f6514b5 100644 --- a/src/db/collection.rs +++ b/src/db/collection.rs @@ -1,7 +1,6 @@ use super::*; use arrow::array::RecordBatch; use arrow::datatypes::{Fields, Schema}; -use std::sync::{Arc, RwLock as Lock}; pub struct Collection { schema: Lock, diff --git a/src/db/database.rs b/src/db/database.rs index 2f658d07..135c2a15 100644 --- a/src/db/database.rs +++ b/src/db/database.rs @@ -10,15 +10,15 @@ const TMP_DIR: &str = "tmp"; const SUBDIRS: [&str; 3] = [COLLECTIONS_DIR, INDICES_DIR, TMP_DIR]; // This is where the serialized database states are stored. -const STATE_FILE: &str = "state"; +const STATE_FILE: &str = "dbstate"; // Type aliases for improved readability. 
type CollectionName = String; type CollectionPath = PathBuf; -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct DatabaseState { - collection_refs: HashMap<CollectionName, CollectionPath>, + pub collection_refs: HashMap<CollectionName, CollectionPath>, } impl Default for DatabaseState { @@ -29,7 +29,7 @@ pub struct Database { directory: PathBuf, - state: DatabaseState, + state: Lock<DatabaseState>, } impl Database { @@ -38,13 +38,23 @@ impl Database { Self::initialize_directory(&directory)?; } - let state = DatabaseState::default(); + let state = Lock::new(DatabaseState::default()); let mut db = Self { directory, state }; - db.restore_states()?; + db.restore_state()?; Ok(db) } + pub fn persist_state(&self) -> Result<(), Error> { + let state = self.state.read()?.clone(); + let state_file = self.directory.join(STATE_FILE); + self.write_binary_file(&state, &state_file) + } + + pub fn state(&self) -> Result<DatabaseState, Error> { + Ok(self.state.read()?.clone()) + } + fn initialize_directory(directory: &PathBuf) -> Result<(), Error> { // Create the parent directory of the database. fs::create_dir_all(directory)?; @@ -58,7 +68,7 @@ impl Database { Ok(()) } - fn restore_states(&mut self) -> Result<(), Error> { + fn restore_state(&mut self) -> Result<(), Error> { let state_file = self.directory.join(STATE_FILE); // If there are no state file, return early. @@ -68,17 +78,79 @@ } // Restore the database states. - self.state = Self::deserialize_binary_file(&state_file)?; + self.state = Self::read_binary_file(&state_file)?; Ok(()) } - fn deserialize_binary_file<T: DeserializeOwned>( + fn read_binary_file<T: DeserializeOwned>( path: &PathBuf, ) -> Result<T, Error> { let file = OpenOptions::new().read(true).open(path)?; let reader = BufReader::new(file); bincode::deserialize_from(reader).map_err(Into::into) } + + fn write_binary_file<T: Serialize>( + &self, + data: &T, + path: &PathBuf, + ) -> Result<(), Error> { + let filename = path.file_name().ok_or_else(|| { + // This error should never happen unless the user tinkers with it. + let code = ErrorCode::FileError; + let message = format!("Invalid file path: {path:?}"); + Error::new(&code, &message) + })?; + + // Write the data to a temporary file first. + // If this fails, the original file will not be overwritten. + let tmp_path = self.directory.join(TMP_DIR).join(filename); + let file = OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(&tmp_path)?; + + let writer = BufWriter::new(file); + bincode::serialize_into(writer, data)?; + + // If the serialization is successful, rename the temporary file. + fs::rename(&tmp_path, path)?; + Ok(()) + } +} + +// This implementation block contains methods used by the gRPC server. +// We do this to make it easier to test the database logic. +impl Database { + pub fn _create_collection(&self, name: &str) -> Result<(), Error> { + let mut state = self.state.write()?; + + // Check if the collection already exists. + if state.collection_refs.contains_key(name) { + let code = ErrorCode::ClientError; + let message = format!("Collection already exists: {name}"); + return Err(Error::new(&code, &message)); + } + + // Create the collection directory. + let collection_dir = self.directory.join(COLLECTIONS_DIR).join(name); + fs::create_dir(&collection_dir)?; + + // Update the database state. + *state = { + let mut _state = state.clone(); + _state.collection_refs.insert(name.to_string(), collection_dir); + _state + }; + + // Drop the lock to prevent deadlocks since + // persist_state also requires the lock.
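+ // (std::sync::RwLock is not reentrant: persist_state takes its own
+ // read lock, which would wait on this write guard forever.)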
+ drop(state); + + self.persist_state()?; + Ok(()) + } } #[tonic::async_trait] @@ -87,6 +159,9 @@ impl ProtoDatabase for Database { &self, request: Request, ) -> Result, Status> { - unimplemented!(); + let request = request.into_inner(); + let name = request.name; + self._create_collection(&name)?; + Ok(Response::new(())) } } diff --git a/src/db/mod.rs b/src/db/mod.rs index bed5ce69..912feb29 100644 --- a/src/db/mod.rs +++ b/src/db/mod.rs @@ -1,10 +1,11 @@ -use crate::types::error::Error; +use crate::types::error::{Error, ErrorCode}; use rayon::prelude::*; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::fs::{self, OpenOptions}; -use std::io::BufReader; +use std::io::{BufReader, BufWriter}; use std::path::PathBuf; +use std::sync::{Arc, RwLock as Lock}; use tonic::{Request, Response, Status}; pub mod collection; diff --git a/src/main.rs b/src/main.rs index aff0e0d0..22322199 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,6 +1,3 @@ -// TODO: Remove this line when the code is ready -#![allow(dead_code)] - mod db; mod proto; mod types; diff --git a/src/tests/mod.rs b/src/tests/mod.rs index 01d0a8bd..b2b95500 100644 --- a/src/tests/mod.rs +++ b/src/tests/mod.rs @@ -1,3 +1,24 @@ +use crate::db::database::Database; use crate::types::error::Error; +use std::fs; +use std::path::PathBuf; mod test_collection; +mod test_database; + +const TEST_DIR: &str = "/tmp/oasysdb"; + +fn create_new_test_database() -> Result { + // Reset the database directory for testing. + let path = PathBuf::from(TEST_DIR); + if path.exists() { + fs::remove_dir_all(&path)?; + } + + // The database should have some subdirectories. + let db = Database::open(path.clone())?; + let subdirs = path.read_dir()?; + assert!(subdirs.count() >= 3); + + Ok(db) +} diff --git a/src/tests/test_database.rs b/src/tests/test_database.rs new file mode 100644 index 00000000..9199b106 --- /dev/null +++ b/src/tests/test_database.rs @@ -0,0 +1,12 @@ +use super::*; + +#[test] +fn test_create_collection() -> Result<(), Error> { + let db = create_new_test_database()?; + db._create_collection("test_collection")?; + + let state = db.state()?; + assert!(state.collection_refs.contains_key("test_collection")); + + Ok(()) +} diff --git a/src/types/error.rs b/src/types/error.rs index e2e61973..91b3a002 100644 --- a/src/types/error.rs +++ b/src/types/error.rs @@ -11,6 +11,7 @@ use std::sync::PoisonError; #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ErrorCode { ArrowError, + ClientError, ConcurrencyError, FileError, SerializationError, @@ -37,7 +38,7 @@ impl Display for Error { } } -// Implement other interoperability to other error types. +// Implement interoperability FROM other external error types. impl StandardError for Error {} @@ -75,3 +76,16 @@ impl From> for Error { Error::new(&code, &err.to_string()) } } + +// Implement interoperability INTO other external error types. 
+ +impl From for tonic::Status { + fn from(err: Error) -> Self { + let code = match err.code { + ErrorCode::ClientError => tonic::Code::InvalidArgument, + _ => tonic::Code::Internal, + }; + + tonic::Status::new(code, err.message) + } +} From f0f92050808e16290084ed3deb409c81a3c31811 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Mon, 24 Jun 2024 16:09:50 -0500 Subject: [PATCH 20/88] fix: improve linting suggestions --- src/db/database.rs | 8 +------- src/main.rs | 2 ++ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/src/db/database.rs b/src/db/database.rs index 135c2a15..e28fe5bb 100644 --- a/src/db/database.rs +++ b/src/db/database.rs @@ -16,17 +16,11 @@ const STATE_FILE: &str = "dbstate"; type CollectionName = String; type CollectionPath = PathBuf; -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, Default)] pub struct DatabaseState { pub collection_refs: HashMap, } -impl Default for DatabaseState { - fn default() -> Self { - Self { collection_refs: HashMap::new() } - } -} - pub struct Database { directory: PathBuf, state: Lock, diff --git a/src/main.rs b/src/main.rs index 22322199..f657d6c4 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,5 @@ +#![allow(dead_code)] + mod db; mod proto; mod types; From 04afa370d0b680087f1c22ec1eb708481faf8de8 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Mon, 24 Jun 2024 16:22:11 -0500 Subject: [PATCH 21/88] feat: move db proto to separate file --- src/db/database.rs | 15 --------------- src/db/database_service.rs | 16 ++++++++++++++++ src/db/mod.rs | 1 + 3 files changed, 17 insertions(+), 15 deletions(-) create mode 100644 src/db/database_service.rs diff --git a/src/db/database.rs b/src/db/database.rs index e28fe5bb..3c92f210 100644 --- a/src/db/database.rs +++ b/src/db/database.rs @@ -1,6 +1,4 @@ use super::*; -use crate::proto::database_server::Database as ProtoDatabase; -use crate::proto::CreateCollectionRequest; use serde::de::DeserializeOwned; // Database sub-directory structure. 
@@ -146,16 +144,3 @@ impl Database { Ok(()) } } - -#[tonic::async_trait] -impl ProtoDatabase for Database { - async fn create_collection( - &self, - request: Request, - ) -> Result, Status> { - let request = request.into_inner(); - let name = request.name; - self._create_collection(&name)?; - Ok(Response::new(())) - } -} diff --git a/src/db/database_service.rs b/src/db/database_service.rs new file mode 100644 index 00000000..85156660 --- /dev/null +++ b/src/db/database_service.rs @@ -0,0 +1,16 @@ +use super::database::Database; +use super::*; +use crate::proto::database_server::Database as ProtoDatabase; +use crate::proto::CreateCollectionRequest; + +#[tonic::async_trait] +impl ProtoDatabase for Database { + async fn create_collection( + &self, + request: Request, + ) -> Result, Status> { + let request = request.into_inner(); + self._create_collection(&request.name)?; + Ok(Response::new(())) + } +} diff --git a/src/db/mod.rs b/src/db/mod.rs index 912feb29..05fd5959 100644 --- a/src/db/mod.rs +++ b/src/db/mod.rs @@ -10,3 +10,4 @@ use tonic::{Request, Response, Status}; pub mod collection; pub mod database; +pub mod database_service; From 70bedfbabd6bf477f5cf8a95554cc310570d2c2f Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Mon, 24 Jun 2024 17:15:28 -0500 Subject: [PATCH 22/88] feat: add collection name validator --- Cargo.lock | 1 + Cargo.toml | 1 + src/db/database.rs | 22 +++++++++++++++++++++- src/tests/test_database.rs | 6 +++--- 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 18abd562..9185d6c3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1058,6 +1058,7 @@ dependencies = [ "bincode", "prost", "rayon", + "regex", "serde", "tokio", "tonic", diff --git a/Cargo.toml b/Cargo.toml index 2ed44b00..1fc42321 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,6 +22,7 @@ features = ["macros", "rt-multi-thread"] [dependencies] arrow = "52.0.0" rayon = "1.10.0" +regex = "1.10.5" # gRPC stuff. prost = "0.12.6" diff --git a/src/db/database.rs b/src/db/database.rs index 3c92f210..da69cf23 100644 --- a/src/db/database.rs +++ b/src/db/database.rs @@ -1,4 +1,5 @@ use super::*; +use regex::Regex; use serde::de::DeserializeOwned; // Database sub-directory structure. @@ -116,9 +117,10 @@ impl Database { // We do this to make it easier to test the database logic. impl Database { pub fn _create_collection(&self, name: &str) -> Result<(), Error> { - let mut state = self.state.write()?; + Self::validate_collection_name(name)?; // Check if the collection already exists. 
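// The write guard below spans both the check and the insert, so
// concurrent create calls cannot race on the same name.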
+ let mut state = self.state.write()?; if state.collection_refs.contains_key(name) { let code = ErrorCode::ClientError; let message = format!("Collection already exists: {name}"); @@ -143,4 +145,22 @@ impl Database { self.persist_state()?; Ok(()) } + + fn validate_collection_name(name: &str) -> Result<(), Error> { + if name.is_empty() { + let code = ErrorCode::ClientError; + let message = "Collection name cannot be empty"; + return Err(Error::new(&code, message)); + } + + let re = Regex::new(r"^[a-z_]+$").unwrap(); + if !re.is_match(name) { + let code = ErrorCode::ClientError; + let message = "Collection name must be lowercase letters \ + with underscores."; + return Err(Error::new(&code, &message)); + } + + Ok(()) + } } diff --git a/src/tests/test_database.rs b/src/tests/test_database.rs index 9199b106..56a1fe08 100644 --- a/src/tests/test_database.rs +++ b/src/tests/test_database.rs @@ -3,10 +3,10 @@ use super::*; #[test] fn test_create_collection() -> Result<(), Error> { let db = create_new_test_database()?; - db._create_collection("test_collection")?; + let name = "collection"; + db._create_collection(name)?; let state = db.state()?; - assert!(state.collection_refs.contains_key("test_collection")); - + assert!(state.collection_refs.contains_key(name)); Ok(()) } From 4f52705009c24272c645dd83a74d5ff3dbf64851 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Mon, 24 Jun 2024 18:18:41 -0500 Subject: [PATCH 23/88] feat: collection name on disk use uuid --- Cargo.lock | 10 ++++++++++ Cargo.toml | 1 + src/db/database.rs | 11 +++++------ 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9185d6c3..5a3eeae3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1063,6 +1063,7 @@ dependencies = [ "tokio", "tonic", "tonic-build", + "uuid", ] [[package]] @@ -1611,6 +1612,15 @@ version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +[[package]] +name = "uuid" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5de17fd2f7da591098415cff336e12965a28061ddace43b59cb3c430179c9439" +dependencies = [ + "getrandom", +] + [[package]] name = "version_check" version = "0.9.4" diff --git a/Cargo.toml b/Cargo.toml index 1fc42321..4f572ed8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,7 @@ features = ["macros", "rt-multi-thread"] arrow = "52.0.0" rayon = "1.10.0" regex = "1.10.5" +uuid = { version = "1.9.1", features = ["v4", "rng"] } # gRPC stuff. prost = "0.12.6" diff --git a/src/db/database.rs b/src/db/database.rs index da69cf23..68cf4ded 100644 --- a/src/db/database.rs +++ b/src/db/database.rs @@ -1,6 +1,7 @@ use super::*; use regex::Regex; use serde::de::DeserializeOwned; +use uuid::Uuid; // Database sub-directory structure. const COLLECTIONS_DIR: &str = "collections"; @@ -128,15 +129,13 @@ impl Database { } // Create the collection directory. - let collection_dir = self.directory.join(COLLECTIONS_DIR).join(name); + let uuid = Uuid::new_v4().to_string(); + let collection_dir = self.directory.join(COLLECTIONS_DIR).join(uuid); fs::create_dir(&collection_dir)?; // Update the database state. - *state = { - let mut _state = state.clone(); - _state.collection_refs.insert(name.to_string(), collection_dir); - _state - }; + state.collection_refs.insert(name.to_string(), collection_dir); + *state = state.clone(); // Drop the lock to prevent deadlocks since // persist_state also requires the lock. 
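For reference, the validator above surfaces to clients through the Error-to-Status mapping from patch 19. A minimal client sketch, assuming the tonic-generated database_client module for this proto package (the address and collection names below are illustrative, not part of the patches):

use proto::database_client::DatabaseClient;
use proto::CreateCollectionRequest;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let mut client = DatabaseClient::connect("http://127.0.0.1:2525").await?;

    // A lowercase name with underscores satisfies ^[a-z_]+$.
    let request = CreateCollectionRequest { name: "my_collection".into() };
    client.create_collection(request).await?;

    // Anything else is rejected as a ClientError, which the server
    // surfaces as Code::InvalidArgument.
    let request = CreateCollectionRequest { name: "Bad-Name".into() };
    let status = client.create_collection(request).await.unwrap_err();
    assert_eq!(status.code(), tonic::Code::InvalidArgument);
    Ok(())
}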
From fcf0c0b45f18a0d48b06fa35bf14d6c5fa6398f4 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Mon, 24 Jun 2024 18:21:05 -0500 Subject: [PATCH 24/88] feat: add delete collection rpc --- protos/database.proto | 6 ++++++ src/db/database.rs | 19 +++++++++++++++++++ src/db/database_service.rs | 11 ++++++++++- src/tests/mod.rs | 4 +++- src/tests/test_database.rs | 14 ++++++++++++-- 5 files changed, 50 insertions(+), 4 deletions(-) diff --git a/protos/database.proto b/protos/database.proto index 1fdfe312..8fde0ad3 100644 --- a/protos/database.proto +++ b/protos/database.proto @@ -5,9 +5,15 @@ import "google/protobuf/empty.proto"; service Database { rpc CreateCollection(CreateCollectionRequest) returns (google.protobuf.Empty); + rpc DeleteCollection(DeleteCollectionRequest) returns (google.protobuf.Empty); } // region CreateCollection message CreateCollectionRequest { string name = 1; } + +// region DeleteCollection + message DeleteCollectionRequest { + string name = 1; + } diff --git a/src/db/database.rs b/src/db/database.rs index 68cf4ded..40d13335 100644 --- a/src/db/database.rs +++ b/src/db/database.rs @@ -145,6 +145,25 @@ impl Database { Ok(()) } + pub fn _delete_collection(&self, name: &str) -> Result<(), Error> { + let mut state = self.state.write()?; + if !state.collection_refs.contains_key(name) { + return Ok(()); + } + + // Delete the collection directory. + // We can unwrap here because we checked if the collection exists. + let collection_dir = state.collection_refs.remove(name).unwrap(); + fs::remove_dir_all(&collection_dir)?; + + // Update the database state. + *state = state.clone(); + drop(state); + + self.persist_state()?; + Ok(()) + } + fn validate_collection_name(name: &str) -> Result<(), Error> { if name.is_empty() { let code = ErrorCode::ClientError; diff --git a/src/db/database_service.rs b/src/db/database_service.rs index 85156660..ee97450a 100644 --- a/src/db/database_service.rs +++ b/src/db/database_service.rs @@ -1,7 +1,7 @@ use super::database::Database; use super::*; use crate::proto::database_server::Database as ProtoDatabase; -use crate::proto::CreateCollectionRequest; +use crate::proto::*; #[tonic::async_trait] impl ProtoDatabase for Database { @@ -13,4 +13,13 @@ impl ProtoDatabase for Database { self._create_collection(&request.name)?; Ok(Response::new(())) } + + async fn delete_collection( + &self, + request: Request, + ) -> Result, Status> { + let request = request.into_inner(); + self._delete_collection(&request.name)?; + Ok(Response::new(())) + } } diff --git a/src/tests/mod.rs b/src/tests/mod.rs index b2b95500..cf36fb68 100644 --- a/src/tests/mod.rs +++ b/src/tests/mod.rs @@ -7,8 +7,9 @@ mod test_collection; mod test_database; const TEST_DIR: &str = "/tmp/oasysdb"; +const TEST_COLLECTION: &str = "collection"; -fn create_new_test_database() -> Result { +fn create_test_database() -> Result { // Reset the database directory for testing. 
let path = PathBuf::from(TEST_DIR); if path.exists() { @@ -20,5 +21,6 @@ fn create_new_test_database() -> Result { let subdirs = path.read_dir()?; assert!(subdirs.count() >= 3); + db._create_collection(TEST_COLLECTION)?; Ok(db) } diff --git a/src/tests/test_database.rs b/src/tests/test_database.rs index 56a1fe08..abfed2eb 100644 --- a/src/tests/test_database.rs +++ b/src/tests/test_database.rs @@ -2,11 +2,21 @@ use super::*; #[test] fn test_create_collection() -> Result<(), Error> { - let db = create_new_test_database()?; - let name = "collection"; + let db = create_test_database()?; + let name = "new_collection"; db._create_collection(name)?; let state = db.state()?; assert!(state.collection_refs.contains_key(name)); Ok(()) } + +#[test] +fn test_delete_collection() -> Result<(), Error> { + let db = create_test_database()?; + db._delete_collection(TEST_COLLECTION)?; + + let state = db.state()?; + assert!(!state.collection_refs.contains_key(TEST_COLLECTION)); + Ok(()) +} From ed2be7cd50cb5f8942ec2083a7907b6cd44b38ed Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Mon, 24 Jun 2024 19:10:43 -0500 Subject: [PATCH 25/88] refactor: create state file for new db --- src/db/database.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/db/database.rs b/src/db/database.rs index 40d13335..1180e602 100644 --- a/src/db/database.rs +++ b/src/db/database.rs @@ -28,13 +28,21 @@ pub struct Database { impl Database { pub fn open(directory: PathBuf) -> Result { - if !directory.try_exists()? { + // If it's a new database, we want to initialize everything need. + let mut new = false; + if !directory.join(STATE_FILE).try_exists()? { Self::initialize_directory(&directory)?; + new = true; } let state = Lock::new(DatabaseState::default()); let mut db = Self { directory, state }; + // This creates initial empty state file for new databases. + if new { + db.persist_state()?; + } + db.restore_state()?; Ok(db) } @@ -64,14 +72,6 @@ impl Database { fn restore_state(&mut self) -> Result<(), Error> { let state_file = self.directory.join(STATE_FILE); - - // If there are no state file, return early. - // This is not an error, as the database may be new. - if !state_file.try_exists()? { - return Ok(()); - } - - // Restore the database states. 
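The net effect of patch 25 is that opening a database becomes idempotent with respect to the state file. A short sketch of the round trip, using the internal methods shown above (the directory name is illustrative):

use std::path::PathBuf;

fn state_round_trip() -> Result<(), Error> {
    // First open: creates the directory tree and an empty state file.
    let db = Database::open(PathBuf::from("odb_demo"))?;
    db._create_collection("demo")?;
    drop(db);

    // Second open: the state file exists, so the collection map is restored.
    let db = Database::open(PathBuf::from("odb_demo"))?;
    assert!(db.state()?.collection_refs.contains_key("demo"));
    Ok(())
}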
self.state = Self::read_binary_file(&state_file)?; Ok(()) } From c6253669a57c21e9519d6b3958f2902e2fac5bdd Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Mon, 24 Jun 2024 20:35:12 -0500 Subject: [PATCH 26/88] feat: write initial collection state when creating collection --- Cargo.lock | 4 ++++ Cargo.toml | 1 + src/db/collection.rs | 44 ++++++++++++++++++++++-------------- src/db/database.rs | 14 ++++++++---- src/db/mod.rs | 2 ++ src/tests/test_collection.rs | 11 +++++---- 6 files changed, 50 insertions(+), 26 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5a3eeae3..96108f71 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -244,6 +244,9 @@ name = "arrow-schema" version = "52.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32aae6a60458a2389c0da89c9de0b7932427776127da1a738e2efc21d32f3393" +dependencies = [ + "serde", +] [[package]] name = "arrow-select" @@ -1055,6 +1058,7 @@ name = "oasysdb" version = "0.7.0" dependencies = [ "arrow", + "arrow-schema", "bincode", "prost", "rayon", diff --git a/Cargo.toml b/Cargo.toml index 4f572ed8..7e7083a3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,7 @@ features = ["macros", "rt-multi-thread"] [dependencies] arrow = "52.0.0" +arrow-schema = { version = "52.0.0", features = ["serde"] } rayon = "1.10.0" regex = "1.10.5" uuid = { version = "1.9.1", features = ["v4", "rng"] } diff --git a/src/db/collection.rs b/src/db/collection.rs index 5f6514b5..91cdde0b 100644 --- a/src/db/collection.rs +++ b/src/db/collection.rs @@ -1,24 +1,38 @@ use super::*; use arrow::array::RecordBatch; -use arrow::datatypes::{Fields, Schema}; +use arrow::datatypes::Fields; +use arrow_schema::Schema; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CollectionState { + pub schema: Schema, + pub count: usize, +} + +impl Default for CollectionState { + fn default() -> Self { + Self { schema: Schema::empty(), count: 0 } + } +} pub struct Collection { - schema: Lock, data: Lock>, - count: Lock, + state: Lock, } impl Collection { - pub fn new() -> Self { - let schema = Lock::new(Schema::empty()); + pub fn new() -> Result { let data = Lock::new(vec![]); - let count = Lock::new(0); - Self { schema, data, count } + let state = Lock::new(CollectionState::default()); + let collection = Self { data, state }; + Ok(collection) } pub fn add_fields(&self, fields: impl Into) -> Result<(), Error> { + let mut state = self.state.write()?; + // Create a new schema with the new field. - let mut schema = self.schema.write()?; + let schema = &state.schema; let schemas = vec![schema.clone(), Schema::new(fields)]; let new_schema = Schema::try_merge(schemas)?; @@ -34,19 +48,15 @@ impl Collection { let mut data = self.data.write()?; let migrated_data = data.par_iter().map(migrate_data).collect(); - // Update the schema and data. - *schema = new_schema; + // Update the state and data. + state.schema = new_schema; + *state = state.clone(); *data = migrated_data; Ok(()) } - pub fn count(&self) -> usize { - *self.count.read().unwrap() - } - - pub fn schema(&self) -> Result { - let schema = self.schema.read()?; - Ok(schema.clone()) + pub fn state(&self) -> Result { + Ok(self.state.read()?.clone()) } } diff --git a/src/db/database.rs b/src/db/database.rs index 1180e602..b3b0ae6c 100644 --- a/src/db/database.rs +++ b/src/db/database.rs @@ -10,7 +10,8 @@ const TMP_DIR: &str = "tmp"; const SUBDIRS: [&str; 3] = [COLLECTIONS_DIR, INDICES_DIR, TMP_DIR]; // This is where the serialized database states are stored. 
-const STATE_FILE: &str = "dbstate"; +const DB_STATE_FILE: &str = "dbstate"; +const COLLECTION_STATE_FILE: &str = "cstate"; // Type aliases for improved readability. type CollectionName = String; @@ -30,7 +31,7 @@ impl Database { pub fn open(directory: PathBuf) -> Result { // If it's a new database, we want to initialize everything need. let mut new = false; - if !directory.join(STATE_FILE).try_exists()? { + if !directory.join(DB_STATE_FILE).try_exists()? { Self::initialize_directory(&directory)?; new = true; } @@ -49,7 +50,7 @@ impl Database { pub fn persist_state(&self) -> Result<(), Error> { let state = self.state.read()?.clone(); - let state_file = self.directory.join(STATE_FILE); + let state_file = self.directory.join(DB_STATE_FILE); self.write_binary_file(&state, &state_file) } @@ -71,7 +72,7 @@ impl Database { } fn restore_state(&mut self) -> Result<(), Error> { - let state_file = self.directory.join(STATE_FILE); + let state_file = self.directory.join(DB_STATE_FILE); self.state = Self::read_binary_file(&state_file)?; Ok(()) } @@ -133,6 +134,11 @@ impl Database { let collection_dir = self.directory.join(COLLECTIONS_DIR).join(uuid); fs::create_dir(&collection_dir)?; + // Initialize the collection state. + let collection_state = CollectionState::default(); + let collection_state_file = collection_dir.join(COLLECTION_STATE_FILE); + self.write_binary_file(&collection_state, &collection_state_file)?; + // Update the database state. state.collection_refs.insert(name.to_string(), collection_dir); *state = state.clone(); diff --git a/src/db/mod.rs b/src/db/mod.rs index 05fd5959..44809478 100644 --- a/src/db/mod.rs +++ b/src/db/mod.rs @@ -11,3 +11,5 @@ use tonic::{Request, Response, Status}; pub mod collection; pub mod database; pub mod database_service; + +use collection::*; diff --git a/src/tests/test_collection.rs b/src/tests/test_collection.rs index 681533c0..8b898e44 100644 --- a/src/tests/test_collection.rs +++ b/src/tests/test_collection.rs @@ -3,18 +3,19 @@ use crate::db::collection::Collection; use arrow::datatypes::{DataType, Field}; #[test] -fn test_collection_new() { - let collection = Collection::new(); - assert_eq!(collection.count(), 0); +fn test_collection_new() -> Result<(), Error> { + let collection = Collection::new()?; + assert_eq!(collection.state()?.count, 0); + Ok(()) } #[test] fn test_collection_add_field() -> Result<(), Error> { - let collection = Collection::new(); + let collection = Collection::new()?; let field = Field::new("id", DataType::Utf8, false); collection.add_fields(vec![field])?; - let schema = collection.schema()?; + let schema = collection.state()?.schema; assert_eq!(schema.fields().len(), 1); assert_eq!(schema.field(0).name(), "id"); From 8ce2670f40ccb6cb8f2c00cb5d73a77e67bc32e8 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Mon, 24 Jun 2024 20:37:15 -0500 Subject: [PATCH 27/88] style: fix clipy linting --- src/db/database.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/db/database.rs b/src/db/database.rs index b3b0ae6c..224cdbaa 100644 --- a/src/db/database.rs +++ b/src/db/database.rs @@ -160,7 +160,7 @@ impl Database { // Delete the collection directory. // We can unwrap here because we checked if the collection exists. let collection_dir = state.collection_refs.remove(name).unwrap(); - fs::remove_dir_all(&collection_dir)?; + fs::remove_dir_all(collection_dir)?; // Update the database state. 
*state = state.clone(); @@ -182,7 +182,7 @@ impl Database { let code = ErrorCode::ClientError; let message = "Collection name must be lowercase letters \ with underscores."; - return Err(Error::new(&code, &message)); + return Err(Error::new(&code, message)); } Ok(()) From bedb6a72834cf168b06415850e4dcc0c6e1e7524 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Wed, 26 Jun 2024 13:16:20 -0500 Subject: [PATCH 28/88] feat: add add fields to collection --- .gitignore | 3 + protos/database.proto | 14 +++ src/db/collection.rs | 120 ++++++++++++++++++----- src/db/database.rs | 180 ++++++++++++++--------------------- src/db/database_service.rs | 27 +++++- src/db/mod.rs | 33 +++++-- src/main.rs | 4 +- src/tests/mod.rs | 22 ++++- src/tests/test_collection.rs | 11 +-- src/tests/test_database.rs | 26 ++++- src/types/error.rs | 1 + src/types/file.rs | 80 ++++++++++++++++ src/types/metadata.rs | 52 ++++++++++ src/types/mod.rs | 8 +- 14 files changed, 423 insertions(+), 158 deletions(-) create mode 100644 src/types/file.rs create mode 100644 src/types/metadata.rs diff --git a/.gitignore b/.gitignore index 738f868d..d2809d4f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +# OasysDB tests. +odb* + # Rust stuff. debug target diff --git a/protos/database.proto b/protos/database.proto index 8fde0ad3..1a390ab5 100644 --- a/protos/database.proto +++ b/protos/database.proto @@ -6,6 +6,8 @@ import "google/protobuf/empty.proto"; service Database { rpc CreateCollection(CreateCollectionRequest) returns (google.protobuf.Empty); rpc DeleteCollection(DeleteCollectionRequest) returns (google.protobuf.Empty); + + rpc AddFields(AddFieldsRequest) returns (google.protobuf.Empty); } // region CreateCollection @@ -17,3 +19,15 @@ service Database { message DeleteCollectionRequest { string name = 1; } + +// region AddFields + message Field { + string name = 1; + string datatype = 2; + bool nullable = 3; + } + + message AddFieldsRequest { + string collection_name = 1; + repeated Field fields = 2; + } diff --git a/src/db/collection.rs b/src/db/collection.rs index 91cdde0b..3a513e93 100644 --- a/src/db/collection.rs +++ b/src/db/collection.rs @@ -1,7 +1,6 @@ use super::*; -use arrow::array::RecordBatch; -use arrow::datatypes::Fields; -use arrow_schema::Schema; +use arrow::ipc::writer::FileWriter; +use arrow::record_batch::RecordBatch; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct CollectionState { @@ -9,54 +8,125 @@ pub struct CollectionState { pub count: usize, } -impl Default for CollectionState { - fn default() -> Self { - Self { schema: Schema::empty(), count: 0 } +impl CollectionState { + fn new() -> Self { + let field_id = Field::new("internal_id", DataType::Int32, false); + + let vector_type = MetadataType::Vector.into(); + let field_vector = Field::new("vector", vector_type, false); + + // The default schema for a new collection contains two fields: + // internal_id and vector. 
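+ // User-defined metadata fields are merged in later through add_fields,
+ // which is only allowed while the collection is still empty.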
+ let schema = Schema::new(vec![field_id, field_vector]); + Self { schema, count: 0 } + } +} + +struct Directories { + pub root: PathBuf, + pub state_file: PathBuf, + pub data_file: PathBuf, +} + +impl Directories { + fn new(root: PathBuf) -> Self { + let state_file = root.join("cstate"); + let data_file = root.join("cdata"); + Self { root, state_file, data_file } } } pub struct Collection { - data: Lock>, + dirs: Directories, state: Lock, } impl Collection { - pub fn new() -> Result { - let data = Lock::new(vec![]); - let state = Lock::new(CollectionState::default()); - let collection = Self { data, state }; + pub fn open(dir: PathBuf) -> Result { + if !dir.try_exists()? { + fs::create_dir_all(&dir)?; + } + + let dirs = Directories::new(dir); + let state = if !dirs.state_file.try_exists()? { + let state = Self::initialize_state(&dirs.state_file)?; + Self::initialize_data_file(&dirs.data_file, &state.schema)?; + state + } else { + Self::read_state(&dirs.state_file)? + }; + + let state = Lock::new(state); + let collection = Self { dirs, state }; Ok(collection) } + /// Creates an empty data file for the collection. + /// This method should only be called once, when the collection is created. + fn initialize_data_file( + path: &PathBuf, + schema: &Schema, + ) -> Result<(), Error> { + let file = OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(path)?; + + let writer = BufWriter::new(file); + let mut file_writer = FileWriter::try_new(writer, schema)?; + + let record = RecordBatch::new_empty(Arc::new(schema.clone())); + file_writer.write(&record)?; + + file_writer.finish()?; + Ok(()) + } + pub fn add_fields(&self, fields: impl Into) -> Result<(), Error> { let mut state = self.state.write()?; + // OasysDB doesn't support adding fields to a non-empty + // collection due to the nature of the indexing system. + if state.count > 0 { + let code = ErrorCode::ClientError; + let message = "Unable to add fields to a non-empty collection"; + return Err(Error::new(&code, message)); + } + // Create a new schema with the new field. let schema = &state.schema; let schemas = vec![schema.clone(), Schema::new(fields)]; let new_schema = Schema::try_merge(schemas)?; - // Migrate the data to the new schema. - let migrate_data = |batch: &RecordBatch| { - let schema = Arc::new(new_schema.clone()); - - // We can unwrap here because the new schema is guaranted - // to be a superset of the old schema. - batch.clone().with_schema(schema).unwrap() - }; - - let mut data = self.data.write()?; - let migrated_data = data.par_iter().map(migrate_data).collect(); - // Update the state and data. 
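+ // Schema::try_merge has already rejected any conflicting field
+ // definitions at this point.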
state.schema = new_schema; *state = state.clone(); - *data = migrated_data; + drop(state); + self.persist_state()?; Ok(()) } +} + +impl StateMachine for Collection { + fn initialize_state(path: &PathBuf) -> Result { + let state = CollectionState::new(); + FileOps::default().write_binary_file(path, &state)?; + Ok(state) + } - pub fn state(&self) -> Result { + fn read_state(path: &PathBuf) -> Result { + FileOps::default().read_binary_file(path) + } + + fn state(&self) -> Result { Ok(self.state.read()?.clone()) } + + fn persist_state(&self) -> Result<(), Error> { + let state = self.state.read()?.clone(); + let file_ops = FileOps::default(); + file_ops.write_binary_file(&self.dirs.state_file, &state) + } } diff --git a/src/db/database.rs b/src/db/database.rs index 224cdbaa..e376b70f 100644 --- a/src/db/database.rs +++ b/src/db/database.rs @@ -1,118 +1,48 @@ use super::*; use regex::Regex; -use serde::de::DeserializeOwned; use uuid::Uuid; -// Database sub-directory structure. -const COLLECTIONS_DIR: &str = "collections"; -const INDICES_DIR: &str = "indices"; -const TMP_DIR: &str = "tmp"; -const SUBDIRS: [&str; 3] = [COLLECTIONS_DIR, INDICES_DIR, TMP_DIR]; - -// This is where the serialized database states are stored. -const DB_STATE_FILE: &str = "dbstate"; -const COLLECTION_STATE_FILE: &str = "cstate"; - -// Type aliases for improved readability. -type CollectionName = String; -type CollectionPath = PathBuf; - #[derive(Debug, Clone, Serialize, Deserialize, Default)] pub struct DatabaseState { - pub collection_refs: HashMap, + pub collection_refs: HashMap, +} + +struct Directories { + pub root: PathBuf, + pub collections_dir: PathBuf, + pub state_file: PathBuf, +} + +impl Directories { + fn new(root: PathBuf) -> Self { + let collections_dir = root.join("collections"); + let state_file = root.join("dbstate"); + Self { root, collections_dir, state_file } + } } pub struct Database { - directory: PathBuf, + dirs: Directories, state: Lock, } impl Database { - pub fn open(directory: PathBuf) -> Result { - // If it's a new database, we want to initialize everything need. - let mut new = false; - if !directory.join(DB_STATE_FILE).try_exists()? { - Self::initialize_directory(&directory)?; - new = true; - } - - let state = Lock::new(DatabaseState::default()); - let mut db = Self { directory, state }; - - // This creates initial empty state file for new databases. - if new { - db.persist_state()?; - } - - db.restore_state()?; + pub fn open(dir: PathBuf) -> Result { + let dirs = Directories::new(dir); + + let state_file = &dirs.state_file; + let state = if !state_file.try_exists()? { + // Creating a collection directory will create the root directory. + fs::create_dir_all(&dirs.collections_dir)?; + Self::initialize_state(&state_file)? + } else { + Self::read_state(&state_file)? + }; + + let state = Lock::new(state); + let db = Self { dirs, state }; Ok(db) } - - pub fn persist_state(&self) -> Result<(), Error> { - let state = self.state.read()?.clone(); - let state_file = self.directory.join(DB_STATE_FILE); - self.write_binary_file(&state, &state_file) - } - - pub fn state(&self) -> Result { - Ok(self.state.read()?.clone()) - } - - fn initialize_directory(directory: &PathBuf) -> Result<(), Error> { - // Create the parent directory of the database. - fs::create_dir_all(directory)?; - - // Create the subdirectories for the database. 
- for subdir in SUBDIRS { - let subdir_path = directory.join(subdir); - fs::create_dir(&subdir_path)?; - } - - Ok(()) - } - - fn restore_state(&mut self) -> Result<(), Error> { - let state_file = self.directory.join(DB_STATE_FILE); - self.state = Self::read_binary_file(&state_file)?; - Ok(()) - } - - fn read_binary_file( - path: &PathBuf, - ) -> Result { - let file = OpenOptions::new().read(true).open(path)?; - let reader = BufReader::new(file); - bincode::deserialize_from(reader).map_err(Into::into) - } - - fn write_binary_file( - &self, - data: &T, - path: &PathBuf, - ) -> Result<(), Error> { - let filename = path.file_name().ok_or_else(|| { - // This error should never happen unless the user tinkers with it. - let code = ErrorCode::FileError; - let message = format!("Invalid file path: {path:?}"); - Error::new(&code, &message) - })?; - - // Write the data to a temporary file first. - // If this fails, the original file will not be overwritten. - let tmp_path = self.directory.join(TMP_DIR).join(filename); - let file = OpenOptions::new() - .write(true) - .create(true) - .truncate(true) - .open(&tmp_path)?; - - let writer = BufWriter::new(file); - bincode::serialize_into(writer, data)?; - - // If the serialization is successful, rename the temporary file. - fs::rename(&tmp_path, path)?; - Ok(()) - } } // This implementation block contains methods used by the gRPC server. @@ -131,13 +61,10 @@ impl Database { // Create the collection directory. let uuid = Uuid::new_v4().to_string(); - let collection_dir = self.directory.join(COLLECTIONS_DIR).join(uuid); - fs::create_dir(&collection_dir)?; + let collection_dir = self.dirs.collections_dir.join(uuid); - // Initialize the collection state. - let collection_state = CollectionState::default(); - let collection_state_file = collection_dir.join(COLLECTION_STATE_FILE); - self.write_binary_file(&collection_state, &collection_state_file)?; + // Initialize the collection. + Collection::open(collection_dir.to_path_buf())?; // Update the database state. 
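// The public collection name maps to the UUID-named directory on disk.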
state.collection_refs.insert(name.to_string(), collection_dir); @@ -170,6 +97,26 @@ impl Database { Ok(()) } + pub fn _add_fields( + &self, + collection_name: &str, + fields: impl Into, + ) -> Result<(), Error> { + let state = self.state.read()?; + let dir = match state.collection_refs.get(collection_name) { + Some(dir) => dir, + None => { + let code = ErrorCode::ClientError; + let message = format!("No collection name: {collection_name}"); + return Err(Error::new(&code, &message)); + } + }; + + let collection = Collection::open(dir.to_path_buf())?; + collection.add_fields(fields)?; + Ok(()) + } + fn validate_collection_name(name: &str) -> Result<(), Error> { if name.is_empty() { let code = ErrorCode::ClientError; @@ -188,3 +135,24 @@ impl Database { Ok(()) } } + +impl StateMachine for Database { + fn initialize_state(path: &PathBuf) -> Result { + let state = DatabaseState::default(); + FileOps::default().write_binary_file(path, &state)?; + Ok(state) + } + + fn read_state(path: &PathBuf) -> Result { + FileOps::default().read_binary_file(path) + } + + fn state(&self) -> Result { + Ok(self.state.read()?.clone()) + } + + fn persist_state(&self) -> Result<(), Error> { + let state = self.state.read()?.clone(); + FileOps::default().write_binary_file(&self.dirs.state_file, &state) + } +} diff --git a/src/db/database_service.rs b/src/db/database_service.rs index ee97450a..3cba8726 100644 --- a/src/db/database_service.rs +++ b/src/db/database_service.rs @@ -1,13 +1,12 @@ -use super::database::Database; use super::*; +use crate::proto; use crate::proto::database_server::Database as ProtoDatabase; -use crate::proto::*; #[tonic::async_trait] impl ProtoDatabase for Database { async fn create_collection( &self, - request: Request, + request: Request, ) -> Result, Status> { let request = request.into_inner(); self._create_collection(&request.name)?; @@ -16,10 +15,30 @@ impl ProtoDatabase for Database { async fn delete_collection( &self, - request: Request, + request: Request, ) -> Result, Status> { let request = request.into_inner(); self._delete_collection(&request.name)?; Ok(Response::new(())) } + + async fn add_fields( + &self, + request: Request, + ) -> Result, Status> { + let request = request.into_inner(); + + // Construct Arrow fields from the request fields. + let mut fields = vec![]; + for field in request.fields { + // Use the MetadataType as a proxy to convert string to DataType. + let metadata_type: MetadataType = field.datatype.into(); + let datatype: DataType = metadata_type.into(); + let new_field = Field::new(&field.name, datatype, true); + fields.push(new_field); + } + + self._add_fields(&request.collection_name, fields)?; + Ok(Response::new(())) + } } diff --git a/src/db/mod.rs b/src/db/mod.rs index 44809478..e87120f5 100644 --- a/src/db/mod.rs +++ b/src/db/mod.rs @@ -1,15 +1,34 @@ -use crate::types::error::{Error, ErrorCode}; -use rayon::prelude::*; +use crate::types::*; +use arrow::datatypes::DataType; +use arrow_schema::{Field, Fields, Schema}; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::fs::{self, OpenOptions}; -use std::io::{BufReader, BufWriter}; +use std::io::BufWriter; use std::path::PathBuf; use std::sync::{Arc, RwLock as Lock}; use tonic::{Request, Response, Status}; -pub mod collection; -pub mod database; -pub mod database_service; +mod collection; +mod database; +mod database_service; -use collection::*; +pub use collection::*; +pub use database::*; + +/// A trait for objects that own a state that should be persisted to disk. 
+/// - `T`: Type of the state object. +pub trait StateMachine<T> { + /// Initializes the state object and persists it to a file. + /// This method should be called only once when the object is created. + fn initialize_state(path: &PathBuf) -> Result<T, Error>; + + /// Reads the state object from a file. + fn read_state(path: &PathBuf) -> Result<T, Error>; + + /// Returns a reference to the state object. + fn state(&self) -> Result<T, Error>; + + /// Persists the state object to a file. + fn persist_state(&self) -> Result<(), Error>; +} diff --git a/src/main.rs b/src/main.rs index f657d6c4..feb91e6c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,7 +7,7 @@ mod types; #[cfg(test)] mod tests; -use db::database::Database; +use db::*; use proto::database_server::DatabaseServer; use std::path::PathBuf; use tonic::transport::Server; @@ -19,7 +19,7 @@ const PORT: u16 = 2525; async fn main() -> Result<(), Box<dyn std::error::Error>> { let addr = format!("{HOST}:{PORT}").parse()?; - let path = PathBuf::from("/tmp/oasysdb"); + let path = PathBuf::from("odb_data"); let database = Database::open(path)?; Server::builder() diff --git a/src/tests/mod.rs b/src/tests/mod.rs index cf36fb68..14a46758 100644 --- a/src/tests/mod.rs +++ b/src/tests/mod.rs @@ -1,12 +1,13 @@ -use crate::db::database::Database; -use crate::types::error::Error; +use crate::db::*; +use crate::types::*; +use arrow::datatypes::{DataType, Field}; use std::fs; use std::path::PathBuf; mod test_collection; mod test_database; -const TEST_DIR: &str = "/tmp/oasysdb"; +const TEST_DIR: &str = "odb_data"; const TEST_COLLECTION: &str = "collection"; fn create_test_database() -> Result<Database, Error> { // Reset the database directory for testing. let path = PathBuf::from(TEST_DIR); if path.exists() { fs::remove_dir_all(&path)?; } // The database should have some subdirectories. let db = Database::open(path.clone())?; - let subdirs = path.read_dir()?; - assert!(subdirs.count() >= 3); + let content = path.read_dir()?; + assert!(content.count() == 2); // Create a test collection. db._create_collection(TEST_COLLECTION)?; Ok(db) } + +fn get_test_collection() -> Result<Collection, Error> { + let db = create_test_database()?; + let collection_refs = db.state()?.collection_refs; + + let directory = collection_refs[TEST_COLLECTION].to_path_buf(); + let collection = Collection::open(directory)?; + Ok(collection) +} diff --git a/src/tests/test_collection.rs index 8b898e44..db3859d8 100644 --- a/src/tests/test_collection.rs +++ b/src/tests/test_collection.rs @@ -1,23 +1,22 @@ use super::*; -use crate::db::collection::Collection; -use arrow::datatypes::{DataType, Field}; #[test] fn test_collection_new() -> Result<(), Error> { - let collection = Collection::new()?; + let collection = get_test_collection()?; assert_eq!(collection.state()?.count, 0); Ok(()) } #[test] fn test_collection_add_field() -> Result<(), Error> { - let collection = Collection::new()?; + let collection = get_test_collection()?; let field = Field::new("id", DataType::Utf8, false); collection.add_fields(vec![field])?; + // OasysDB has 2 default fields: internal_id and vector.
let schema = collection.state()?.schema; - assert_eq!(schema.fields().len(), 1); - assert_eq!(schema.field(0).name(), "id"); + assert_eq!(schema.fields().len(), 3); + assert_eq!(schema.field(2).name(), "id"); Ok(()) } diff --git a/src/tests/test_database.rs b/src/tests/test_database.rs index abfed2eb..a770b4c3 100644 --- a/src/tests/test_database.rs +++ b/src/tests/test_database.rs @@ -1,7 +1,7 @@ use super::*; #[test] -fn test_create_collection() -> Result<(), Error> { +fn test_database_create_collection() -> Result<(), Error> { let db = create_test_database()?; let name = "new_collection"; db._create_collection(name)?; @@ -12,7 +12,7 @@ fn test_create_collection() -> Result<(), Error> { } #[test] -fn test_delete_collection() -> Result<(), Error> { +fn test_database_delete_collection() -> Result<(), Error> { let db = create_test_database()?; db._delete_collection(TEST_COLLECTION)?; @@ -20,3 +20,25 @@ fn test_delete_collection() -> Result<(), Error> { assert!(!state.collection_refs.contains_key(TEST_COLLECTION)); Ok(()) } + +#[test] +fn test_database_add_fields() -> Result<(), Error> { + let database = create_test_database()?; + + let state = database.state()?; + let dir = &state.collection_refs[TEST_COLLECTION]; + + // The collection has 2 default fields. + let collection = Collection::open(dir.clone())?; + assert!(collection.state()?.schema.fields().len() == 2); + + let field = Field::new("id", DataType::Utf8, false); + database._add_fields(TEST_COLLECTION, vec![field])?; + + // The collection should have 3 fields now. + let collection = Collection::open(dir.clone())?; + let schema = collection.state()?.schema; + assert!(schema.fields().len() == 3); + + Ok(()) +} diff --git a/src/types/error.rs b/src/types/error.rs index 91b3a002..42a62ace 100644 --- a/src/types/error.rs +++ b/src/types/error.rs @@ -12,6 +12,7 @@ use std::sync::PoisonError; pub enum ErrorCode { ArrowError, ClientError, + CollectionError, ConcurrencyError, FileError, SerializationError, diff --git a/src/types/file.rs b/src/types/file.rs new file mode 100644 index 00000000..b50adb7a --- /dev/null +++ b/src/types/file.rs @@ -0,0 +1,80 @@ +use super::error::{Error, ErrorCode}; +use serde::de::DeserializeOwned; +use serde::Serialize; +use std::env; +use std::fs::{self, OpenOptions}; +use std::io::{BufReader, BufWriter}; +use std::path::PathBuf; + +/// A utility struct for reading and writing files. +pub struct FileOps { + tmp_dir: PathBuf, +} + +impl Default for FileOps { + fn default() -> Self { + let tmp_dir = env::temp_dir().join("oasysdb"); + Self::new(tmp_dir) + } +} + +impl FileOps { + pub fn new(tmp_dir: PathBuf) -> Self { + if !tmp_dir.exists() { + fs::create_dir_all(&tmp_dir) + .expect("Unable to create a temporary directory.") + } + + Self { tmp_dir } + } + + /// Reads a binary file and deserialize it into a type. + pub fn read_binary_file( + &self, + path: &PathBuf, + ) -> Result { + let file = OpenOptions::new().read(true).open(path)?; + let reader = BufReader::new(file); + bincode::deserialize_from(reader).map_err(Into::into) + } + + /// Serializes a type and write it to a binary file. + /// + /// The file is written to a temporary file first, then renamed + /// to make sure that the file is not corrupted if the operation fails. + pub fn write_binary_file( + &self, + path: &PathBuf, + data: &T, + ) -> Result<(), Error> { + let filename = self.parse_file_name(path)?; + + // Write the data to a temporary file first. + // If this fails, the original file will not be overwritten. 
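+ // Note: fs::rename cannot move files across filesystems, so the
+ // temporary directory must live on the same mount as the destination.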
+        let tmp_path = self.tmp_dir.join(filename);
+        let file = OpenOptions::new()
+            .write(true)
+            .create(true)
+            .truncate(true)
+            .open(&tmp_path)?;
+
+        let writer = BufWriter::new(file);
+        bincode::serialize_into(writer, data)?;
+
+        // If the serialization is successful, rename the temporary file.
+        fs::rename(&tmp_path, path)?;
+        Ok(())
+    }
+
+    /// Parses a file name from a path.
+    pub fn parse_file_name(&self, path: &PathBuf) -> Result<String, Error> {
+        path.file_name()
+            .and_then(|name| name.to_str())
+            .map(|name| name.to_string())
+            .ok_or_else(|| {
+                let code = ErrorCode::FileError;
+                let message = format!("Invalid file name from path: {path:?}");
+                Error::new(&code, &message)
+            })
+    }
+}
diff --git a/src/types/metadata.rs b/src/types/metadata.rs
new file mode 100644
index 00000000..99f47fa5
--- /dev/null
+++ b/src/types/metadata.rs
@@ -0,0 +1,52 @@
+use arrow_schema::{DataType, Field};
+
+/// Data types supported in OasysDB Arrow fields.
+pub enum MetadataType {
+    Integer,
+    Float,
+    String,
+    Boolean,
+    Vector,
+}
+
+// Available OasysDB data types in string form.
+// These constants prevent typos in the code.
+const INTEGER: &str = "integer";
+const FLOAT: &str = "float";
+const STRING: &str = "string";
+const BOOLEAN: &str = "boolean";
+const VECTOR: &str = "vector";
+
+// Implement interoperability FROM and INTO other data types.
+
+impl From<&str> for MetadataType {
+    fn from(value: &str) -> Self {
+        match value {
+            INTEGER => MetadataType::Integer,
+            FLOAT => MetadataType::Float,
+            STRING => MetadataType::String,
+            BOOLEAN => MetadataType::Boolean,
+            VECTOR => MetadataType::Vector,
+            _ => panic!("Unsupported metadata type: {value}"),
+        }
+    }
+}
+
+impl From<String> for MetadataType {
+    fn from(value: String) -> Self {
+        MetadataType::from(value.as_str())
+    }
+}
+
+impl Into<DataType> for MetadataType {
+    fn into(self) -> DataType {
+        let field_float = Field::new("element", DataType::Float32, false);
+        match self {
+            MetadataType::Integer => DataType::Int32,
+            MetadataType::Float => DataType::Float32,
+            MetadataType::String => DataType::Utf8,
+            MetadataType::Boolean => DataType::Boolean,
+            MetadataType::Vector => DataType::List(field_float.into()),
+        }
+    }
+}
diff --git a/src/types/mod.rs b/src/types/mod.rs
index a91e7351..f42eb711 100644
--- a/src/types/mod.rs
+++ b/src/types/mod.rs
@@ -1 +1,7 @@
-pub mod error;
+mod error;
+mod file;
+mod metadata;
+
+pub use error::*;
+pub use file::*;
+pub use metadata::*;

From 1799a6831b46df1b2a5aef4a43c9981ad31d6ccc Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Wed, 26 Jun 2024 13:23:44 -0500
Subject: [PATCH 29/88] feat: improve code based on linter

---
 src/db/collection.rs  | 10 ++++++----
 src/db/database.rs    | 14 ++++++++------
 src/db/mod.rs         |  7 +++++--
 src/types/metadata.rs |  6 +++---
 4 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/src/db/collection.rs b/src/db/collection.rs
index 3a513e93..b81be05c 100644
--- a/src/db/collection.rs
+++ b/src/db/collection.rs
@@ -110,14 +110,16 @@ impl Collection {
 }
 
 impl StateMachine<CollectionState> for Collection {
-    fn initialize_state(path: &PathBuf) -> Result<CollectionState, Error> {
+    fn initialize_state(
+        path: impl Into<PathBuf>,
+    ) -> Result<CollectionState, Error> {
         let state = CollectionState::new();
-        FileOps::default().write_binary_file(path, &state)?;
+        FileOps::default().write_binary_file(&path.into(), &state)?;
         Ok(state)
     }
 
-    fn read_state(path: &PathBuf) -> Result<CollectionState, Error> {
-        FileOps::default().read_binary_file(path)
+    fn read_state(path: impl Into<PathBuf>) -> Result<CollectionState, Error> {
+        FileOps::default().read_binary_file(&path.into())
     }
 
     fn state(&self) -> Result<CollectionState, Error> {
diff --git a/src/db/database.rs
b/src/db/database.rs
index e376b70f..793744df 100644
--- a/src/db/database.rs
+++ b/src/db/database.rs
@@ -34,9 +34,9 @@ impl Database {
         let state = if !state_file.try_exists()? {
             // Creating a collection directory will create the root directory.
             fs::create_dir_all(&dirs.collections_dir)?;
-            Self::initialize_state(&state_file)?
+            Self::initialize_state(state_file)?
         } else {
-            Self::read_state(&state_file)?
+            Self::read_state(state_file)?
         };
 
         let state = Lock::new(state);
@@ -137,14 +137,16 @@ impl Database {
 }
 
 impl StateMachine<DatabaseState> for Database {
-    fn initialize_state(path: &PathBuf) -> Result<DatabaseState, Error> {
+    fn initialize_state(
+        path: impl Into<PathBuf>,
+    ) -> Result<DatabaseState, Error> {
         let state = DatabaseState::default();
-        FileOps::default().write_binary_file(path, &state)?;
+        FileOps::default().write_binary_file(&path.into(), &state)?;
        Ok(state)
     }
 
-    fn read_state(path: &PathBuf) -> Result<DatabaseState, Error> {
-        FileOps::default().read_binary_file(path)
+    fn read_state(path: impl Into<PathBuf>) -> Result<DatabaseState, Error> {
+        FileOps::default().read_binary_file(&path.into())
     }
 
     fn state(&self) -> Result<DatabaseState, Error> {
diff --git a/src/db/mod.rs b/src/db/mod.rs
index e87120f5..a36be43e 100644
--- a/src/db/mod.rs
+++ b/src/db/mod.rs
@@ -18,13 +18,16 @@ pub use database::*;
 
 /// A trait for objects that own a state that should be persisted to disk.
 /// - `T`: Type of the state object.
+///
+/// Please refer to the implementation of the StateMachine trait for
+/// Database and Collection for more details.
 pub trait StateMachine<T> {
     /// Initializes the state object and persists it to a file.
     /// This method should be called only once when the object is created.
-    fn initialize_state(path: &PathBuf) -> Result<T, Error>;
+    fn initialize_state(path: impl Into<PathBuf>) -> Result<T, Error>;
 
     /// Reads the state object from a file.
-    fn read_state(path: &PathBuf) -> Result<T, Error>;
+    fn read_state(path: impl Into<PathBuf>) -> Result<T, Error>;
 
     /// Returns a reference to the state object.
    fn state(&self) -> Result<T, Error>;
diff --git a/src/types/metadata.rs b/src/types/metadata.rs
index 99f47fa5..0a4edfab 100644
--- a/src/types/metadata.rs
+++ b/src/types/metadata.rs
@@ -38,10 +38,10 @@ impl From<String> for MetadataType {
     }
 }
 
-impl Into<DataType> for MetadataType {
-    fn into(self) -> DataType {
+impl From<MetadataType> for DataType {
+    fn from(value: MetadataType) -> Self {
         let field_float = Field::new("element", DataType::Float32, false);
-        match self {
+        match value {
             MetadataType::Integer => DataType::Int32,
             MetadataType::Float => DataType::Float32,
             MetadataType::String => DataType::Utf8,

From 67a15d6678ccf3a35900f20a2e7dbb3c0229bbca Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Wed, 26 Jun 2024 15:18:29 -0500
Subject: [PATCH 30/88] feat: add remove fields rpc

---
 protos/database.proto        |  7 ++++
 src/db/collection.rs         | 67 ++++++++++++++++++++++++++++++++++++
 src/db/database.rs           | 49 ++++++++++++--------------
 src/db/database_service.rs   |  9 +++++
 src/tests/mod.rs             | 15 +++-----
 src/tests/test_collection.rs | 22 ------------
 src/tests/test_database.rs   | 32 +++++++++++++----
 7 files changed, 135 insertions(+), 66 deletions(-)
 delete mode 100644 src/tests/test_collection.rs

diff --git a/protos/database.proto b/protos/database.proto
index 1a390ab5..6fc288da 100644
--- a/protos/database.proto
+++ b/protos/database.proto
@@ -8,6 +8,7 @@ service Database {
     rpc DeleteCollection(DeleteCollectionRequest)
         returns (google.protobuf.Empty);
     rpc AddFields(AddFieldsRequest) returns (google.protobuf.Empty);
+    rpc RemoveFields(RemoveFieldsRequest) returns (google.protobuf.Empty);
 }
 
// region CreateCollection
@@ -31,3 +32,9 @@
     string collection_name = 1;
     repeated Field fields = 2;
  }
+
+// region RemoveFields
+  message RemoveFieldsRequest {
+    string collection_name = 1;
+    repeated string field_names = 2;
+  }
diff --git a/src/db/collection.rs b/src/db/collection.rs
index b81be05c..3c2b2da2 100644
--- a/src/db/collection.rs
+++ b/src/db/collection.rs
@@ -1,6 +1,7 @@
 use super::*;
 use arrow::ipc::writer::FileWriter;
 use arrow::record_batch::RecordBatch;
+use regex::Regex;
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct CollectionState {
@@ -107,6 +108,72 @@ impl Collection {
         self.persist_state()?;
         Ok(())
     }
+
+    pub fn remove_fields(&self, field_names: &[String]) -> Result<(), Error> {
+        let mut state = self.state.write()?;
+        let schema = &state.schema;
+
+        // Just like adding fields, removing fields from a non-empty
+        // collection is not supported in OasysDB.
+        if state.count > 0 {
+            let code = ErrorCode::ClientError;
+            let message = "Unable to remove fields from a non-empty collection";
+            return Err(Error::new(&code, message));
+        }
+
+        // OasysDB has 2 default fields which can't be removed:
+        // internal_id and vector.
+        let default = ["internal_id", "vector"];
+        if field_names.iter().any(|name| default.contains(&name.as_str())) {
+            let code = ErrorCode::ClientError;
+            let message = "Unable to remove default fields";
+            return Err(Error::new(&code, message));
+        }
+
+        // Check if all the fields to be removed exist in the schema.
+        // Abort if any of the fields do not exist.
+        if field_names.iter().any(|name| schema.fields.find(name).is_none()) {
+            let code = ErrorCode::ClientError;
+            let message = "One or more fields do not exist in the schema.";
+            return Err(Error::new(&code, message));
+        }
+
+        let fields = schema
+            .all_fields()
+            .into_iter()
+            .filter(|field| !field_names.contains(field.name()))
+            .cloned()
+            .collect::<Vec<_>>();
+
+        // Create a new schema without the specified fields.
+        let new_schema = Schema::new(fields);
+
+        // Update the state and data.
+        state.schema = new_schema;
+        *state = state.clone();
+
+        drop(state);
+        self.persist_state()?;
+        Ok(())
+    }
+
+    pub fn validate_name(name: &str) -> Result<(), Error> {
+        if name.is_empty() {
+            let code = ErrorCode::ClientError;
+            let message = "Collection name cannot be empty";
+            return Err(Error::new(&code, message));
+        }
+
+        let re = Regex::new(r"^[a-z_]+$").unwrap();
+        if !re.is_match(name) {
+            let code = ErrorCode::ClientError;
+            let message = "Collection name must be lowercase letters \
+                with underscores.";
+            return Err(Error::new(&code, message));
+        }
+
+        Ok(())
+    }
 }
 
 impl StateMachine<CollectionState> for Collection {
diff --git a/src/db/database.rs b/src/db/database.rs
index 793744df..be17eb48 100644
--- a/src/db/database.rs
+++ b/src/db/database.rs
@@ -1,5 +1,4 @@
 use super::*;
-use regex::Regex;
 use uuid::Uuid;
 
 #[derive(Debug, Clone, Serialize, Deserialize, Default)]
@@ -49,7 +48,7 @@ impl Database {
 // We do this to make it easier to test the database logic.
 impl Database {
     pub fn _create_collection(&self, name: &str) -> Result<(), Error> {
-        Self::validate_collection_name(name)?;
+        Collection::validate_name(name)?;
 
         // Check if the collection already exists.
         let mut state = self.state.write()?;
@@ -102,37 +101,33 @@ impl Database {
         collection_name: &str,
         fields: impl Into<Fields>,
     ) -> Result<(), Error> {
-        let state = self.state.read()?;
-        let dir = match state.collection_refs.get(collection_name) {
-            Some(dir) => dir,
-            None => {
-                let code = ErrorCode::ClientError;
-                let message = format!("No collection name: {collection_name}");
-                return Err(Error::new(&code, &message));
-            }
-        };
-
-        let collection = Collection::open(dir.to_path_buf())?;
+        let dir = self.get_collection_dir(collection_name)?;
+        let collection = Collection::open(dir)?;
         collection.add_fields(fields)?;
         Ok(())
     }
 
-    fn validate_collection_name(name: &str) -> Result<(), Error> {
-        if name.is_empty() {
-            let code = ErrorCode::ClientError;
-            let message = "Collection name cannot be empty";
-            return Err(Error::new(&code, message));
-        }
+    pub fn _remove_fields(
+        &self,
+        collection_name: &str,
+        field_names: &[String],
+    ) -> Result<(), Error> {
+        let dir = self.get_collection_dir(collection_name)?;
+        let collection = Collection::open(dir)?;
+        collection.remove_fields(field_names)?;
+        Ok(())
+    }
 
-        let re = Regex::new(r"^[a-z_]+$").unwrap();
-        if !re.is_match(name) {
-            let code = ErrorCode::ClientError;
-            let message = "Collection name must be lowercase letters \
-                with underscores.";
-            return Err(Error::new(&code, message));
+    fn get_collection_dir(&self, name: &str) -> Result<PathBuf, Error> {
+        let state = self.state.read()?;
+        match state.collection_refs.get(name) {
+            Some(dir) => Ok(dir.clone()),
+            None => {
+                let code = ErrorCode::ClientError;
+                let message = format!("No collection name: {name}");
+                Err(Error::new(&code, &message))
+            }
         }
-
-        Ok(())
     }
 }
diff --git a/src/db/database_service.rs b/src/db/database_service.rs
index 3cba8726..50f091c8 100644
--- a/src/db/database_service.rs
+++ b/src/db/database_service.rs
@@ -41,4 +41,13 @@ impl ProtoDatabase for Database {
         self._add_fields(&request.collection_name, fields)?;
         Ok(Response::new(()))
     }
+
+    async fn remove_fields(
+        &self,
+        request: Request<RemoveFieldsRequest>,
+    ) -> Result<Response<()>, Status> {
+        let request = request.into_inner();
+        self._remove_fields(&request.collection_name, &request.field_names)?;
+        Ok(Response::new(()))
+    }
 }
diff --git a/src/tests/mod.rs b/src/tests/mod.rs
index 14a46758..42d9be86 100644
--- a/src/tests/mod.rs
+++
b/src/tests/mod.rs
@@ -4,7 +4,6 @@
 use arrow::datatypes::{DataType, Field};
 use std::fs;
 use std::path::PathBuf;
 
-mod test_collection;
 mod test_database;
 
 const TEST_DIR: &str = "odb_data";
@@ -25,14 +24,10 @@ fn create_test_database() -> Result<Database, Error> {
     // Create a test collection.
     db._create_collection(TEST_COLLECTION)?;
 
-    Ok(db)
-}
+    // Add a couple of fields to the collection.
+    let field_title = Field::new("title", DataType::Utf8, true);
+    let field_year = Field::new("year", DataType::Int32, true);
+    db._add_fields(TEST_COLLECTION, vec![field_title, field_year])?;
 
-fn get_test_collection() -> Result<Collection, Error> {
-    let db = create_test_database()?;
-    let collection_refs = db.state()?.collection_refs;
-
-    let directory = collection_refs[TEST_COLLECTION].to_path_buf();
-    let collection = Collection::open(directory)?;
-    Ok(collection)
+    Ok(db)
 }
diff --git a/src/tests/test_collection.rs b/src/tests/test_collection.rs
deleted file mode 100644
index db3859d8..00000000
--- a/src/tests/test_collection.rs
+++ /dev/null
@@ -1,22 +0,0 @@
-use super::*;
-
-#[test]
-fn test_collection_new() -> Result<(), Error> {
-    let collection = get_test_collection()?;
-    assert_eq!(collection.state()?.count, 0);
-    Ok(())
-}
-
-#[test]
-fn test_collection_add_field() -> Result<(), Error> {
-    let collection = get_test_collection()?;
-    let field = Field::new("id", DataType::Utf8, false);
-    collection.add_fields(vec![field])?;
-
-    // OasysDB has 2 default fields: internal_id and vector.
-    let schema = collection.state()?.schema;
-    assert_eq!(schema.fields().len(), 3);
-    assert_eq!(schema.field(2).name(), "id");
-
-    Ok(())
-}
diff --git a/src/tests/test_database.rs b/src/tests/test_database.rs
index a770b4c3..41cb8745 100644
--- a/src/tests/test_database.rs
+++ b/src/tests/test_database.rs
@@ -24,21 +24,39 @@
 #[test]
 fn test_database_add_fields() -> Result<(), Error> {
     let database = create_test_database()?;
-
     let state = database.state()?;
     let dir = &state.collection_refs[TEST_COLLECTION];
 
-    // The collection has 2 default fields.
-    let collection = Collection::open(dir.clone())?;
-    assert!(collection.state()?.schema.fields().len() == 2);
-
     let field = Field::new("id", DataType::Utf8, false);
     database._add_fields(TEST_COLLECTION, vec![field])?;
 
-    // The collection should have 3 fields now.
let collection = Collection::open(dir.clone())?; let schema = collection.state()?.schema; - assert!(schema.fields().len() == 3); + assert!(schema.fields().find("id").is_some()); + + Ok(()) +} + +#[test] +#[should_panic] +fn test_database_remove_default_fields() { + let database = create_test_database().unwrap(); + let fields = ["internal_id".to_string()]; + database._remove_fields(TEST_COLLECTION, &fields).unwrap(); +} + +#[test] +fn test_database_remove_fields() -> Result<(), Error> { + let database = create_test_database()?; + let state = database.state()?; + let dir = &state.collection_refs[TEST_COLLECTION]; + + let fields = ["title".to_string()]; + database._remove_fields(TEST_COLLECTION, &fields)?; + + let collection = Collection::open(dir.clone())?; + let schema = collection.state()?.schema; + assert!(schema.fields().find("title").is_none()); Ok(()) } From b4786fe801764563496acbc17a1f822b370c0430 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Wed, 26 Jun 2024 19:12:23 -0500 Subject: [PATCH 31/88] style: add prosewrap settings to prettier --- .editorconfig | 1 + .github/ISSUE_TEMPLATE/bug_report.md | 9 +- .github/ISSUE_TEMPLATE/do_chore.md | 6 +- .github/ISSUE_TEMPLATE/feature_request.md | 10 +- .prettierrc.yml | 3 +- docs/blog/posts/overhauling_oasysdb.md | 47 ++++-- docs/changelog.md | 190 ++++++++++++++++------ docs/code_of_conduct.md | 9 +- docs/contributing.md | 97 ++++++++--- docs/migrations/0.4.5_to_0.5.0.md | 53 ++++-- docs/pull_request_template.md | 18 +- docs/security.md | 9 +- 12 files changed, 336 insertions(+), 116 deletions(-) diff --git a/.editorconfig b/.editorconfig index a89941d8..4907336c 100644 --- a/.editorconfig +++ b/.editorconfig @@ -5,6 +5,7 @@ charset = utf-8 indent_style = space insert_final_newline = true trim_trailing_whitespace = true +max_line_length = 80 [*.{rs, py}] indent_size = 4 diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 13e795f6..52751fc9 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -8,11 +8,13 @@ assignees: "" ### Short description -What is the bug about? Please provide a clear and concise description of what the bug is. If applicable, include the error message that you received. +What is the bug about? Please provide a clear and concise description of what +the bug is. If applicable, include the error message that you received. ### Steps to reproduce -How can the bug be reproduced? Please provide a minimal set of steps to reproduce the behavior. +How can the bug be reproduced? Please provide a minimal set of steps to +reproduce the behavior. Example: @@ -29,4 +31,5 @@ How severe is the bug? Are you using OasysDB in production? ### Additional context -Add any other context about the problem here. For example, screenshots, screen recordings, or logs. +Add any other context about the problem here. For example, screenshots, screen +recordings, or logs. diff --git a/.github/ISSUE_TEMPLATE/do_chore.md b/.github/ISSUE_TEMPLATE/do_chore.md index 1041f509..ede33ed3 100644 --- a/.github/ISSUE_TEMPLATE/do_chore.md +++ b/.github/ISSUE_TEMPLATE/do_chore.md @@ -8,7 +8,8 @@ assignees: "" ### Description -What is the chore about? Please provide a clear and concise description of what the chore is. +What is the chore about? Please provide a clear and concise description of what +the chore is. Chore examples: @@ -18,4 +19,5 @@ Chore examples: ### Why is this beneficial? -Why do you think this chore is needed? 
Is it related to a problem that you encounter that are not related to a bug or a feature request?
+Why do you think this chore is needed? Is it related to a problem that you
+encounter that is not related to a bug or a feature request?
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
index 8b8546e1..1381813a 100644
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -8,14 +8,18 @@ assignees: ""
 
 ### Use case
 
-What's the use case for this feature? Is it related to a problem that you encounter? Please provide a clear and concise description of what the feature is about.
+What's the use case for this feature? Is it related to a problem that you
+encounter? Please provide a clear and concise description of what the feature is
+about.
 
 ### Proposed solution
 
-On the high level, how would you like the feature to be implemented? Why do you think this is the best solution?
+On the high level, how would you like the feature to be implemented? Why do you
+think this is the best solution?
 
 Do you have any alternative solutions?
 
 ### Additional context
 
-Add additional context about the feature request. Like, screenshots, links to related issues, or other relevant information.
+Add additional context about the feature request. For example, screenshots,
+links to related issues, or other relevant information.
diff --git a/.prettierrc.yml b/.prettierrc.yml
index ddcf0cad..1ab6bdbe 100644
--- a/.prettierrc.yml
+++ b/.prettierrc.yml
@@ -2,4 +2,5 @@ bracketSpacing: true
 singleQuote: false
 trailingComma: "none"
 semi: false
-editorConfig: true
+printWidth: 80
+proseWrap: "always"
diff --git a/docs/blog/posts/overhauling_oasysdb.md b/docs/blog/posts/overhauling_oasysdb.md
index 963427e7..26e905f3 100644
--- a/docs/blog/posts/overhauling_oasysdb.md
+++ b/docs/blog/posts/overhauling_oasysdb.md
@@ -10,20 +10,32 @@ categories:
 
 # DevLog #1: OasysDB Overhaul
 
-OasysDB is a project that I started in January of this year, and honestly, it has been an incredible learning experience. With it, I've gained quite extensive experience in databases, machine learning algorithms, and low-level programming concepts. But, with this knowledge, I realize that the current design of OasysDB is not enough for production use.
+OasysDB is a project that I started in January of this year, and honestly, it
+has been an incredible learning experience. With it, I've gained quite extensive
+experience in databases, machine learning algorithms, and low-level programming
+concepts. But, with this knowledge, I realize that the current design of OasysDB
+is not enough for production use.
 
-After careful consideration, I've decided to rewrite OasysDB from the ground up. The new version will be designed to incorporate all the essential features needed for a production-ready vector database system.
+After careful consideration, I've decided to rewrite OasysDB from the ground up.
+The new version will be designed to incorporate all the essential features
+needed for a production-ready vector database system.
 
 This includes, but is not limited to:
 
-- Transitioning from an embedded to a client-server model for better scalability and isolation.
-- Designing an efficient storage engine tailored for analytical production workloads.
-- Implementing concurrent query processing to improve throughput and reduce latency.
-- Utilizing advanced vector indexing algorithms for enhanced recall performance, especially in hybrid search scenarios.
-- Incorporating an industry-standard query planner and optimizer to enhance query performance. -- Enhancing documentation and testing to ensure the system's robustness and reliability. +- Transitioning from an embedded to a client-server model for better scalability + and isolation. +- Designing an efficient storage engine tailored for analytical production + workloads. +- Implementing concurrent query processing to improve throughput and reduce + latency. +- Utilizing advanced vector indexing algorithms for enhanced recall performance, + especially in hybrid search scenarios. +- Incorporating an industry-standard query planner and optimizer to enhance + query performance. +- Enhancing documentation and testing to ensure the system's robustness and + reliability. Here's a high-level overview of the new architecture: @@ -31,12 +43,23 @@ Here's a high-level overview of the new architecture: ## Progress Update -Today, I started working on the new version of OasysDB. I've established the project structure, implemented the foundational data structures for the collection and storage engine, and set up the initial framework for client-server communication. +Today, I started working on the new version of OasysDB. I've established the +project structure, implemented the foundational data structures for the +collection and storage engine, and set up the initial framework for +client-server communication. -I will be posting regular updates (once or twice a week) on my progress, which may include in-depth explorations of the system's technical aspects. If you want to follow along with the development process, you can find the project on GitHub: [OasysDB](https://github.com/oasysai/oasysdb). +I will be posting regular updates (once or twice a week) on my progress, which +may include in-depth explorations of the system's technical aspects. If you want +to follow along with the development process, you can find the project on +GitHub: [OasysDB](https://github.com/oasysai/oasysdb). ## Conclusion -I'm really excited about the potential of the new OasysDB and the challenges that lie ahead. I believe this overhaul will lead to a robust and scalable vector database system perfect for a wide range of AI applications. +I'm really excited about the potential of the new OasysDB and the challenges +that lie ahead. I believe this overhaul will lead to a robust and scalable +vector database system perfect for a wide range of AI applications. -If you're into databases and AI, I encourage you to follow along with the development process as I share my insights, challenges, and victories in this DevLog series. If you have experience in this field, your feedback and suggestions would be greatly appreciated. +If you're into databases and AI, I encourage you to follow along with the +development process as I share my insights, challenges, and victories in this +DevLog series. If you have experience in this field, your feedback and +suggestions would be greatly appreciated. diff --git a/docs/changelog.md b/docs/changelog.md index 512a9be4..76abed82 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -4,9 +4,18 @@ ### What's Changed -- Add support for boolean metadata type. This allows full compatibility with JSON-like object or dictionary metadata when storing vector records in the collection. -- We optimize the database save and get collection operations performance by 10-20% by reducing the number of IO operations. 
Also, the save collection operation is now atomic which means that the collection is saved to the disk only when the operation is completed successfully. -- We launch our own documentation website at [docs.oasysdb.com](https://docs.oasysdb.com) to provide a better user experience and more comprehensive documentation for the OasysDB library. It's still a work in progress and we will continue to improve the documentation over time. +- Add support for boolean metadata type. This allows full compatibility with + JSON-like object or dictionary metadata when storing vector records in the + collection. +- We optimize the database save and get collection operations performance by + 10-20% by reducing the number of IO operations. Also, the save collection + operation is now atomic which means that the collection is saved to the disk + only when the operation is completed successfully. +- We launch our own documentation website at + [docs.oasysdb.com](https://docs.oasysdb.com) to provide a better user + experience and more comprehensive documentation for the OasysDB library. It's + still a work in progress and we will continue to improve the documentation + over time. ### Contributors @@ -20,15 +29,23 @@ ### What's Changed -- **CONDITIONAL BREAKING CHANGE**: We remove support for dot distance metric and we replace cosine similarity with cosine distance metric. This change is made to make the distance metric consistent with the other distance metrics. -- The default configuration for the collection (EF Construction and EF Search) is increased to a more sensible value according to the common real-world use cases. The default EF Construction is set to 128 and the default EF Search is set to 64. -- We add a new script to measure the recall rate of the collection search functionality. And with this, we improve the search recall rate of OasysDB to match the recall rate of HNSWLib with the same configuration. +- **CONDITIONAL BREAKING CHANGE**: We remove support for dot distance metric and + we replace cosine similarity with cosine distance metric. This change is made + to make the distance metric consistent with the other distance metrics. +- The default configuration for the collection (EF Construction and EF Search) + is increased to a more sensible value according to the common real-world use + cases. The default EF Construction is set to 128 and the default EF Search is + set to 64. +- We add a new script to measure the recall rate of the collection search + functionality. And with this, we improve the search recall rate of OasysDB to + match the recall rate of HNSWLib with the same configuration. ```sh cargo run --example measure-recall ``` -- We add a new benchmark to measure the performance of saving and getting the collection. The benchmark can be run by running the command below. +- We add a new benchmark to measure the performance of saving and getting the + collection. The benchmark can be run by running the command below. ```sh cargo bench @@ -46,16 +63,22 @@ cargo bench ### What's Changed -We add a new method `Collection.filter` to filter the vector records based on the metadata. This method returns a HashMap of the filtered vector records and their corresponding vector IDs. This implementation performs a linear search through the collection and thus might be slow for large datasets. +We add a new method `Collection.filter` to filter the vector records based on +the metadata. This method returns a HashMap of the filtered vector records and +their corresponding vector IDs. 
This implementation performs a linear search +through the collection and thus might be slow for large datasets. This implementation includes support for the following metadata to filter: - `String`: Stored value must include the filter string. - `Float`: Stored value must be equal to the filter float. - `Integer`: Stored value must be equal to the filter integer. -- `Object`: Stored value must match all the key-value pairs in the filter object. +- `Object`: Stored value must match all the key-value pairs in the filter + object. -We currently don't support filtering based on the array type metadata because I am not sure of the best way to implement it. If you have any suggestions, please let me know. +We currently don't support filtering based on the array type metadata because I +am not sure of the best way to implement it. If you have any suggestions, please +let me know. ### Contributors @@ -69,9 +92,16 @@ We currently don't support filtering based on the array type metadata because I ### What's Changed -- **BREAKING CHANGE**: Although there is no change in the database API, the underlying storage format has been changed to save the collection data to dedicated files directly. The details of the new persistent system and how to migrate from v0.4.x to v0.5.0 can be found in this [migration guide](migrations/0.4.5_to_0.5.0.md). +- **BREAKING CHANGE**: Although there is no change in the database API, the + underlying storage format has been changed to save the collection data to + dedicated files directly. The details of the new persistent system and how to + migrate from v0.4.x to v0.5.0 can be found in this + [migration guide](migrations/0.4.5_to_0.5.0.md). -- By adding the feature `gen`, you can now use the `EmbeddingModel` trait and OpenAI's embedding models to generate vectors or records from text without external dependencies. This feature is optional and can be enabled by adding the feature to the `Cargo.toml` file. +- By adding the feature `gen`, you can now use the `EmbeddingModel` trait and + OpenAI's embedding models to generate vectors or records from text without + external dependencies. This feature is optional and can be enabled by adding + the feature to the `Cargo.toml` file. ```toml [dependencies] @@ -90,8 +120,13 @@ oasysdb = { version = "0.5.0", features = ["gen"] } ### What's Changed -- Add insert benchmark to measure the performance of inserting vectors into the collection. The benchmark can be run using the `cargo bench` command. -- Fix the issue with large-size dirty IO buffers caused by the database operation. This issue is fixed by flushing the dirty IO buffers after the operation is completed. This operation can be done synchronously or asynchronously based on the user's preference since this operation might take some time to complete. +- Add insert benchmark to measure the performance of inserting vectors into the + collection. The benchmark can be run using the `cargo bench` command. +- Fix the issue with large-size dirty IO buffers caused by the database + operation. This issue is fixed by flushing the dirty IO buffers after the + operation is completed. This operation can be done synchronously or + asynchronously based on the user's preference since this operation might take + some time to complete. 
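For readers wondering what the synchronous versus asynchronous flush looks like from the caller's side, here is a hedged sketch using sled, the storage engine OasysDB relied on for persistence at the time; the key and value are placeholders, and the exact calls in the release may differ:

```rust
use sled::Db;

// Persist a value, then block until the dirty buffers are on disk.
fn save_blocking(db: &Db) -> sled::Result<()> {
    db.insert(b"collection", b"serialized collection bytes".to_vec())?;
    db.flush()?; // Synchronous: waits for durability.
    Ok(())
}

// Same write, but await the flush so the caller is not blocked
// while the potentially slow operation runs.
async fn save_nonblocking(db: &Db) -> sled::Result<()> {
    db.insert(b"collection", b"serialized collection bytes".to_vec())?;
    db.flush_async().await?; // Asynchronous: durability without blocking.
    Ok(())
}
```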
### Contributors @@ -105,16 +140,26 @@ oasysdb = { version = "0.5.0", features = ["gen"] } ### What's Changed -- Maximize compatibility with the standard library error types to allow users to convert OasysDB errors to most commonly used error handling libraries such as `anyhow`, `thiserror`, etc. -- Add conversion methods to convert metadata to JSON value by `serde_json` and vice versa. This allows users to store JSON format metadata easily. -- Add normalized cosine distance metric to the collection search functionality. Read more about the normalized cosine distance metric here. -- Fix the search distance calculation to use the correct distance metric and sort it accordingly based on the collection configuration. -- Add vector ID utility methods to the `VectorID` struct to make it easier to work with the vector ID. +- Maximize compatibility with the standard library error types to allow users to + convert OasysDB errors to most commonly used error handling libraries such as + `anyhow`, `thiserror`, etc. +- Add conversion methods to convert metadata to JSON value by `serde_json` and + vice versa. This allows users to store JSON format metadata easily. +- Add normalized cosine distance metric to the collection search functionality. + Read more about the normalized cosine distance metric here. +- Fix the search distance calculation to use the correct distance metric and + sort it accordingly based on the collection configuration. +- Add vector ID utility methods to the `VectorID` struct to make it easier to + work with the vector ID. ### Additional Notes -- Add a new benchmark to measure the true search AKA brute-force search performance of the collection. If possible, dealing with a small dataset, it is recommended to use the true search method for better accuracy. The benchmark can be run using the `cargo bench` command. -- Improve the documentation to include more examples and explanations on how to use the library: Comprehensive Guide. +- Add a new benchmark to measure the true search AKA brute-force search + performance of the collection. If possible, dealing with a small dataset, it + is recommended to use the true search method for better accuracy. The + benchmark can be run using the `cargo bench` command. +- Improve the documentation to include more examples and explanations on how to + use the library: Comprehensive Guide. ### Contributors @@ -128,9 +173,15 @@ oasysdb = { version = "0.5.0", features = ["gen"] } ### What's Changed -- Add SIMD acceleration to calculate the distance between vectors. This improves the performance of inserting and searching vectors in the collection. -- Improve OasysDB native error type implementation to include the type/kind of error that occurred in addition to the error message. For example, `ErrorKind::CollectionError` is used to represent errors that occur during collection operations. -- Fix the `Config.ml` default value from 0.3 to 0.2885 which is the optimal value for the HNSW with M of 32. The optimal value formula for ml is `1/ln(M)`. +- Add SIMD acceleration to calculate the distance between vectors. This improves + the performance of inserting and searching vectors in the collection. +- Improve OasysDB native error type implementation to include the type/kind of + error that occurred in addition to the error message. For example, + `ErrorKind::CollectionError` is used to represent errors that occur during + collection operations. +- Fix the `Config.ml` default value from 0.3 to 0.2885 which is the optimal + value for the HNSW with M of 32. 
The optimal value formula for ml is
+  `1/ln(M)`.
 
 ### Contributors
 
@@ -144,9 +195,14 @@
 
 ### What's Changed
 
-Due to an issue (#62) with the Python release of v0.4.1, this patch version is released to fix the build wheels for Python users. The issue is caused due to the new optional PyO3 feature for the v0.4.1 Rust crate release which exclude PyO3 dependencies from the build process. To solve this, the Python package build and deploy script now includes `--features py` argument.
+Due to an issue (#62) with the Python release of v0.4.1, this patch version is
+released to fix the build wheels for Python users. The issue is caused due to
+the new optional PyO3 feature for the v0.4.1 Rust crate release which excludes
+PyO3 dependencies from the build process. To solve this, the Python package
+build and deploy script now includes the `--features py` argument.
 
-For Rust users, this version doesn't offer any additional features or functionality compared to v0.4.1 release.
+For Rust users, this version doesn't offer any additional features or
+functionality compared to the v0.4.1 release.
 
 ### Full Changelog
 
@@ -157,9 +213,13 @@
 
 ### What's Changed
 
 - Added quality of life improvements to the `VectorID` type interoperability.
-- Improved the `README.md` file with additional data points on the database performance.
-- Changed to `Collection.insert` method to return the new `VectorID` after inserting a new vector record.
-- Pyo3 dependencies are now hidden behind the `py` feature. This allows users to build the library without the Python bindings if they don't need it, which is probably all of them.
+- Improved the `README.md` file with additional data points on the database
+  performance.
+- Changed the `Collection.insert` method to return the new `VectorID` after
+  inserting a new vector record.
+- Pyo3 dependencies are now hidden behind the `py` feature. This allows users to
+  build the library without the Python bindings if they don't need it, which is
+  probably all of them.
 
 ### Contributors
 
@@ -175,7 +235,12 @@
 
 ### What's Changed
 
-- **CONDITIONAL BREAKING CHANGE**: Add an option to configure distance for the vector collection via `Config` struct. The new field `distance` can be set using the `Distance` enum. This includes Euclidean, Cosine, and Dot distance metrics. The default distance metric is Euclidean. This change is backward compatible if you are creating a config using the `Config::default()` method. Otherwise, you need to update the config to include the distance metric.
+- **CONDITIONAL BREAKING CHANGE**: Add an option to configure distance for the
+  vector collection via `Config` struct. The new field `distance` can be set
+  using the `Distance` enum. This includes Euclidean, Cosine, and Dot distance
+  metrics. The default distance metric is Euclidean. This change is backward
+  compatible if you are creating a config using the `Config::default()` method.
+  Otherwise, you need to update the config to include the distance metric.
 
 ```rs
 let config = Config {
     ...,
     distance: Distance::Cosine,
 };
 ```
 
-- With the new distance metric feature, now, you can set a `relevancy` threshold for the search results. This will filter out the results that are below or above the threshold depending on the distance metric used. This feature is disabled by default which is set to -1.0.
To enable this feature, you can set the `relevancy` field in the `Collection` struct.
+- With the new distance metric feature, now, you can set a `relevancy` threshold
+  for the search results. This will filter out the results that are below or
+  above the threshold depending on the distance metric used. This feature is
+  disabled by default which is set to -1.0. To enable this feature, you can set
+  the `relevancy` field in the `Collection` struct.
 
 ```rs
 ...
 let mut collection = Collection::new(&config)?;
 collection.relevancy = 3.0;
 ```
 
-- Add a new method `Collection::insert_many` to insert multiple vector records into the collection at once. This method is more optimized than using the `Collection::insert` method in a loop.
+- Add a new method `Collection::insert_many` to insert multiple vector records
+  into the collection at once. This method is more optimized than using the
+  `Collection::insert` method in a loop.
 
 ### Contributors
 
@@ -205,11 +276,17 @@
 
 ## v0.3.0
 
-This release introduces a BREAKING CHANGE to one of the method from the `Database` struct. The `Database::create_collection` method has been removed from the library due to redundancy. The `Database::save_collection` method can be used to create a new collection or update an existing one. This change is made to simplify the API and to make it more consistent with the other methods in the `Database` struct.
+This release introduces a BREAKING CHANGE to one of the methods of the
+`Database` struct. The `Database::create_collection` method has been removed
+from the library due to redundancy. The `Database::save_collection` method can
+be used to create a new collection or update an existing one. This change is
+made to simplify the API and to make it more consistent with the other methods
+in the `Database` struct.
 
 ### What's Changed
 
-- **BREAKING CHANGE**: Removed the `Database::create_collection` method from the library. To replace this, you can use the code snippet below:
+- **BREAKING CHANGE**: Removed the `Database::create_collection` method from the
+  library. To replace this, you can use the code snippet below:
 
 ```rs
 // Before: this creates a new empty collection.
@@ -221,8 +298,11 @@
 let collection = Collection::build(&config, &records)?;
 db.save_collection("vectors", &collection)?;
 ```
 
-- Added the `Collection::list` method to list all the vector records in the collection.
-- Created a full Python binding for OasysDB which is available on PyPI. This allows you to use OasysDB directly from Python. The Python binding is available at https://pypi.org/project/oasysdb.
+- Added the `Collection::list` method to list all the vector records in the
+  collection.
+- Created a full Python binding for OasysDB which is available on PyPI. This
+  allows you to use OasysDB directly from Python. The Python binding is
+  available at https://pypi.org/project/oasysdb.
 
 ### Contributors
 
@@ -238,8 +318,12 @@
 
 ### What's Changed
 
-- `Metadata` enum can now be accessed publicly using `oasysdb::metadata::Metadata`. This allows users to use `match` statements to extract the data from it.
+- `Metadata` enum can now be accessed publicly using
+  `oasysdb::metadata::Metadata`. This allows users to use `match` statements to
+  extract the data from it.
+- Added a `prelude` module that re-exports the most commonly used types and
+  traits. This makes it easier to use the library by importing the prelude
+  module by `use oasysdb::prelude::*`.
 
 ### Contributors
 
@@ -253,10 +337,20 @@
 
 ### What's Changed
 
-- For `Collection` struct, the generic parameter `D` has been replaced with `Metadata` enum which allows one collection to store different types of data as needed.
-- The `Vector` now uses `Vec<f32>` instead of `[f32; N]` which removes the `N` generic parameter from the `Vector` struct. Since there is a chance of using different vector dimensions in the same collection with this change, an additional functionality is added to the `Collection` to make sure that the vector dimension is uniform.
-- The `M` generic parameter in the `Collection` struct has been replaced with a constant of 32. This removes the flexibility to tweak the indexing configuration for this value. But for most use cases, this value should be sufficient.
-- Added multiple utility functions to structs such as `Record`, `Vector`, and `Collection` to make it easier to work with the data.
+- For `Collection` struct, the generic parameter `D` has been replaced with
+  `Metadata` enum which allows one collection to store different types of data
+  as needed.
+- The `Vector` now uses `Vec<f32>` instead of `[f32; N]` which removes the `N`
+  generic parameter from the `Vector` struct. Since there is a chance of using
+  different vector dimensions in the same collection with this change, an
+  additional functionality is added to the `Collection` to make sure that the
+  vector dimension is uniform.
+- The `M` generic parameter in the `Collection` struct has been replaced with a
+  constant of 32. This removes the flexibility to tweak the indexing
+  configuration for this value. But for most use cases, this value should be
+  sufficient.
+- Added multiple utility functions to structs such as `Record`, `Vector`, and
+  `Collection` to make it easier to work with the data.
 
 ### Contributors
 
@@ -270,10 +364,14 @@
 
 ### What's Changed
 
-- OasysDB release as an embedded vector database available directly via `cargo add oasysdb` command.
-- Using HNSW algorithm implementation for the collection indexing along with Euclidean distance metrics.
-- Incremental updates on the vector collections allowing inserts, deletes, and modifications without rebuilding the index.
-- Add a benchmark on the collection search functionality using SIFT dataset that can be run using `cargo bench` command.
+- OasysDB release as an embedded vector database available directly via
+  `cargo add oasysdb` command.
+- Using HNSW algorithm implementation for the collection indexing along with
+  Euclidean distance metrics.
+- Incremental updates on the vector collections allowing inserts, deletes, and
+  modifications without rebuilding the index.
+- Add a benchmark on the collection search functionality using SIFT dataset that
+  can be run using `cargo bench` command.
 
 ### Contributors
 
diff --git a/docs/code_of_conduct.md b/docs/code_of_conduct.md
index 4d57c03a..66f76607 100644
--- a/docs/code_of_conduct.md
+++ b/docs/code_of_conduct.md
@@ -59,8 +59,9 @@ representative at an online or offline event.
 
 ## Enforcement
 
 Instances of abusive, harassing, or otherwise unacceptable behavior may be
-reported to the community leaders responsible for enforcement at edwin@oasysai.com.
-All complaints will be reviewed and investigated promptly and fairly.
+reported to the community leaders responsible for enforcement at +edwin@oasysai.com. All complaints will be reviewed and investigated promptly and +fairly. All community leaders are obligated to respect the privacy and security of the reporter of any incident. @@ -117,8 +118,8 @@ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 2.1, available at [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. -Community Impact Guidelines were inspired by -[Mozilla's code of conduct enforcement ladder][Mozilla CoC]. +Community Impact Guidelines were inspired by [Mozilla's code of conduct +enforcement ladder][Mozilla CoC]. For answers to common questions about this code of conduct, see the FAQ at [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at diff --git a/docs/contributing.md b/docs/contributing.md index 8c16ec57..02d4e4a3 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -1,63 +1,108 @@ # Contributing to OasysDB -First of all, thank you for considering contributing to OasysDB! We welcome contributions from the community, and this document outlines the process for contributing to our project. +First of all, thank you for considering contributing to OasysDB! We welcome +contributions from the community, and this document outlines the process for +contributing to our project. ## Code of Conduct -We are committed to building an inclusive and welcoming community. We believe that it will lead to a more successful project and a better experience for everyone involved. To achieve that, any participant in our project is expected to act respectfully and to follow the [Code of Conduct](code_of_conduct.md). +We are committed to building an inclusive and welcoming community. We believe +that it will lead to a more successful project and a better experience for +everyone involved. To achieve that, any participant in our project is expected +to act respectfully and to follow the [Code of Conduct](code_of_conduct.md). ## Have questions or suggestions? [![Discord](https://img.shields.io/discord/1182432298382131200?logo=discord&logoColor=%23ffffff&label=Discord&labelColor=%235865F2&style=for-the-badge)](https://discord.gg/bDhQrkqNP4) -There is no such thing as a stupid question. If you have a question, chances are someone else does too. We encourage you to ask questions on our [Discord](https://discord.gg/bDhQrkqNP4) server. Alternatively, you can open a discussion on [GitHub Discussions](https://github.com/oasysai/oasysdb/discussions) with your question or suggestion. +There is no such thing as a stupid question. If you have a question, chances are +someone else does too. We encourage you to ask questions on our +[Discord](https://discord.gg/bDhQrkqNP4) server. Alternatively, you can open a +discussion on +[GitHub Discussions](https://github.com/oasysai/oasysdb/discussions) with your +question or suggestion. ## Encounter a bug? Have a feature request? -If you encounter a bug or have a feature request, please open an issue on [GitHub Issues](https://github.com/oasysai/oasysdb/issues). Please include as much information as possible in your issue. This includes: +If you encounter a bug or have a feature request, please open an issue on +[GitHub Issues](https://github.com/oasysai/oasysdb/issues). Please include as +much information as possible in your issue. This includes: - A description of the bug or feature request. -- If it's a bug, steps to reproduce the bug. 
If it's a feature request, include the use case and expected behavior of the feature. +- If it's a bug, steps to reproduce the bug. If it's a feature request, include + the use case and expected behavior of the feature. - Screenshots or screen recording, if applicable. ## Want to contribute code? -**TLDR: Check and open an issue first before forking the repository and submitting a pull request.** +**TLDR: Check and open an issue first before forking the repository and +submitting a pull request.** -Before you start working on a pull request, we encourage you to check out the existing issues and pull requests to make sure that -the feature you want to work on is in our roadmap and is aligned with the project's vision. After all, we don't want you to waste your precious time! +Before you start working on a pull request, we encourage you to check out the +existing issues and pull requests to make sure that the feature you want to work +on is in our roadmap and is aligned with the project's vision. After all, we +don't want you to waste your precious time! -We try to prioritize features and bug fixes that are on our roadmap or requested a lot by the community. If you want to work on a feature or bug fix that isn't already in the issue tracker, please open an issue first to discuss it with the community. +We try to prioritize features and bug fixes that are on our roadmap or requested +a lot by the community. If you want to work on a feature or bug fix that isn't +already in the issue tracker, please open an issue first to discuss it with the +community. -For features, we try to prioritize features that are backed by real-world use cases. If you have a use case for a feature, please include it in the issue. We'd love to hear about it! +For features, we try to prioritize features that are backed by real-world use +cases. If you have a use case for a feature, please include it in the issue. +We'd love to hear about it! ## Getting started Getting started with OasysDB development is pretty straightforward. -First, you will need to have Rust installed on your machine. We recommend using [rustup](https://www.rust-lang.org/tools/install) to install Rust. We also recommend having rust-analyzer installed for your code editor for a better development experience. - -OasysDB utilizes many third-party crates to provide its functionality. These are some of the most important ones and the resources you can use to learn more about them: - -- [**Apache Arrow**](https://arrow.apache.org): Arrow is a cross-language development platform for in-memory columnar data format for efficient analytic operations. -- [**Rayon**](https://github.com/rayon-rs/rayon): Rayon is a data parallelism library for Rust that provides a simple and efficient API for parallelizing computation. -- [**Tonic**](https://github.com/hyperium/tonic): Tonic is a gRPC over HTTP/2 implementation focused on high performance and flexibility built on top of the Tokio asynchronous runtime. +First, you will need to have Rust installed on your machine. We recommend using +[rustup](https://www.rust-lang.org/tools/install) to install Rust. We also +recommend having rust-analyzer installed for your code editor for a better +development experience. + +OasysDB utilizes many third-party crates to provide its functionality. 
These are
+some of the most important ones and the resources you can use to learn more
+about them:
 
-- [**Apache Arrow**](https://arrow.apache.org): Arrow is a cross-language development platform for in-memory columnar data format for efficient analytic operations.
-- [**Rayon**](https://github.com/rayon-rs/rayon): Rayon is a data parallelism library for Rust that provides a simple and efficient API for parallelizing computation.
-- [**Tonic**](https://github.com/hyperium/tonic): Tonic is a gRPC over HTTP/2 implementation focused on high performance and flexibility built on top of the Tokio asynchronous runtime.
+- [**Apache Arrow**](https://arrow.apache.org): Arrow is a cross-language
+  development platform for in-memory columnar data format for efficient analytic
+  operations.
+- [**Rayon**](https://github.com/rayon-rs/rayon): Rayon is a data parallelism
+  library for Rust that provides a simple and efficient API for parallelizing
+  computation.
+- [**Tonic**](https://github.com/hyperium/tonic): Tonic is a gRPC over HTTP/2
+  implementation focused on high performance and flexibility built on top of the
+  Tokio asynchronous runtime.
 
 TODO: Complete the getting started guide.
 
 ## Style guide
 
-We mostly use the default linting and style guide for Rust except for some linting changes listed in rustfmt.toml file. For more information about the code style, see the [Rust Style Guide](https://doc.rust-lang.org/beta/style-guide/index.html).
+We mostly use the default linting and style guide for Rust except for some
+linting changes listed in rustfmt.toml file. For more information about the code
+style, see the
+[Rust Style Guide](https://doc.rust-lang.org/beta/style-guide/index.html).
 
-For commit messages, we use the [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/) format. This allows us to maintain consistency and readability in our Git commit history.
+For commit messages, we use the
+[Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/) format.
+This allows us to maintain consistency and readability in our Git commit
+history.
 
-When commenting your code, please try your best to write comments that are clear and concise with proper English sentence capitalization and punctuation. This will help us and the community understand your code better and keep the codebase maintainable.
+When commenting your code, please try your best to write comments that are clear
+and concise with proper English sentence capitalization and punctuation. This
+will help us and the community understand your code better and keep the codebase
+maintainable.
 
 ## Submitting a pull request
 
-Once you have made your changes, you can submit a pull request. We will review your pull request and provide feedback. If your pull request is accepted, we will merge it into the main branch.
+Once you have made your changes, you can submit a pull request. We will review
+your pull request and provide feedback. If your pull request is accepted, we
+will merge it into the main branch.
 
-For organization purposes, we ask that you use the [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/) format for your pull request title in lowercase:
+For organization purposes, we ask that you use the
+[Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/) format
+for your pull request title in lowercase:
 
 ```
 <type>: <description>
 ```
 
@@ -72,7 +117,9 @@
 fix: fix issue ...
 ```
 
 ## Conclusion
 
-Thank you for taking the time to read this documentation. We look forward to your contributions! Another way to support this project is to star this project, share it with your circles, and join us on [Discord](https://discord.gg/bDhQrkqNP4).
+Thank you for taking the time to read this documentation. We look forward to
+your contributions! Another way to support this project is to star this project,
+share it with your circles, and join us on
+[Discord](https://discord.gg/bDhQrkqNP4).
 
-Best regards,
-Edwin Kys
+Best regards,<br>
Edwin Kys
diff --git a/docs/migrations/0.4.5_to_0.5.0.md b/docs/migrations/0.4.5_to_0.5.0.md
index 607fefa4..b0d2bc3b 100644
--- a/docs/migrations/0.4.5_to_0.5.0.md
+++ b/docs/migrations/0.4.5_to_0.5.0.md
@@ -1,26 +1,46 @@
 # Migrating from v0.4.5 to v0.5.0
 
-Due to the breaking changes introduced in v0.5.0 on the persistence system, you might need to update your codebase to make it compatible with the new version. This is not required if you are starting a new project from scratch.
+Due to the breaking changes introduced in v0.5.0 on the persistence system, you
+might need to update your codebase to make it compatible with the new version.
+This is not required if you are starting a new project from scratch.
 
 ### What happened?
 
-In v0.5.0, we introduced a new persistence system that is more optimized for rapidly changing data. Previously, we were using Sled to store the serialized collection blobs. We found that it was not the best option for our use case as each blob size could be somewhere in between 100MB to 10GB.
+In v0.5.0, we introduced a new persistence system that is more optimized for
+rapidly changing data. Previously, we were using Sled to store the serialized
+collection blobs. We found that it was not the best option for our use case as
+each blob size could be somewhere between 100MB and 10GB.
 
-When the data change rapidly, the collections need to be saved periodically to avoid data loss. With this, the collections need to be reserialized and rewritten back into Sled. The dirty IO buffer during these operations caused some storage issues, bloating the space required to store the collection for up to 100x the collection size.
+When the data changes rapidly, the collections need to be saved periodically to
+avoid data loss. With this, the collections need to be reserialized and
+rewritten back into Sled. The dirty IO buffer during these operations caused
+some storage issues, bloating the space required to store the collection for up
+to 100x the collection size.
 
-This new system is more optimized for our use case since we now write the serialized collection data directly to a dedicated file on the disk. Now, we only use Sled for storing the collection metadata and the path to where the collection is stored.
+This new system is more optimized for our use case since we now write the
+serialized collection data directly to a dedicated file on the disk. Now, we
+only use Sled for storing the collection metadata and the path to where the
+collection is stored.
 
 ## How to migrate?
 
-To migrate OasysDB from v0.4.5 to v0.5.0, I recommend creating a new Rust project and migrating the database from there. This migration project will read the data from the old database and write them to the new database. And for that, this project need to have access to the database files.
+To migrate OasysDB from v0.4.5 to v0.5.0, I recommend creating a new Rust
+project and migrating the database from there. This migration project will read
+the data from the old database and write them to the new database. And for that,
+this project needs to have access to the database files.
 
-If you are using OasysDB on Python, you might want to use Rust to migrate the database as it supports installing both versions of OasysDB on the same project easily which is required for the migration. I can promise you that the migration process is quite simple and straightforward.
+If you are using OasysDB from Python, you might still want to use Rust to
+migrate the database: Cargo makes it easy to install both versions of OasysDB
+in the same project, which the migration requires. I can promise you that the
+migration process is quite simple and straightforward.

-**Friendly Reminder**: Make sure to create a back-up of your database files before proceeding 😉
+**Friendly Reminder**: Make sure to create a back-up of your database files
+before proceeding 😉

 ### 1. Install both versions of OasysDB

-After setting up the new project, you can install both versions of OasysDB by specifying the package and the version in the `Cargo.toml` file.
+After setting up the new project, you can install both versions of OasysDB by
+specifying the package and the version in the `Cargo.toml` file.

 ```toml
 [dependencies]
@@ -30,7 +50,8 @@ odb5 = { package = "oasysdb", version = "0.5.0" }

 ### 2. Migrate the database

-The following script will read the collections from the old database and write them to the new database which is all we need to do to migrate the database.
+The following script will read the collections from the old database and write
+them to the new database, which is all we need to do to migrate the database.

 ```rust
 use odb4::prelude::Database;
@@ -55,12 +76,20 @@ fn main() {

 ### 3. Verify the migration

-After running the script, you can verify the migration by checking the new database files. The new database path should contain a sub-directory called `collections` which stores the serialized collection data. The number of files in this directory should be equal to the number of collections you migrated.
+After running the script, you can verify the migration by checking the new
+database files. The new database path should contain a sub-directory called
+`collections` which stores the serialized collection data. The number of files
+in this directory should be equal to the number of collections you migrated.

-Don't forget to point your application to the new database path after the migration or rename the new database path to the old database path to make sure that your application uses the new database correctly.
+After the migration, don't forget to point your application to the new database
+path, or rename the new path to the old one, so that your application uses the
+new database correctly.

 ## Conclusion

-If all the steps are followed correctly, you should have successfully migrated your OasysDB database from v0.4.5 to v0.5.0. If you face any issues during the migration, feel free to reach out to me on our [Discord](https://discord.gg/bDhQrkqNP4).
+If all the steps are followed correctly, you should have successfully migrated
+your OasysDB database from v0.4.5 to v0.5.0. If you face any issues during the
+migration, feel free to reach out to me on our
+[Discord](https://discord.gg/bDhQrkqNP4).

 I will be happy to personally assist you with the migration process 😁
diff --git a/docs/pull_request_template.md b/docs/pull_request_template.md
index 2607dcab..7502d673 100644
--- a/docs/pull_request_template.md
+++ b/docs/pull_request_template.md
@@ -1,22 +1,28 @@
 ### Purpose

-Describe the problem solved or feature added by this PR. In addition, please provide a link to the issue this PR is related to, if applicable.
+Describe the problem solved or feature added by this PR. In addition, please
+provide a link to the issue this PR is related to, if applicable.

 ### Approach

-How does this PR solve the problem or add the feature? What are the major changes in the codebase introduced by this PR?
+How does this PR solve the problem or add the feature? What are the major
+changes in the codebase introduced by this PR?

 ### Testing

 - [ ] I have tested this PR locally.
-- [ ] I added tests to cover my changes, if not applicable, I have added a reason why.
+- [ ] I added tests to cover my changes; if not applicable, I have added a
+      reason why.

-How did you test this PR? Please provide a description of the tests that you ran to verify your changes.
+How did you test this PR? Please provide a description of the tests that you ran
+to verify your changes.

-How should the reviewer test this PR? If applicable, have you added tests to cover your changes?
+How should the reviewer test this PR? If applicable, have you added tests to
+cover your changes?

 ### Chore checklist

 - [ ] I formatted my code according to the style and linter guidelines.
 - [ ] If applicable, I updated the documentation accordingly.
-- [ ] I added comments to most of my code, particularly in hard-to-understand areas.
+- [ ] I added comments to most of my code, particularly in hard-to-understand
+      areas.
diff --git a/docs/security.md b/docs/security.md
index fb6e003b..53cee893 100644
--- a/docs/security.md
+++ b/docs/security.md
@@ -1,12 +1,17 @@
 # Security Policy

-Thank you for taking the time to report a security issue. We are trying our best to make this project safe for everyone. We appreciate your efforts to disclose the issue responsibly and will make every effort to acknowledge your contributions.
+Thank you for taking the time to report a security issue. We are trying our best
+to make this project safe for everyone. We appreciate your efforts to disclose
+the issue responsibly and will make every effort to acknowledge your
+contributions.

 ## Reporting a vulnerability

 **Please do not report security vulnerabilities through public GitHub issues.**

-If you believe you have found a security vulnerability, please send an email to edwin@oasysai.com. Please include as many details as possible, these may include:
+If you believe you have found a security vulnerability, please send an email to
+edwin@oasysai.com. Please include as many details as possible; these may
+include:

 - Impact of the vulnerability.
 - Steps to reproduce.

From 4a1c7ea37343bba759fd573f5007deeddd99fbe9 Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Wed, 26 Jun 2024 19:14:26 -0500
Subject: [PATCH 32/88] docs(contributing): add link to grpc docs

---
 docs/contributing.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/contributing.md b/docs/contributing.md
index 02d4e4a3..57bd3111 100644
--- a/docs/contributing.md
+++ b/docs/contributing.md
@@ -71,9 +71,9 @@ about them:
 - [**Rayon**](https://github.com/rayon-rs/rayon): Rayon is a data parallelism
   library for Rust that provides a simple and efficient API for parallelizing
   computation.
-- [**Tonic**](https://github.com/hyperium/tonic): Tonic is a gRPC over HTTP/2
-  implementation focused on high performance and flexibility built on top of the
-  Tokio asynchronous runtime.
+- [**Tonic**](https://github.com/hyperium/tonic): Tonic is a
+  [gRPC](https://grpc.io/docs/) over HTTP/2 implementation focused on high
+  performance and flexibility built on top of the Tokio asynchronous runtime.

 TODO: Complete the getting started guide.
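To make the Tonic dependency above more concrete, here is a minimal sketch of
how a Tonic service is typically wired up. The `echo` package, its messages,
and the port are hypothetical illustrations for this guide, not OasysDB's
actual proto definitions.

```rust
use tonic::{transport::Server, Request, Response, Status};

pub mod echo {
    // Hypothetical module generated by tonic-build from protos/echo.proto.
    tonic::include_proto!("echo");
}

use echo::echo_server::{Echo, EchoServer};
use echo::{EchoReply, EchoRequest};

#[derive(Default)]
pub struct EchoService;

#[tonic::async_trait]
impl Echo for EchoService {
    // Each RPC becomes an async method that maps a typed request
    // to a typed response or a gRPC `Status` error.
    async fn echo(
        &self,
        request: Request<EchoRequest>,
    ) -> Result<Response<EchoReply>, Status> {
        let message = request.into_inner().message;
        Ok(Response::new(EchoReply { message }))
    }
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Serve the service over HTTP/2 on the Tokio runtime.
    let addr = "0.0.0.0:50051".parse()?;
    Server::builder()
        .add_service(EchoServer::new(EchoService::default()))
        .serve(addr)
        .await?;
    Ok(())
}
```

The same builder pattern scales to real services: generate the server trait
from a `.proto` file, implement it, and register it with `add_service`.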
From fd0949ff88bbd686cffbb9b32781168bc9f43826 Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Thu, 27 Jun 2024 14:11:32 -0500
Subject: [PATCH 33/88] docs: add terms page

---
 docs/assets/style.css  |  10 +++
 docs/concepts/terms.md | 177 +++++++++++++++++++++++++++++++++++++++++
 mkdocs.yml             |   2 +
 3 files changed, 189 insertions(+)
 create mode 100644 docs/concepts/terms.md

diff --git a/docs/assets/style.css b/docs/assets/style.css
index fafb2e66..d69c4758 100644
--- a/docs/assets/style.css
+++ b/docs/assets/style.css
@@ -3,3 +3,13 @@ h2,
 h3 {
   font-weight: bold !important;
 }
+
+/* Tables will be displayed at full width. */
+
+.md-typeset__table {
+  width: 100%;
+}
+
+.md-typeset__table table:not([class]) {
+  display: table;
+}
diff --git a/docs/concepts/terms.md b/docs/concepts/terms.md
new file mode 100644
index 00000000..652d6d55
--- /dev/null
+++ b/docs/concepts/terms.md
@@ -0,0 +1,177 @@
+# Terms
+
+If you're new to RAG, vector search, and related concepts, this documentation
+will guide you through the key terms and principles used in modern LLM-based
+applications.
+
+This documentation attempts to provide a very high-level overview of the key
+concepts and terms used in the LLM ecosystem. For a more in-depth understanding,
+we recommend reading other dedicated resources.
+
+With that said, let's get started!
+
+## Embedding
+
+Embedding is a way to represent unstructured data as numbers to capture the
+semantic meaning of the data. In the context of LLMs, embeddings are used to
+represent words, sentences, or documents.
+
+Let's say we have a couple of words that we want to represent as numbers. For
+simplicity, we will only consider 2 aspects of the words: edibility and
+affordability.
+
+| Word   | Edibility | Affordability | Label        |
+| ------ | --------- | ------------- | ------------ |
+| Apple  | 0.9       | 0.8           | Fruit        |
+| Apple  | 0.0       | 0.0           | Tech Company |
+| Banana | 0.8       | 0.8           | ?            |
+
+In the table above, we can roughly deduce that the first apple is a fruit, while
+the second apple refers to a tech company. If we were to deduce whether the
+banana here is a fruit or some tech company we have never heard of, we could
+roughly say that it's a fruit since its edibility and affordability values are
+similar to those of the first apple.
+
+In practice, embeddings are much more complex and have many more dimensions,
+often capturing various semantic properties beyond simple attributes like
+edibility and affordability. For instance, embeddings in models like Word2Vec,
+GloVe, BERT, or GPT-3 can have hundreds or thousands of dimensions. These
+embeddings are learned by neural networks and are used in numerous applications,
+such as search engines, recommendation systems, sentiment analysis, and machine
+translation.
+
+Moreover, modern LLMs use contextual embeddings, meaning the representation of a
+word depends on the context in which it appears. This allows the model to
+distinguish between different meanings of the same word based on its usage in a
+sentence.
+
+Note that embedding and vector are often used interchangeably in the context of
+LLMs.
+
+## Indexing
+
+Indexing is the process of organizing and storing data to optimize search and
+retrieval efficiency. In the context of RAG and vector search, indexing
+organizes data based on their embeddings.
+
+Let's consider 4 data points below with their respective embeddings representing
+features: alive and edible.
+ +| ID | Embedding | Data | +| --- | ---------- | ------ | +| 1 | [0.0, 0.8] | Apple | +| 2 | [0.0, 0.7] | Banana | +| 3 | [1.0, 0.4] | Dog | +| 4 | [0.0, 0.0] | BMW | + +To illustrate simple indexing, let's use a simplified version of the NSW +(Navigable Small World) algorithm. This algorithm establishes links between data +points based on the distances between their embeddings: + +``` +1 -> 2, 3 +2 -> 1, 3 +3 -> 2, 4 +4 -> 3, 2 +``` + +### ANNS + +ANNS is a technique for efficiently finding the nearest data points to a given +query, albeit approximately. While it may not always return the exact nearest +data points, ANNS provides results that are close enough. This probabilistic +approach balances accuracy with efficiency. + +Imagine we have a query with specific constraints: + +- Find the closest data to [0.0, 0.9]. +- Calculate a maximum of 2 distances using the Euclidean distance formula. + +Here's how we utilize the index created above to find the closest data point: + +1. We start at a random data point, say 4, which is linked to 3 and 2. +2. We calculate the distances and find that 2 is closer to [0.0, 0.9] than 3. +3. We determine that the closest data to [0.0, 0.9] is Banana. + +This method isn't perfect; in this case, the actual closest data point to [0.0, +0.9] is Apple. But, under these constraints, linear search would rely heavily on +chance to find the nearest data point. Indexing mitigates this issue by +efficiently narrowing down the search based on data embeddings. + +In real-world applications with millions of data points, linear search becomes +impractical. Indexing, however, enables swift retrieval by structuring data +intelligently according to their embeddings. + +Note that for managing billions of data points, sophisticated disk-based +indexing algorithms may be necessary to ensure efficient data handling. + +## RAG + +RAG (Retrieval-Augmented Generation) is a framework that combines information +retrieval and large language models (LLMs) to generate high-quality, +contextually relevant responses to user queries. This approach enhances the +capabilities of LLMs by incorporating relevant information retrieved from +external sources into the model's input. + +In practice, RAG works by retrieving relevant information from a vector +database, which allows efficient searching for the most relevant data based on +the user query. This retrieved information is then inserted into the input +context of the language model, providing it with additional knowledge to +generate more accurate and informative responses. + +Below is an example of a prompt with and without RAG in a simple Q&A scenario: + +=== "Without RAG" + + ```text + What is the name of my dog? + ``` + + > LLM: I don't know. + +=== "With RAG" + + ```text + Based on the context below: + I have a dog named Pluto. + + Answer the following question: What is the name of my dog? + ``` + + > LLM: The name of your dog is Pluto. + +By integrating retrieval with generation, RAG significantly improves the +performance of LLMs in tasks that require specific, up-to-date, or external +information, making it a powerful tool for various applications such as customer +support, knowledge management, and content generation. + +## Token + +A token is a unit of text that AI models use to process and understand natural +language. Tokens can be words, subwords, or characters, depending on the model's +architecture. 
Tokenization is a crucial preprocessing step in natural language
+processing (NLP) and is essential for breaking down text into manageable pieces
+that the model can process.
+
+In this example, we'll use `WordPunctTokenizer` from the NLTK library to
+tokenize the sentence: "OasysDB is awesome."
+
+```py
+from nltk.tokenize import WordPunctTokenizer
+
+tokenizer = WordPunctTokenizer()
+tokens = tokenizer.tokenize("OasysDB is awesome.")
+print(tokens)
+```
+
+```py
+["OasysDB", "is", "awesome", "."]
+```
+
+Tokenization plays a big role in LLMs and embedding models. Understanding
+tokenization can help in various aspects, such as optimizing model performance
+and managing costs.
+
+Many AI service providers charge based on the number of tokens processed, so
+you'll often encounter this term when working with LLMs and embedding models,
+especially when estimating the cost of using a specific model.
diff --git a/mkdocs.yml b/mkdocs.yml
index 44738310..7a2ad4a6 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -59,6 +59,7 @@ extra_css:
 nav:
   - Home:
       - Introduction: index.md
+      - Terms: concepts/terms.md

   - Other:
       - Changelog: changelog.md
@@ -94,6 +95,7 @@ plugins:
       authors: true
       categories_allowed:
         - Log
+        - Rust
   exclude_docs: |
     pull_request_template.md


From e0c7a199c0b994d67cec5fe895eac7492d7d6ebd Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Sat, 29 Jun 2024 19:00:28 -0500
Subject: [PATCH 34/88] feat: add insert records to collection

---
 Cargo.lock                 |   1 +
 Cargo.toml                 |   5 +-
 protos/database.proto      |  32 +++++
 src/db/collection.rs       | 258 +++++++++++++++++++++++++++++--------
 src/db/collection_utils.rs |  54 ++++++++
 src/db/database.rs         |  68 ++++++----
 src/db/database_service.rs |  82 +++++++++++-
 src/db/mod.rs              | 179 +++++++++++++++++++++++++
 src/tests/mod.rs           |  33 +++++
 src/tests/test_database.rs |  28 ++++
 src/types/error.rs         |   7 +-
 src/types/file.rs          |  67 ++++++++++
 src/types/metadata.rs      |  13 ++
 13 files changed, 741 insertions(+), 86 deletions(-)
 create mode 100644 src/db/collection_utils.rs

diff --git a/Cargo.lock b/Cargo.lock
index 96108f71..9ecce5c0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1061,6 +1061,7 @@ dependencies = [
  "arrow-schema",
  "bincode",
  "prost",
+ "rand",
  "rayon",
  "regex",
  "serde",
diff --git a/Cargo.toml b/Cargo.toml
index 7e7083a3..032d179b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -20,11 +20,12 @@ version = "1.38.0"
 features = ["macros", "rt-multi-thread"]

 [dependencies]
-arrow = "52.0.0"
+uuid = { version = "1.9.1", features = ["v4", "rng"] }
 arrow-schema = { version = "52.0.0", features = ["serde"] }
+arrow = "52.0.0"
 rayon = "1.10.0"
 regex = "1.10.5"
-uuid = { version = "1.9.1", features = ["v4", "rng"] }
+rand = "0.8.5"

 # gRPC stuff.
 prost = "0.12.6"
diff --git a/protos/database.proto b/protos/database.proto
index 6fc288da..0fb30e6d 100644
--- a/protos/database.proto
+++ b/protos/database.proto
@@ -9,6 +9,8 @@ service Database {

   rpc AddFields(AddFieldsRequest) returns (google.protobuf.Empty);
   rpc RemoveFields(RemoveFieldsRequest) returns (google.protobuf.Empty);
+
+  rpc InsertRecords(InsertRecordsRequest) returns (google.protobuf.Empty);
 }

 // region CreateCollection
@@ -38,3 +40,33 @@
   string collection_name = 1;
   repeated string field_names = 2;
 }
+
+// region InsertRecords
+ message Record {
+   repeated Data data = 1;
+ }
+
+ // The goal is to simulate a batch insert operation in SQL.
+ message InsertRecordsRequest {
+   string collection_name = 1;
+   repeated string field_names = 2;
+   repeated Record records = 3;
+ }
+
+// Custom reusable data types.
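+// Note: proto3 does not allow a `repeated` field directly inside a `oneof`,
+// which is why the repeated floats need a wrapper message like `Vector`.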
+ +message Vector { + repeated float values = 1; +} + +message Data { + // This value type should match the data type supported + // by OasysDB in the types/metadata.rs file + oneof value { + string string_value = 1; + int32 integer_value = 2; + bool boolean_value = 3; + float float_value = 4; + Vector vector_value = 5; + } +} diff --git a/src/db/collection.rs b/src/db/collection.rs index 3c2b2da2..2f1f04b7 100644 --- a/src/db/collection.rs +++ b/src/db/collection.rs @@ -1,44 +1,81 @@ use super::*; -use arrow::ipc::writer::FileWriter; -use arrow::record_batch::RecordBatch; -use regex::Regex; +use array::downcast_array; +use arrow::compute::concat_batches; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct CollectionState { - pub schema: Schema, + pub batch_size: usize, pub count: usize, + pub dimension: usize, + pub schema: Schema, + pub dir: Directory, + /// Tracker of the next internal ID to assign to a record. + next_id: u32, } impl CollectionState { - fn new() -> Self { + fn new(dir: PathBuf) -> Result { let field_id = Field::new("internal_id", DataType::Int32, false); let vector_type = MetadataType::Vector.into(); let field_vector = Field::new("vector", vector_type, false); - // The default schema for a new collection contains two fields: - // internal_id and vector. - let schema = Schema::new(vec![field_id, field_vector]); - Self { schema, count: 0 } + let mut state = Self { + schema: Schema::new(vec![field_id, field_vector]), + dir: Directory::new(dir), + batch_size: 1000, + count: 0, + dimension: 0, + next_id: 1, + }; + + state.create_data_file()?; + Ok(state) + } + + fn create_data_file(&mut self) -> Result { + // The filename would be something like: cdata0000001. + let index = self.dir.data_files.len() + 1; + let filename = format!("cdata{index:0>7}"); + let data_file = self.dir.root.join(filename); + + let schema_ref = Arc::new(self.schema.clone()); + + // Create a new data file with an empty record batch. + + let file = OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(&data_file)?; + + let writer = BufWriter::new(file); + let mut file_writer = FileWriter::try_new(writer, &schema_ref)?; + + let record = RecordBatch::new_empty(schema_ref); + file_writer.write(&record)?; + file_writer.finish()?; + + self.dir.data_files.push(data_file.clone()); + Ok(data_file) } } -struct Directories { +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Directory { pub root: PathBuf, pub state_file: PathBuf, - pub data_file: PathBuf, + pub data_files: Vec, } -impl Directories { +impl Directory { fn new(root: PathBuf) -> Self { let state_file = root.join("cstate"); - let data_file = root.join("cdata"); - Self { root, state_file, data_file } + Self { root, state_file, data_files: vec![] } } } pub struct Collection { - dirs: Directories, state: Lock, } @@ -48,42 +85,18 @@ impl Collection { fs::create_dir_all(&dir)?; } - let dirs = Directories::new(dir); - let state = if !dirs.state_file.try_exists()? { - let state = Self::initialize_state(&dirs.state_file)?; - Self::initialize_data_file(&dirs.data_file, &state.schema)?; - state + let state_file = dir.join("cstate"); + let state = if !state_file.try_exists()? { + Self::initialize_state(&dir)? } else { - Self::read_state(&dirs.state_file)? + Self::read_state(&state_file)? }; let state = Lock::new(state); - let collection = Self { dirs, state }; + let collection = Self { state }; Ok(collection) } - /// Creates an empty data file for the collection. 
- /// This method should only be called once, when the collection is created. - fn initialize_data_file( - path: &PathBuf, - schema: &Schema, - ) -> Result<(), Error> { - let file = OpenOptions::new() - .write(true) - .create(true) - .truncate(true) - .open(path)?; - - let writer = BufWriter::new(file); - let mut file_writer = FileWriter::try_new(writer, schema)?; - - let record = RecordBatch::new_empty(Arc::new(schema.clone())); - file_writer.write(&record)?; - - file_writer.finish()?; - Ok(()) - } - pub fn add_fields(&self, fields: impl Into) -> Result<(), Error> { let mut state = self.state.write()?; @@ -157,31 +170,162 @@ impl Collection { Ok(()) } - pub fn validate_name(name: &str) -> Result<(), Error> { - if name.is_empty() { + pub fn insert_records( + &self, + field_names: &[String], + records: &[Arc], + ) -> Result<(), Error> { + let mut state = self.state.write()?; + + let mut record_map: HashMap> = field_names + .iter() + .enumerate() + .map(|(i, name)| (name.clone(), records[i].clone())) + .collect(); + + // It's safe to unwrap here because the vector field has been checked in + // the database service before calling this method. + let vector_array = record_map.get("vector").unwrap(); + + let data_size = vector_array.len(); + let dimension = { + let array: ListArray = downcast_array(vector_array.as_ref()); + let vector: Float32Array = downcast_array(array.value(0).as_ref()); + vector.len() + }; + + if dimension == 0 { let code = ErrorCode::ClientError; - let message = "Collection name cannot be empty"; + let message = "Vector cannot be empty"; return Err(Error::new(&code, message)); } - let re = Regex::new(r"^[a-z_]+$").unwrap(); - if !re.is_match(name) { - let code = ErrorCode::ClientError; - let message = "Collection name must be lowercase letters \ - with underscores."; - return Err(Error::new(&code, message)); + // If it's the first record, we need to update the dimension. + if state.count == 0 && state.dimension == 0 { + state.dimension = dimension; + } + + // Ensure all vectors have the same dimension. + self.validate_vectors(vector_array, dimension)?; + + let schema = state.schema.clone(); + let fields = schema.all_fields(); + + // Create a column array for internal_id. + let internal_id: Vec> = (state.next_id..) + .take(data_size) + .map(|id| Some(id as i32)) + .collect(); + let internal_id_array = Arc::new(Int32Array::from(internal_id)); + + record_map.insert("internal_id".to_string(), internal_id_array); + + // Check for missing fields in the record and create a + // column array for each missing field with null values. + // This is necessary to ensure that all fields are present. + let create_missing_array = |field: &Field| { + let data_type = field.data_type().clone().into(); + let array = match data_type { + MetadataType::Integer => Int32Array::null_array(data_size), + MetadataType::Float => Float32Array::null_array(data_size), + MetadataType::String => StringArray::null_array(data_size), + MetadataType::Boolean => BooleanArray::null_array(data_size), + MetadataType::Vector => ListArray::null_array(data_size), + }; + + (field.name().to_string(), array as Arc) + }; + + let missing_fields: HashMap> = fields + .into_iter() + .filter(|field| !record_map.contains_key(field.name())) + .map(create_missing_array) + .collect(); + + // Merge the missing fields with the record map. + record_map.extend(missing_fields); + + // Convert the record map to columns in order based on the schema. 
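+        // The column order must match the schema's field order exactly since
+        // `RecordBatch::try_new` pairs each column with its field by position.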
+ let extract_array = |field: &Arc| { + let name = field.name(); + let array = record_map.get(name).unwrap(); + array.clone() + }; + + let columns = schema.fields.iter().map(extract_array).collect(); + + // Create a record batch from the record map. + let schemaref = Arc::new(schema.clone()); + let record_batch = RecordBatch::try_new(schemaref.clone(), columns)?; + + // OasysDB limits the number of record batches in a data file to 1. + // Per record batch, there can be a maximum of 1000 records by default. + + // The behavior is as follows: + // 1. If the last data file is empty, write the record batch to it. + // 2. If the last data file is not empty, combine the last record batch + // with the new record batch and write the combined record batch to + // the last data file until it reaches the batch size. + + let data_files = &mut state.dir.data_files; + let file_ops = FileOps::default(); + + // Also, we can unwrap here because the data files won't be None. + let last_data_file = data_files.last().unwrap(); + let last_record_batch = file_ops.read_ipc_file(last_data_file)?; + + let record_batch = if last_record_batch.num_rows() != 0 { + let batches = vec![&last_record_batch, &record_batch]; + concat_batches(&schemaref, batches)? + } else { + record_batch + }; + + let mut files_to_write = vec![last_data_file.clone()]; + + // This determines the number of new files to create. + // Let's say the batch size is 1000 and the combined record batch + // has 1500 records. This means we need to create 1 new file because + // the first 1000 records will be written to the last data file and + // the remaining 500 records will be written to the new file. + let num_new_file = { + let size = record_batch.num_rows(); + let remain = size.saturating_sub(state.batch_size) as f32; + let div = remain / state.batch_size as f32; + div.ceil() as usize + }; + + for _ in 0..num_new_file { + let data_file = state.create_data_file()?; + files_to_write.push(data_file); } + FileOps::default().write_ipc_files( + &files_to_write, + &record_batch, + state.batch_size, + )?; + + // Update and persist the state. + state.count += data_size; + state.next_id += data_size as u32; + *state = state.clone(); + + // Drop the state lock before persisting the state. + // This prevents deadlocks since persist_state also requires the lock. + drop(state); + self.persist_state()?; + Ok(()) } } impl StateMachine for Collection { fn initialize_state( - path: impl Into, + root: impl Into, ) -> Result { - let state = CollectionState::new(); - FileOps::default().write_binary_file(&path.into(), &state)?; + let state = CollectionState::new(root.into())?; + FileOps::default().write_binary_file(&state.dir.state_file, &state)?; Ok(state) } @@ -196,6 +340,6 @@ impl StateMachine for Collection { fn persist_state(&self) -> Result<(), Error> { let state = self.state.read()?.clone(); let file_ops = FileOps::default(); - file_ops.write_binary_file(&self.dirs.state_file, &state) + file_ops.write_binary_file(&state.dir.state_file, &state) } } diff --git a/src/db/collection_utils.rs b/src/db/collection_utils.rs new file mode 100644 index 00000000..638bd8ce --- /dev/null +++ b/src/db/collection_utils.rs @@ -0,0 +1,54 @@ +use super::*; +use array::downcast_array; +use regex::Regex; + +impl Collection { + /// Validates the name of collections or fields. 
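+    /// For example, `movie_collection` is accepted, while `MovieCollection`
+    /// and `movie-collection` are rejected.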
+ pub fn validate_name(name: &str) -> Result<(), Error> { + if name.is_empty() { + let code = ErrorCode::ClientError; + let message = "Name cannot be empty"; + return Err(Error::new(&code, message)); + } + + // We only allow lowercase letters and underscores in the names. + // Also, we can unwrap here because the regex pattern is hardcoded. + let re = Regex::new(r"^[a-z_]+$").unwrap(); + if !re.is_match(name) { + return Err(Error::new( + &ErrorCode::ClientError, + "Name must be lowercase letters with underscores.", + )); + } + + Ok(()) + } + + /// Validates the vectors given a column array consisting of vectors. + /// This ensures that all vectors provided have the same dimension. + pub fn validate_vectors( + &self, + vectors: &Arc, + dimension: usize, + ) -> Result<(), Error> { + let vector_array: ListArray = downcast_array(vectors.as_ref()); + + let is_dimension_mismatch = |array: Arc| { + let vector: Float32Array = downcast_array(array.as_ref()); + vector.len() != dimension + }; + + let dimension_mismatch = vector_array.iter().any(|array| match array { + Some(array) => is_dimension_mismatch(array), + None => true, + }); + + if dimension_mismatch { + let code = ErrorCode::ClientError; + let message = "Vectors must have the same dimension."; + return Err(Error::new(&code, message)); + } + + Ok(()) + } +} diff --git a/src/db/database.rs b/src/db/database.rs index be17eb48..ea6f4c38 100644 --- a/src/db/database.rs +++ b/src/db/database.rs @@ -6,13 +6,13 @@ pub struct DatabaseState { pub collection_refs: HashMap, } -struct Directories { +struct Directory { pub root: PathBuf, pub collections_dir: PathBuf, pub state_file: PathBuf, } -impl Directories { +impl Directory { fn new(root: PathBuf) -> Self { let collections_dir = root.join("collections"); let state_file = root.join("dbstate"); @@ -21,25 +21,25 @@ impl Directories { } pub struct Database { - dirs: Directories, + dir: Directory, state: Lock, } impl Database { pub fn open(dir: PathBuf) -> Result { - let dirs = Directories::new(dir); + let dir = Directory::new(dir); - let state_file = &dirs.state_file; + let state_file = &dir.state_file; let state = if !state_file.try_exists()? { // Creating a collection directory will create the root directory. - fs::create_dir_all(&dirs.collections_dir)?; + fs::create_dir_all(&dir.collections_dir)?; Self::initialize_state(state_file)? } else { Self::read_state(state_file)? }; let state = Lock::new(state); - let db = Self { dirs, state }; + let db = Self { dir, state }; Ok(db) } } @@ -60,7 +60,7 @@ impl Database { // Create the collection directory. let uuid = Uuid::new_v4().to_string(); - let collection_dir = self.dirs.collections_dir.join(uuid); + let collection_dir = self.dir.collections_dir.join(uuid); // Initialize the collection. Collection::open(collection_dir.to_path_buf())?; @@ -77,8 +77,33 @@ impl Database { Ok(()) } + pub fn _get_collection(&self, name: &str) -> Result { + let state = self.state.read()?; + + if name.is_empty() { + let code = ErrorCode::ClientError; + let message = "Collection name cannot be empty"; + return Err(Error::new(&code, message)); + } + + // Get the directory where the collection is + // persisted from the database state. 
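+        // Collections live under the `collections/<uuid>` directory, and the
+        // database state maps each collection name to its directory.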
+ let dir = match state.collection_refs.get(name) { + Some(dir) => dir.clone(), + None => { + let code = ErrorCode::NotFoundError; + let message = format!("Collection not found: {name}"); + return Err(Error::new(&code, &message)); + } + }; + + Collection::open(dir) + } + pub fn _delete_collection(&self, name: &str) -> Result<(), Error> { let mut state = self.state.write()?; + + // This makes the method idempotent. if !state.collection_refs.contains_key(name) { return Ok(()); } @@ -101,8 +126,7 @@ impl Database { collection_name: &str, fields: impl Into, ) -> Result<(), Error> { - let dir = self.get_collection_dir(collection_name)?; - let collection = Collection::open(dir)?; + let collection = self._get_collection(collection_name)?; collection.add_fields(fields)?; Ok(()) } @@ -112,22 +136,20 @@ impl Database { collection_name: &str, field_names: &[String], ) -> Result<(), Error> { - let dir = self.get_collection_dir(collection_name)?; - let collection = Collection::open(dir)?; + let collection = self._get_collection(collection_name)?; collection.remove_fields(field_names)?; Ok(()) } - fn get_collection_dir(&self, name: &str) -> Result { - let state = self.state.read()?; - match state.collection_refs.get(name) { - Some(dir) => Ok(dir.clone()), - None => { - let code = ErrorCode::ClientError; - let message = format!("No collection name: {name}"); - Err(Error::new(&code, &message)) - } - } + pub fn _insert_records( + &self, + collection_name: &str, + field_names: &[String], + records: &[Arc], + ) -> Result<(), Error> { + let collection = self._get_collection(collection_name)?; + collection.insert_records(field_names, records)?; + Ok(()) } } @@ -150,6 +172,6 @@ impl StateMachine for Database { fn persist_state(&self) -> Result<(), Error> { let state = self.state.read()?.clone(); - FileOps::default().write_binary_file(&self.dirs.state_file, &state) + FileOps::default().write_binary_file(&self.dir.state_file, &state) } } diff --git a/src/db/database_service.rs b/src/db/database_service.rs index 50f091c8..e8c07d26 100644 --- a/src/db/database_service.rs +++ b/src/db/database_service.rs @@ -1,6 +1,5 @@ use super::*; -use crate::proto; -use crate::proto::database_server::Database as ProtoDatabase; +use proto::database_server::Database as ProtoDatabase; #[tonic::async_trait] impl ProtoDatabase for Database { @@ -31,9 +30,12 @@ impl ProtoDatabase for Database { // Construct Arrow fields from the request fields. let mut fields = vec![]; for field in request.fields { + Collection::validate_name(&field.name)?; + // Use the MetadataType as a proxy to convert string to DataType. let metadata_type: MetadataType = field.datatype.into(); let datatype: DataType = metadata_type.into(); + let new_field = Field::new(&field.name, datatype, true); fields.push(new_field); } @@ -50,4 +52,80 @@ impl ProtoDatabase for Database { self._remove_fields(&request.collection_name, &request.field_names)?; Ok(Response::new(())) } + + async fn insert_records( + &self, + request: Request, + ) -> Result, Status> { + let proto::InsertRecordsRequest { + collection_name, + field_names, + records, + } = request.into_inner(); + + if field_names.is_empty() { + return Err(Status::invalid_argument( + "At least one field name must be specified.", + )); + } + + if !field_names.contains(&"vector".to_string()) { + return Err(Status::invalid_argument( + "The vector field must be specified.", + )); + } + + // Check if the records provided match the number of fields. 
+ // This is required since we try to simulate a batch insert like: + // INSERT INTO collection_name (field1, field2) + // VALUES + // (x1, y1), + // (x2, y2, z2) <- We should catch this error. + if records + .par_iter() + .any(|record| record.data.len() != field_names.len()) + { + let message = "The number of values must match the fields."; + return Err(Status::invalid_argument(message)); + } + + let collection = self._get_collection(&collection_name)?; + let schema = collection.state()?.schema; + let fields = schema.fields; + + // Check if the fields specified in the request exist in the schema. + if field_names.par_iter().any(|name| fields.find(name).is_none()) { + return Err(Status::invalid_argument( + "One or more fields specified do not exist in the schema.", + )); + } + + // Convert records from row format to column format. + let mut columns = vec![vec![]; field_names.len()]; + for record in records { + for i in 0..field_names.len() { + let value = record.data[i].value.clone(); + columns[i].push(value); + } + } + + // Convert columns to Arrow arrays. + let mut arrays = vec![]; + for i in 0..field_names.len() { + let field = fields.find(&field_names[i]).unwrap().1; + let column = columns[i].clone(); + let array = match field.data_type().clone().into() { + MetadataType::Boolean => BooleanArray::from_values(column)?, + MetadataType::Integer => Int32Array::from_values(column)?, + MetadataType::Float => Float32Array::from_values(column)?, + MetadataType::String => StringArray::from_values(column)?, + MetadataType::Vector => ListArray::from_values(column)?, + }; + + arrays.push(array); + } + + self._insert_records(&collection_name, &field_names, &arrays)?; + Ok(Response::new(())) + } } diff --git a/src/db/mod.rs b/src/db/mod.rs index a36be43e..8b7b2ff5 100644 --- a/src/db/mod.rs +++ b/src/db/mod.rs @@ -1,6 +1,12 @@ +use crate::proto; use crate::types::*; +use array::{BooleanArray, Float32Array, Int32Array, ListArray, StringArray}; +use arrow::array::{self, Array}; use arrow::datatypes::DataType; +use arrow::ipc::writer::FileWriter; +use arrow::record_batch::RecordBatch; use arrow_schema::{Field, Fields, Schema}; +use rayon::prelude::*; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::fs::{self, OpenOptions}; @@ -10,12 +16,15 @@ use std::sync::{Arc, RwLock as Lock}; use tonic::{Request, Response, Status}; mod collection; +mod collection_utils; mod database; mod database_service; pub use collection::*; pub use database::*; +type ProtoValue = proto::data::Value; + /// A trait for objects that own a state that should be persisted to disk. /// - `T`: Type of the state object. /// @@ -35,3 +44,173 @@ pub trait StateMachine { /// Persists the state object to a file. fn persist_state(&self) -> Result<(), Error>; } + +pub trait ArrayUtils { + fn from_values( + values: Vec>, + ) -> Result, Error>; + + /// Creates an array filled with null values. 
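+    /// Used by `insert_records` to backfill schema fields that are missing
+    /// from a request with null values.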
+ fn null_array(len: usize) -> Arc; +} + +pub trait ListArrayUtils { + fn from_vectors(values: Vec>) -> Arc; +} + +impl ArrayUtils for BooleanArray { + fn from_values( + values: Vec>, + ) -> Result, Error> { + let parse_boolean = |value: Option| match value { + Some(ProtoValue::BooleanValue(value)) => Some(value), + _ => None, + }; + + let values: Vec> = + values.into_par_iter().map(parse_boolean).collect(); + Ok(Arc::new(BooleanArray::from(values))) + } + + fn null_array(len: usize) -> Arc { + Arc::new(BooleanArray::from(vec![None; len])) + } +} + +impl ArrayUtils for Float32Array { + fn from_values( + values: Vec>, + ) -> Result, Error> { + let parse_float = |value: Option| match value { + Some(ProtoValue::FloatValue(value)) => Some(value), + _ => None, + }; + + let values: Vec> = + values.into_par_iter().map(parse_float).collect(); + Ok(Arc::new(Float32Array::from(values))) + } + + fn null_array(len: usize) -> Arc { + Arc::new(Float32Array::from(vec![None; len])) + } +} + +impl ArrayUtils for Int32Array { + fn from_values( + values: Vec>, + ) -> Result, Error> { + let parse_int = |value: Option| match value { + Some(ProtoValue::IntegerValue(value)) => Some(value), + _ => None, + }; + + let values: Vec> = + values.into_par_iter().map(parse_int).collect(); + Ok(Arc::new(Int32Array::from(values))) + } + + fn null_array(len: usize) -> Arc { + Arc::new(Int32Array::from(vec![None; len])) + } +} + +impl ArrayUtils for StringArray { + fn from_values( + values: Vec>, + ) -> Result, Error> { + let parse_string = |value: Option| match value { + Some(ProtoValue::StringValue(value)) => Some(value), + _ => None, + }; + + let values: Vec> = + values.into_par_iter().map(parse_string).collect(); + Ok(Arc::new(StringArray::from(values))) + } + + fn null_array(len: usize) -> Arc { + let source: Vec> = vec![None; len]; + Arc::new(StringArray::from(source)) + } +} + +impl ArrayUtils for ListArray { + fn from_values( + values: Vec>, + ) -> Result, Error> { + let parse_vector = |value: Option| match value { + Some(ProtoValue::VectorValue(value)) => Some(value.values), + _ => None, + }; + + let values: Vec>> = + values.into_par_iter().map(parse_vector).collect(); + + // Find the dimension of the vector. + let dimension = values + .clone() + .into_par_iter() + .map(|value| value.unwrap_or(vec![]).len()) + .max() + // 1024 is the default capacity for generic array builders. + .unwrap_or(1024); + + // Create builders to construct the ListArray. + let mut list_builder = { + let float_builder = Float32Array::builder(dimension); + let field = Field::new("element", DataType::Float32, false); + array::ListBuilder::new(float_builder).with_field(field) + }; + + // Insert values into the builder. + for value in values { + match value { + Some(values) => { + list_builder.values().append_slice(&values); + list_builder.append(true); + } + None => list_builder.append(false), + } + } + + let array = list_builder.finish(); + Ok(Arc::new(array)) + } + + fn null_array(len: usize) -> Arc { + let mut builder = { + // We can use 0 capacity since we are not going to append any values. 
+ let float_builder = Float32Array::builder(0); + let field = Field::new("element", DataType::Float32, false); + array::ListBuilder::new(float_builder).with_field(field) + }; + + for _ in 0..len { + builder.append(false); + } + + let array = builder.finish(); + Arc::new(array) + } +} + +impl ListArrayUtils for ListArray { + fn from_vectors(values: Vec>) -> Arc { + let dimension = values[0].len(); + + let mut list_builder = { + let float_builder = Float32Array::builder(dimension); + let field = Field::new("element", DataType::Float32, false); + array::ListBuilder::new(float_builder).with_field(field) + }; + + for value in values { + list_builder.values().append_slice(&value); + list_builder.append(true); + } + + let array = list_builder.finish(); + Arc::new(array) + } +} diff --git a/src/tests/mod.rs b/src/tests/mod.rs index 42d9be86..0313c85d 100644 --- a/src/tests/mod.rs +++ b/src/tests/mod.rs @@ -1,8 +1,11 @@ use crate::db::*; use crate::types::*; +use arrow::array::{self, Array}; use arrow::datatypes::{DataType, Field}; +use rand::random; use std::fs; use std::path::PathBuf; +use std::sync::Arc; mod test_database; @@ -31,3 +34,33 @@ fn create_test_database() -> Result { Ok(db) } + +fn create_test_database_with_data() -> Result { + let db = create_test_database()?; + populate_database(db) +} + +fn generate_random_vectors(dimension: usize, len: usize) -> Vec> { + (0..len) + .map(|_| (0..dimension).map(|_| random::()).collect()) + .collect() +} + +fn populate_database(database: Database) -> Result { + let fields = ["vector", "title", "year"]; + let field_names: Vec = + fields.iter().map(|f| f.to_string()).collect(); + + let vectors = generate_random_vectors(128, 3); + let titles = vec!["The Matrix", "Avatar", "Inception"]; + let years = vec![1999, 2009, 2010]; + + let records = vec![ + Arc::new(array::ListArray::from_vectors(vectors)) as Arc, + Arc::new(array::StringArray::from(titles)) as Arc, + Arc::new(array::Int32Array::from(years)) as Arc, + ]; + + database._insert_records(TEST_COLLECTION, &field_names, &records)?; + Ok(database) +} diff --git a/src/tests/test_database.rs b/src/tests/test_database.rs index 41cb8745..0525f509 100644 --- a/src/tests/test_database.rs +++ b/src/tests/test_database.rs @@ -60,3 +60,31 @@ fn test_database_remove_fields() -> Result<(), Error> { Ok(()) } + +#[test] +fn test_database_insert_records() -> Result<(), Error> { + let database = create_test_database_with_data()?; + let state = database.state()?; + let dir = &state.collection_refs[TEST_COLLECTION]; + + let fields = ["vector", "title", "year"]; + let fields: Vec = fields.iter().map(|f| f.to_string()).collect(); + + let vectors = generate_random_vectors(128, 2); + let titles = vec!["Interstellar", "Avengers: Endgame"]; + let years = vec![2014, 2019]; + + let records = vec![ + Arc::new(array::ListArray::from_vectors(vectors)) as Arc, + Arc::new(array::StringArray::from(titles)) as Arc, + Arc::new(array::Int32Array::from(years)) as Arc, + ]; + + database._insert_records(TEST_COLLECTION, &fields, &records)?; + + let collection = Collection::open(dir.clone())?; + let state = collection.state()?; + assert_eq!(state.count, 5); + + Ok(()) +} diff --git a/src/types/error.rs b/src/types/error.rs index 42a62ace..545b527e 100644 --- a/src/types/error.rs +++ b/src/types/error.rs @@ -11,12 +11,14 @@ use std::sync::PoisonError; #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ErrorCode { ArrowError, - ClientError, - CollectionError, ConcurrencyError, FileError, SerializationError, StandardError, + + // 
Tonic-related error codes. + ClientError, + NotFoundError, } #[derive(Debug)] @@ -84,6 +86,7 @@ impl From for tonic::Status { fn from(err: Error) -> Self { let code = match err.code { ErrorCode::ClientError => tonic::Code::InvalidArgument, + ErrorCode::NotFoundError => tonic::Code::NotFound, _ => tonic::Code::Internal, }; diff --git a/src/types/file.rs b/src/types/file.rs index b50adb7a..04cb358e 100644 --- a/src/types/file.rs +++ b/src/types/file.rs @@ -1,6 +1,10 @@ use super::error::{Error, ErrorCode}; +use arrow::array::RecordBatch; +use arrow::ipc::reader::FileReader; +use arrow::ipc::writer::FileWriter; use serde::de::DeserializeOwned; use serde::Serialize; +use std::cmp::min; use std::env; use std::fs::{self, OpenOptions}; use std::io::{BufReader, BufWriter}; @@ -66,6 +70,69 @@ impl FileOps { Ok(()) } + pub fn read_ipc_file(&self, path: &PathBuf) -> Result { + let file = OpenOptions::new().read(true).open(path)?; + let reader = BufReader::new(file); + let ipc_reader = FileReader::try_new(reader, None)?; + let schema = ipc_reader.schema(); + + // In OasyDB, there will be only one record batch per file. + let record_batch = match ipc_reader.last() { + Some(batch) => batch?, + _ => RecordBatch::new_empty(schema), + }; + + Ok(record_batch) + } + + pub fn write_ipc_files( + &self, + paths: &[PathBuf], + data: &RecordBatch, + batch_size: usize, + ) -> Result<(), Error> { + let create_tmp_path = |path: &PathBuf| { + let filename = self.parse_file_name(path).unwrap(); + self.tmp_dir.join(filename) + }; + + let tmp_paths: Vec = + paths.iter().map(create_tmp_path).collect(); + + let schema = data.schema(); + + for i in 0..tmp_paths.len() { + let file = OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(&tmp_paths[i])?; + + let writer = BufWriter::new(file); + let mut ipc_writer = FileWriter::try_new(writer, &schema)?; + + // This attempts to write the record batch in chunks. + // This is useful when the record batch is larger than + // the predefined batch size. + let batch = { + let offset = i * batch_size; + let length = min(batch_size, data.num_rows() - offset); + data.slice(offset, length) + }; + + // Write the record batch to the file. + ipc_writer.write(&batch)?; + ipc_writer.finish()?; + } + + // If the serialization is successful, rename the temporary file. + for i in 0..tmp_paths.len() { + fs::rename(&tmp_paths[i], &paths[i])?; + } + + Ok(()) + } + /// Parses a file name from a path. 
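    /// For example, `odb_data/collections/<uuid>/cdata0000001` yields
    /// `cdata0000001`.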
pub fn parse_file_name(&self, path: &PathBuf) -> Result { path.file_name() diff --git a/src/types/metadata.rs b/src/types/metadata.rs index 0a4edfab..224fdfa3 100644 --- a/src/types/metadata.rs +++ b/src/types/metadata.rs @@ -38,6 +38,19 @@ impl From for MetadataType { } } +impl From for MetadataType { + fn from(value: DataType) -> Self { + match value { + DataType::Int32 => MetadataType::Integer, + DataType::Float32 => MetadataType::Float, + DataType::Utf8 => MetadataType::String, + DataType::Boolean => MetadataType::Boolean, + DataType::List(_) => MetadataType::Vector, + _ => panic!("Unsupported data type: {value}"), + } + } +} + impl From for DataType { fn from(value: MetadataType) -> Self { let field_float = Field::new("element", DataType::Float32, false); From c1ea53c308460c075b327e2e43cb684ff0acd050 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Sat, 29 Jun 2024 19:38:03 -0500 Subject: [PATCH 35/88] feat: lint optimization --- src/db/database.rs | 3 +-- src/db/database_service.rs | 6 +++--- src/db/mod.rs | 2 +- src/main.rs | 5 +++-- src/types/error.rs | 1 - src/types/file.rs | 4 ++-- src/types/metadata.rs | 1 + 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/db/database.rs b/src/db/database.rs index ea6f4c38..07c345c0 100644 --- a/src/db/database.rs +++ b/src/db/database.rs @@ -7,7 +7,6 @@ pub struct DatabaseState { } struct Directory { - pub root: PathBuf, pub collections_dir: PathBuf, pub state_file: PathBuf, } @@ -16,7 +15,7 @@ impl Directory { fn new(root: PathBuf) -> Self { let collections_dir = root.join("collections"); let state_file = root.join("dbstate"); - Self { root, collections_dir, state_file } + Self { collections_dir, state_file } } } diff --git a/src/db/database_service.rs b/src/db/database_service.rs index e8c07d26..36cac474 100644 --- a/src/db/database_service.rs +++ b/src/db/database_service.rs @@ -2,7 +2,7 @@ use super::*; use proto::database_server::Database as ProtoDatabase; #[tonic::async_trait] -impl ProtoDatabase for Database { +impl ProtoDatabase for Arc { async fn create_collection( &self, request: Request, @@ -103,9 +103,9 @@ impl ProtoDatabase for Database { // Convert records from row format to column format. let mut columns = vec![vec![]; field_names.len()]; for record in records { - for i in 0..field_names.len() { + for (i, column) in columns.iter_mut().enumerate() { let value = record.data[i].value.clone(); - columns[i].push(value); + column.push(value); } } diff --git a/src/db/mod.rs b/src/db/mod.rs index 8b7b2ff5..52c6b85e 100644 --- a/src/db/mod.rs +++ b/src/db/mod.rs @@ -151,7 +151,7 @@ impl ArrayUtils for ListArray { let dimension = values .clone() .into_par_iter() - .map(|value| value.unwrap_or(vec![]).len()) + .map(|value| value.unwrap_or_default().len()) .max() // 1024 is the default capacity for generic array builders. 
.unwrap_or(1024); diff --git a/src/main.rs b/src/main.rs index feb91e6c..f89f5708 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,4 +1,4 @@ -#![allow(dead_code)] +#![allow(clippy::enum_variant_names)] mod db; mod proto; @@ -10,6 +10,7 @@ mod tests; use db::*; use proto::database_server::DatabaseServer; use std::path::PathBuf; +use std::sync::Arc; use tonic::transport::Server; const HOST: &str = "0.0.0.0"; @@ -20,7 +21,7 @@ async fn main() -> Result<(), Box> { let addr = format!("{HOST}:{PORT}").parse()?; let path = PathBuf::from("odb_data"); - let database = Database::open(path)?; + let database = Arc::new(Database::open(path)?); Server::builder() .add_service(DatabaseServer::new(database)) diff --git a/src/types/error.rs b/src/types/error.rs index 545b527e..554878b8 100644 --- a/src/types/error.rs +++ b/src/types/error.rs @@ -7,7 +7,6 @@ use std::error::Error as StandardError; use std::io::Error as IOError; use std::sync::PoisonError; -#[allow(clippy::enum_variant_names)] #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ErrorCode { ArrowError, diff --git a/src/types/file.rs b/src/types/file.rs index 04cb358e..145037eb 100644 --- a/src/types/file.rs +++ b/src/types/file.rs @@ -101,12 +101,12 @@ impl FileOps { let schema = data.schema(); - for i in 0..tmp_paths.len() { + for (i, tmp_path) in tmp_paths.iter().enumerate() { let file = OpenOptions::new() .write(true) .create(true) .truncate(true) - .open(&tmp_paths[i])?; + .open(tmp_path)?; let writer = BufWriter::new(file); let mut ipc_writer = FileWriter::try_new(writer, &schema)?; diff --git a/src/types/metadata.rs b/src/types/metadata.rs index 224fdfa3..a798789f 100644 --- a/src/types/metadata.rs +++ b/src/types/metadata.rs @@ -1,6 +1,7 @@ use arrow_schema::{DataType, Field}; /// Data types supported in OasysDB Arrow fields. +#[derive(Debug, Clone, PartialEq)] pub enum MetadataType { Integer, Float, From 66bc7b1c757959ba7c0eac3f1607f419ecbeaa3c Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Mon, 1 Jul 2024 13:15:18 -0500 Subject: [PATCH 36/88] feat: add stress test db module --- src/db/collection_utils.rs | 2 +- src/tests/mod.rs | 1 + src/tests/stress_test_database.rs | 32 +++++++++++++++++++++++++++++++ 3 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 src/tests/stress_test_database.rs diff --git a/src/db/collection_utils.rs b/src/db/collection_utils.rs index 638bd8ce..07fe3852 100644 --- a/src/db/collection_utils.rs +++ b/src/db/collection_utils.rs @@ -13,7 +13,7 @@ impl Collection { // We only allow lowercase letters and underscores in the names. // Also, we can unwrap here because the regex pattern is hardcoded. 
- let re = Regex::new(r"^[a-z_]+$").unwrap(); + let re = Regex::new(r"^[a-z0-9_]+$").unwrap(); if !re.is_match(name) { return Err(Error::new( &ErrorCode::ClientError, diff --git a/src/tests/mod.rs b/src/tests/mod.rs index 0313c85d..94792833 100644 --- a/src/tests/mod.rs +++ b/src/tests/mod.rs @@ -7,6 +7,7 @@ use std::fs; use std::path::PathBuf; use std::sync::Arc; +mod stress_test_database; mod test_database; const TEST_DIR: &str = "odb_data"; diff --git a/src/tests/stress_test_database.rs b/src/tests/stress_test_database.rs new file mode 100644 index 00000000..1b5267db --- /dev/null +++ b/src/tests/stress_test_database.rs @@ -0,0 +1,32 @@ +use super::*; + +const RECORDS_LEN: usize = 10_000; + +#[test] +fn test_database_insert_many_records() -> Result<(), Error> { + let path = PathBuf::from(TEST_DIR); + if path.exists() { + fs::remove_dir_all(&path)?; + } + + let database = Database::open(path)?; + + let collection_name = "collection"; + database._create_collection(collection_name)?; + + let fields = vec!["vector".to_string()]; + let vectors = generate_random_vectors(128, RECORDS_LEN); + let records = vec![ + Arc::new(array::ListArray::from_vectors(vectors)) as Arc + ]; + + database._insert_records(collection_name, &fields, &records)?; + + let state = database.state()?; + let collection_dir = &state.collection_refs[collection_name]; + let collection = Collection::open(collection_dir.clone())?; + let state = collection.state()?; + assert_eq!(state.count, RECORDS_LEN); + + Ok(()) +} From 02b4fbce1e55824f00a9f70aa0cc05c2aea56c27 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Mon, 1 Jul 2024 13:44:51 -0500 Subject: [PATCH 37/88] docs: improve collection state creation --- src/db/collection.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/db/collection.rs b/src/db/collection.rs index 2f1f04b7..93e683dc 100644 --- a/src/db/collection.rs +++ b/src/db/collection.rs @@ -14,7 +14,9 @@ pub struct CollectionState { } impl CollectionState { - fn new(dir: PathBuf) -> Result { + /// Creates a new collection state. + /// - `root`: Root directory for the collection. 
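+    /// Also creates the first data file for the collection under this root.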
+ fn new(root: PathBuf) -> Result { let field_id = Field::new("internal_id", DataType::Int32, false); let vector_type = MetadataType::Vector.into(); @@ -22,7 +24,7 @@ impl CollectionState { let mut state = Self { schema: Schema::new(vec![field_id, field_vector]), - dir: Directory::new(dir), + dir: Directory::new(root), batch_size: 1000, count: 0, dimension: 0, @@ -322,9 +324,9 @@ impl Collection { impl StateMachine for Collection { fn initialize_state( - root: impl Into, + path: impl Into, ) -> Result { - let state = CollectionState::new(root.into())?; + let state = CollectionState::new(path.into())?; FileOps::default().write_binary_file(&state.dir.state_file, &state)?; Ok(state) } From b57124ed87c91cf99228a0a976074b6afe5cd9b8 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Thu, 4 Jul 2024 20:14:30 -0500 Subject: [PATCH 38/88] chore: reset oasysdb again --- .github/ISSUE_TEMPLATE/bug_report.md | 2 +- .github/ISSUE_TEMPLATE/config.yml | 4 +- .github/ISSUE_TEMPLATE/do_chore.md | 2 +- .github/ISSUE_TEMPLATE/feature_request.md | 2 +- .../{server-ci.yml => database-ci.yml} | 2 +- .github/workflows/publish-docs.yml | 2 +- .prettierrc.yml | 1 + Cargo.lock | 1863 ----------------- Cargo.toml | 21 - build.rs | 4 - docs/blog/posts/overhauling_oasysdb.md | 65 - docs/code_of_conduct.md | 12 +- docs/concepts/terms.md | 8 +- docs/contributing.md | 54 +- protos/database.proto | 72 - src/db/collection.rs | 347 --- src/db/collection_utils.rs | 54 - src/db/database.rs | 176 -- src/db/database_service.rs | 131 -- src/db/mod.rs | 215 -- src/indices/mod.rs | 1 + src/lib.rs | 2 + src/main.rs | 33 +- src/proto.rs | 1 - src/tests/mod.rs | 66 - src/tests/stress_test_database.rs | 32 - src/tests/test_database.rs | 90 - src/types/error.rs | 94 - src/types/file.rs | 147 -- src/types/metadata.rs | 66 - src/types/mod.rs | 6 - 31 files changed, 44 insertions(+), 3531 deletions(-) rename .github/workflows/{server-ci.yml => database-ci.yml} (98%) delete mode 100644 build.rs delete mode 100644 docs/blog/posts/overhauling_oasysdb.md delete mode 100644 protos/database.proto delete mode 100644 src/db/collection.rs delete mode 100644 src/db/collection_utils.rs delete mode 100644 src/db/database.rs delete mode 100644 src/db/database_service.rs create mode 100644 src/indices/mod.rs create mode 100644 src/lib.rs delete mode 100644 src/proto.rs delete mode 100644 src/tests/stress_test_database.rs delete mode 100644 src/tests/test_database.rs delete mode 100644 src/types/error.rs delete mode 100644 src/types/file.rs delete mode 100644 src/types/metadata.rs diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 52751fc9..e8190b64 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -1,5 +1,5 @@ --- -name: 🐞 Report a bug +name: 🐞 Report Bug about: Report an unexpected behavior or a malfunctioning feature. title: "BUG:" labels: bug diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 36b1d751..76c0d5d1 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,8 +1,10 @@ blank_issues_enabled: false + contact_links: - - name: ❓ Ask a question + - name: ❓ Ask Question url: https://github.com/oasysai/oasysdb/discussions about: Ask general questions or share ideas on Discussions. + - name: 💬 Join Discord url: https://discord.gg/bDhQrkqNP4 about: Join the Discord server to help shape the future of OasysDB. 
From b57124ed87c91cf99228a0a976074b6afe5cd9b8 Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Thu, 4 Jul 2024 20:14:30 -0500
Subject: [PATCH 38/88] chore: reset oasysdb again

---
 .github/ISSUE_TEMPLATE/bug_report.md | 2 +-
 .github/ISSUE_TEMPLATE/config.yml | 4 +-
 .github/ISSUE_TEMPLATE/do_chore.md | 2 +-
 .github/ISSUE_TEMPLATE/feature_request.md | 2 +-
 .../{server-ci.yml => database-ci.yml} | 2 +-
 .github/workflows/publish-docs.yml | 2 +-
 .prettierrc.yml | 1 +
 Cargo.lock | 1863 -----------------
 Cargo.toml | 21 -
 build.rs | 4 -
 docs/blog/posts/overhauling_oasysdb.md | 65 -
 docs/code_of_conduct.md | 12 +-
 docs/concepts/terms.md | 8 +-
 docs/contributing.md | 54 +-
 protos/database.proto | 72 -
 src/db/collection.rs | 347 ---
 src/db/collection_utils.rs | 54 -
 src/db/database.rs | 176 --
 src/db/database_service.rs | 131 --
 src/db/mod.rs | 215 --
 src/indices/mod.rs | 1 +
 src/lib.rs | 2 +
 src/main.rs | 33 +-
 src/proto.rs | 1 -
 src/tests/mod.rs | 66 -
 src/tests/stress_test_database.rs | 32 -
 src/tests/test_database.rs | 90 -
 src/types/error.rs | 94 -
 src/types/file.rs | 147 --
 src/types/metadata.rs | 66 -
 src/types/mod.rs | 6 -
 31 files changed, 44 insertions(+), 3531 deletions(-)
 rename .github/workflows/{server-ci.yml => database-ci.yml} (98%)
 delete mode 100644 build.rs
 delete mode 100644 docs/blog/posts/overhauling_oasysdb.md
 delete mode 100644 protos/database.proto
 delete mode 100644 src/db/collection.rs
 delete mode 100644 src/db/collection_utils.rs
 delete mode 100644 src/db/database.rs
 delete mode 100644 src/db/database_service.rs
 create mode 100644 src/indices/mod.rs
 create mode 100644 src/lib.rs
 delete mode 100644 src/proto.rs
 delete mode 100644 src/tests/stress_test_database.rs
 delete mode 100644 src/tests/test_database.rs
 delete mode 100644 src/types/error.rs
 delete mode 100644 src/types/file.rs
 delete mode 100644 src/types/metadata.rs

diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
index 52751fc9..e8190b64 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -1,5 +1,5 @@
 ---
-name: 🐞 Report a bug
+name: 🐞 Report Bug
 about: Report an unexpected behavior or a malfunctioning feature.
 title: "BUG:"
 labels: bug
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
index 36b1d751..76c0d5d1 100644
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -1,8 +1,10 @@
 blank_issues_enabled: false
+
 contact_links:
-  - name: ❓ Ask a question
+  - name: ❓ Ask Question
     url: https://github.com/oasysai/oasysdb/discussions
     about: Ask general questions or share ideas on Discussions.
+
   - name: 💬 Join Discord
     url: https://discord.gg/bDhQrkqNP4
     about: Join the Discord server to help shape the future of OasysDB.
diff --git a/.github/ISSUE_TEMPLATE/do_chore.md b/.github/ISSUE_TEMPLATE/do_chore.md
index ede33ed3..8253c794 100644
--- a/.github/ISSUE_TEMPLATE/do_chore.md
+++ b/.github/ISSUE_TEMPLATE/do_chore.md
@@ -1,5 +1,5 @@
 ---
-name: 🧹 Do a chore
+name: 🧹 Do Chore
 about: Documentation updates, code refactoring, or other chores.
 title: "CHORE:"
 labels: chore
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
index 1381813a..144293f7 100644
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -1,5 +1,5 @@
 ---
-name: 🛠️ Request a feature
+name: 🛠️ Feature Request
 about: Request a new feature or an improvement to an existing feature.
 title: "FEAT:"
 labels: enhancement
diff --git a/.github/workflows/server-ci.yml b/.github/workflows/database-ci.yml
similarity index 98%
rename from .github/workflows/server-ci.yml
rename to .github/workflows/database-ci.yml
index b1b84cae..70125487 100644
--- a/.github/workflows/server-ci.yml
+++ b/.github/workflows/database-ci.yml
@@ -1,4 +1,4 @@
-name: CI checks for the server
+name: Test Database
 
 on:
   workflow_dispatch:
diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml
index 5c0b32ae..ce142f58 100644
--- a/.github/workflows/publish-docs.yml
+++ b/.github/workflows/publish-docs.yml
@@ -1,4 +1,4 @@
-name: Publish documentation
+name: Publish Docs
 
 on:
   workflow_dispatch:
diff --git a/.prettierrc.yml b/.prettierrc.yml
index 1ab6bdbe..751b51d5 100644
--- a/.prettierrc.yml
+++ b/.prettierrc.yml
@@ -2,5 +2,6 @@ bracketSpacing: true
 singleQuote: false
 trailingComma: "none"
 semi: false
+tabWidth: 2
 printWidth: 80
 proseWrap: "always"
diff --git a/Cargo.lock b/Cargo.lock
index 9ecce5c0..7cad23ed 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,1869 +2,6 @@ # It is not intended for manual editing.
version = 3 -[[package]] -name = "addr2line" -version = "0.22.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e4503c46a5c0c7844e948c9a4d6acd9f50cccb4de1c48eb9e291ea17470c678" -dependencies = [ - "gimli", -] - -[[package]] -name = "adler" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" - -[[package]] -name = "ahash" -version = "0.8.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" -dependencies = [ - "cfg-if", - "const-random", - "getrandom", - "once_cell", - "version_check", - "zerocopy", -] - -[[package]] -name = "aho-corasick" -version = "1.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" -dependencies = [ - "memchr", -] - -[[package]] -name = "android-tzdata" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" - -[[package]] -name = "android_system_properties" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" -dependencies = [ - "libc", -] - -[[package]] -name = "anyhow" -version = "1.0.86" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da" - -[[package]] -name = "arrow" -version = "52.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ae9728f104939be6d8d9b368a354b4929b0569160ea1641f0721b55a861ce38" -dependencies = [ - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-csv", - "arrow-data", - "arrow-ipc", - "arrow-json", - "arrow-ord", - "arrow-row", - "arrow-schema", - "arrow-select", - "arrow-string", -] - -[[package]] -name = "arrow-arith" -version = "52.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7029a5b3efbeafbf4a12d12dc16b8f9e9bff20a410b8c25c5d28acc089e1043" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "chrono", - "half", - "num", -] - -[[package]] -name = "arrow-array" -version = "52.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d33238427c60271710695f17742f45b1a5dc5bcfc5c15331c25ddfe7abf70d97" -dependencies = [ - "ahash", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "chrono", - "half", - "hashbrown 0.14.5", - "num", -] - -[[package]] -name = "arrow-buffer" -version = "52.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe9b95e825ae838efaf77e366c00d3fc8cca78134c9db497d6bda425f2e7b7c1" -dependencies = [ - "bytes", - "half", - "num", -] - -[[package]] -name = "arrow-cast" -version = "52.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cf8385a9d5b5fcde771661dd07652b79b9139fea66193eda6a88664400ccab" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", - "atoi", - "base64 0.22.1", - "chrono", - "half", - "lexical-core", - "num", - "ryu", -] - -[[package]] -name = "arrow-csv" -version = "52.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"cea5068bef430a86690059665e40034625ec323ffa4dd21972048eebb0127adc" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-schema", - "chrono", - "csv", - "csv-core", - "lazy_static", - "lexical-core", - "regex", -] - -[[package]] -name = "arrow-data" -version = "52.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb29be98f987bcf217b070512bb7afba2f65180858bca462edf4a39d84a23e10" -dependencies = [ - "arrow-buffer", - "arrow-schema", - "half", - "num", -] - -[[package]] -name = "arrow-ipc" -version = "52.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffc68f6523970aa6f7ce1dc9a33a7d9284cfb9af77d4ad3e617dbe5d79cc6ec8" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-schema", - "flatbuffers", -] - -[[package]] -name = "arrow-json" -version = "52.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2041380f94bd6437ab648e6c2085a045e45a0c44f91a1b9a4fe3fed3d379bfb1" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-schema", - "chrono", - "half", - "indexmap 2.2.6", - "lexical-core", - "num", - "serde", - "serde_json", -] - -[[package]] -name = "arrow-ord" -version = "52.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcb56ed1547004e12203652f12fe12e824161ff9d1e5cf2a7dc4ff02ba94f413" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", - "half", - "num", -] - -[[package]] -name = "arrow-row" -version = "52.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "575b42f1fc588f2da6977b94a5ca565459f5ab07b60545e17243fb9a7ed6d43e" -dependencies = [ - "ahash", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "half", - "hashbrown 0.14.5", -] - -[[package]] -name = "arrow-schema" -version = "52.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32aae6a60458a2389c0da89c9de0b7932427776127da1a738e2efc21d32f3393" -dependencies = [ - "serde", -] - -[[package]] -name = "arrow-select" -version = "52.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de36abaef8767b4220d7b4a8c2fe5ffc78b47db81b03d77e2136091c3ba39102" -dependencies = [ - "ahash", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "num", -] - -[[package]] -name = "arrow-string" -version = "52.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e435ada8409bcafc910bc3e0077f532a4daa20e99060a496685c0e3e53cc2597" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", - "memchr", - "num", - "regex", - "regex-syntax", -] - -[[package]] -name = "async-stream" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd56dd203fef61ac097dd65721a419ddccb106b2d2b70ba60a6b529f03961a51" -dependencies = [ - "async-stream-impl", - "futures-core", - "pin-project-lite", -] - -[[package]] -name = "async-stream-impl" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "async-trait" -version = "0.1.80" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"c6fa2087f2753a7da8cc1c0dbfcf89579dd57458e36769de5ac750b4671737ca" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "atoi" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" -dependencies = [ - "num-traits", -] - -[[package]] -name = "autocfg" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" - -[[package]] -name = "axum" -version = "0.6.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b829e4e32b91e643de6eafe82b1d90675f5874230191a4ffbc1b336dec4d6bf" -dependencies = [ - "async-trait", - "axum-core", - "bitflags 1.3.2", - "bytes", - "futures-util", - "http", - "http-body", - "hyper", - "itoa", - "matchit", - "memchr", - "mime", - "percent-encoding", - "pin-project-lite", - "rustversion", - "serde", - "sync_wrapper", - "tower", - "tower-layer", - "tower-service", -] - -[[package]] -name = "axum-core" -version = "0.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "759fa577a247914fd3f7f76d62972792636412fbfd634cd452f6a385a74d2d2c" -dependencies = [ - "async-trait", - "bytes", - "futures-util", - "http", - "http-body", - "mime", - "rustversion", - "tower-layer", - "tower-service", -] - -[[package]] -name = "backtrace" -version = "0.3.73" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cc23269a4f8976d0a4d2e7109211a419fe30e8d88d677cd60b6bc79c5732e0a" -dependencies = [ - "addr2line", - "cc", - "cfg-if", - "libc", - "miniz_oxide", - "object", - "rustc-demangle", -] - -[[package]] -name = "base64" -version = "0.21.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" - -[[package]] -name = "base64" -version = "0.22.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" - -[[package]] -name = "bincode" -version = "1.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" -dependencies = [ - "serde", -] - -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - -[[package]] -name = "bitflags" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" - -[[package]] -name = "bumpalo" -version = "3.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" - -[[package]] -name = "bytes" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" - -[[package]] -name = "cc" -version = "1.0.99" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96c51067fd44124faa7f870b4b1c969379ad32b2ba805aa959430ceaa384f695" - -[[package]] -name = "cfg-if" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" - -[[package]] 
-name = "chrono" -version = "0.4.38" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401" -dependencies = [ - "android-tzdata", - "iana-time-zone", - "num-traits", - "windows-targets 0.52.5", -] - -[[package]] -name = "const-random" -version = "0.1.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" -dependencies = [ - "const-random-macro", -] - -[[package]] -name = "const-random-macro" -version = "0.1.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" -dependencies = [ - "getrandom", - "once_cell", - "tiny-keccak", -] - -[[package]] -name = "core-foundation-sys" -version = "0.8.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" - -[[package]] -name = "crossbeam-deque" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" -dependencies = [ - "crossbeam-epoch", - "crossbeam-utils", -] - -[[package]] -name = "crossbeam-epoch" -version = "0.9.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" -dependencies = [ - "crossbeam-utils", -] - -[[package]] -name = "crossbeam-utils" -version = "0.8.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" - -[[package]] -name = "crunchy" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" - -[[package]] -name = "csv" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" -dependencies = [ - "csv-core", - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "csv-core" -version = "0.1.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" -dependencies = [ - "memchr", -] - -[[package]] -name = "either" -version = "1.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" - -[[package]] -name = "equivalent" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" - -[[package]] -name = "errno" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" -dependencies = [ - "libc", - "windows-sys 0.52.0", -] - -[[package]] -name = "fastrand" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" - -[[package]] -name = "fixedbitset" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" - -[[package]] -name = "flatbuffers" -version = "24.3.25" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "8add37afff2d4ffa83bc748a70b4b1370984f6980768554182424ef71447c35f" -dependencies = [ - "bitflags 1.3.2", - "rustc_version", -] - -[[package]] -name = "fnv" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" - -[[package]] -name = "futures-channel" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" -dependencies = [ - "futures-core", -] - -[[package]] -name = "futures-core" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" - -[[package]] -name = "futures-sink" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" - -[[package]] -name = "futures-task" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" - -[[package]] -name = "futures-util" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" -dependencies = [ - "futures-core", - "futures-task", - "pin-project-lite", - "pin-utils", -] - -[[package]] -name = "getrandom" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" -dependencies = [ - "cfg-if", - "libc", - "wasi", -] - -[[package]] -name = "gimli" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd" - -[[package]] -name = "h2" -version = "0.3.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8" -dependencies = [ - "bytes", - "fnv", - "futures-core", - "futures-sink", - "futures-util", - "http", - "indexmap 2.2.6", - "slab", - "tokio", - "tokio-util", - "tracing", -] - -[[package]] -name = "half" -version = "2.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" -dependencies = [ - "cfg-if", - "crunchy", - "num-traits", -] - -[[package]] -name = "hashbrown" -version = "0.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" - -[[package]] -name = "hashbrown" -version = "0.14.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" - -[[package]] -name = "heck" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" - -[[package]] -name = "hermit-abi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" - -[[package]] -name = "http" -version = "0.2.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" -dependencies = [ - "bytes", - "fnv", - "itoa", -] - -[[package]] -name = "http-body" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" -dependencies = [ - "bytes", - "http", - "pin-project-lite", -] - -[[package]] -name = "httparse" -version = "1.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9" - -[[package]] -name = "httpdate" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" - -[[package]] -name = "hyper" -version = "0.14.29" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f361cde2f109281a220d4307746cdfd5ee3f410da58a70377762396775634b33" -dependencies = [ - "bytes", - "futures-channel", - "futures-core", - "futures-util", - "h2", - "http", - "http-body", - "httparse", - "httpdate", - "itoa", - "pin-project-lite", - "socket2", - "tokio", - "tower-service", - "tracing", - "want", -] - -[[package]] -name = "hyper-timeout" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" -dependencies = [ - "hyper", - "pin-project-lite", - "tokio", - "tokio-io-timeout", -] - -[[package]] -name = "iana-time-zone" -version = "0.1.60" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141" -dependencies = [ - "android_system_properties", - "core-foundation-sys", - "iana-time-zone-haiku", - "js-sys", - "wasm-bindgen", - "windows-core", -] - -[[package]] -name = "iana-time-zone-haiku" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" -dependencies = [ - "cc", -] - -[[package]] -name = "indexmap" -version = "1.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" -dependencies = [ - "autocfg", - "hashbrown 0.12.3", -] - -[[package]] -name = "indexmap" -version = "2.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" -dependencies = [ - "equivalent", - "hashbrown 0.14.5", -] - -[[package]] -name = "itertools" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" -dependencies = [ - "either", -] - -[[package]] -name = "itoa" -version = "1.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" - -[[package]] -name = "js-sys" -version = "0.3.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" -dependencies = [ - "wasm-bindgen", -] - -[[package]] -name = "lazy_static" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" - -[[package]] -name = "lexical-core" -version = "0.8.5" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "2cde5de06e8d4c2faabc400238f9ae1c74d5412d03a7bd067645ccbc47070e46" -dependencies = [ - "lexical-parse-float", - "lexical-parse-integer", - "lexical-util", - "lexical-write-float", - "lexical-write-integer", -] - -[[package]] -name = "lexical-parse-float" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "683b3a5ebd0130b8fb52ba0bdc718cc56815b6a097e28ae5a6997d0ad17dc05f" -dependencies = [ - "lexical-parse-integer", - "lexical-util", - "static_assertions", -] - -[[package]] -name = "lexical-parse-integer" -version = "0.8.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d0994485ed0c312f6d965766754ea177d07f9c00c9b82a5ee62ed5b47945ee9" -dependencies = [ - "lexical-util", - "static_assertions", -] - -[[package]] -name = "lexical-util" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5255b9ff16ff898710eb9eb63cb39248ea8a5bb036bea8085b1a767ff6c4e3fc" -dependencies = [ - "static_assertions", -] - -[[package]] -name = "lexical-write-float" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accabaa1c4581f05a3923d1b4cfd124c329352288b7b9da09e766b0668116862" -dependencies = [ - "lexical-util", - "lexical-write-integer", - "static_assertions", -] - -[[package]] -name = "lexical-write-integer" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1b6f3d1f4422866b68192d62f77bc5c700bee84f3069f2469d7bc8c77852446" -dependencies = [ - "lexical-util", - "static_assertions", -] - -[[package]] -name = "libc" -version = "0.2.155" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" - -[[package]] -name = "libm" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" - -[[package]] -name = "linux-raw-sys" -version = "0.4.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" - -[[package]] -name = "log" -version = "0.4.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" - -[[package]] -name = "matchit" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" - -[[package]] -name = "memchr" -version = "2.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" - -[[package]] -name = "mime" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" - -[[package]] -name = "miniz_oxide" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08" -dependencies = [ - "adler", -] - -[[package]] -name = "mio" -version = "0.8.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" -dependencies = [ - "libc", - "wasi", - "windows-sys 0.48.0", -] - -[[package]] -name 
= "multimap" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "defc4c55412d89136f966bbb339008b474350e5e6e78d2714439c386b3137a03" - -[[package]] -name = "num" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" -dependencies = [ - "num-bigint", - "num-complex", - "num-integer", - "num-iter", - "num-rational", - "num-traits", -] - -[[package]] -name = "num-bigint" -version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c165a9ab64cf766f73521c0dd2cfdff64f488b8f0b3e621face3462d3db536d7" -dependencies = [ - "num-integer", - "num-traits", -] - -[[package]] -name = "num-complex" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" -dependencies = [ - "num-traits", -] - -[[package]] -name = "num-integer" -version = "0.1.46" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" -dependencies = [ - "num-traits", -] - -[[package]] -name = "num-iter" -version = "0.1.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" -dependencies = [ - "autocfg", - "num-integer", - "num-traits", -] - -[[package]] -name = "num-rational" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" -dependencies = [ - "num-bigint", - "num-integer", - "num-traits", -] - -[[package]] -name = "num-traits" -version = "0.2.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" -dependencies = [ - "autocfg", - "libm", -] - -[[package]] -name = "num_cpus" -version = "1.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" -dependencies = [ - "hermit-abi", - "libc", -] - [[package]] name = "oasysdb" version = "0.7.0" -dependencies = [ - "arrow", - "arrow-schema", - "bincode", - "prost", - "rand", - "rayon", - "regex", - "serde", - "tokio", - "tonic", - "tonic-build", - "uuid", -] - -[[package]] -name = "object" -version = "0.36.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "576dfe1fc8f9df304abb159d767a29d0476f7750fbf8aa7ad07816004a207434" -dependencies = [ - "memchr", -] - -[[package]] -name = "once_cell" -version = "1.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" - -[[package]] -name = "percent-encoding" -version = "2.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" - -[[package]] -name = "petgraph" -version = "0.6.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" -dependencies = [ - "fixedbitset", - "indexmap 2.2.6", -] - -[[package]] -name = "pin-project" -version = "1.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b6bf43b791c5b9e34c3d182969b4abb522f9343702850a2e57f460d00d09b4b3" -dependencies = [ - "pin-project-internal", -] - -[[package]] -name = "pin-project-internal" -version = "1.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "pin-project-lite" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" - -[[package]] -name = "pin-utils" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" - -[[package]] -name = "ppv-lite86" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" - -[[package]] -name = "prettyplease" -version = "0.2.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f12335488a2f3b0a83b14edad48dca9879ce89b2edd10e80237e4e852dd645e" -dependencies = [ - "proc-macro2", - "syn", -] - -[[package]] -name = "proc-macro2" -version = "1.0.86" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" -dependencies = [ - "unicode-ident", -] - -[[package]] -name = "prost" -version = "0.12.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "deb1435c188b76130da55f17a466d252ff7b1418b2ad3e037d127b94e3411f29" -dependencies = [ - "bytes", - "prost-derive", -] - -[[package]] -name = "prost-build" -version = "0.12.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4" -dependencies = [ - "bytes", - "heck", - "itertools", - "log", - "multimap", - "once_cell", - "petgraph", - "prettyplease", - "prost", - "prost-types", - "regex", - "syn", - "tempfile", -] - -[[package]] -name = "prost-derive" -version = "0.12.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81bddcdb20abf9501610992b6759a4c888aef7d1a7247ef75e2404275ac24af1" -dependencies = [ - "anyhow", - "itertools", - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "prost-types" -version = "0.12.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9091c90b0a32608e984ff2fa4091273cbdd755d54935c51d520887f4a1dbd5b0" -dependencies = [ - "prost", -] - -[[package]] -name = "quote" -version = "1.0.36" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" -dependencies = [ - "proc-macro2", -] - -[[package]] -name = "rand" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" -dependencies = [ - "libc", - "rand_chacha", - "rand_core", -] - -[[package]] -name = "rand_chacha" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" -dependencies = [ - "ppv-lite86", - "rand_core", -] - -[[package]] -name = "rand_core" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" -dependencies = [ - "getrandom", -] - -[[package]] -name = "rayon" -version = "1.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" -dependencies = [ - "either", - "rayon-core", -] - -[[package]] -name = "rayon-core" -version = "1.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" -dependencies = [ - "crossbeam-deque", - "crossbeam-utils", -] - -[[package]] -name = "regex" -version = "1.10.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f" -dependencies = [ - "aho-corasick", - "memchr", - "regex-automata", - "regex-syntax", -] - -[[package]] -name = "regex-automata" -version = "0.4.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" -dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", -] - -[[package]] -name = "regex-syntax" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" - -[[package]] -name = "rustc-demangle" -version = "0.1.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" - -[[package]] -name = "rustc_version" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" -dependencies = [ - "semver", -] - -[[package]] -name = "rustix" -version = "0.38.34" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" -dependencies = [ - "bitflags 2.5.0", - "errno", - "libc", - "linux-raw-sys", - "windows-sys 0.52.0", -] - -[[package]] -name = "rustversion" -version = "1.0.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6" - -[[package]] -name = "ryu" -version = "1.0.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" - -[[package]] -name = "semver" -version = "1.0.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" - -[[package]] -name = "serde" -version = "1.0.203" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094" -dependencies = [ - "serde_derive", -] - -[[package]] -name = "serde_derive" -version = "1.0.203" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "serde_json" -version = "1.0.117" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "455182ea6142b14f93f4bc5320a2b31c1f266b66a4a5c858b013302a5d8cbfc3" -dependencies = [ - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "slab" -version = "0.4.9" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" -dependencies = [ - "autocfg", -] - -[[package]] -name = "socket2" -version = "0.5.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" -dependencies = [ - "libc", - "windows-sys 0.52.0", -] - -[[package]] -name = "static_assertions" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" - -[[package]] -name = "syn" -version = "2.0.67" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff8655ed1d86f3af4ee3fd3263786bc14245ad17c4c7e85ba7187fb3ae028c90" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "sync_wrapper" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" - -[[package]] -name = "tempfile" -version = "3.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" -dependencies = [ - "cfg-if", - "fastrand", - "rustix", - "windows-sys 0.52.0", -] - -[[package]] -name = "tiny-keccak" -version = "2.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" -dependencies = [ - "crunchy", -] - -[[package]] -name = "tokio" -version = "1.38.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a" -dependencies = [ - "backtrace", - "bytes", - "libc", - "mio", - "num_cpus", - "pin-project-lite", - "socket2", - "tokio-macros", - "windows-sys 0.48.0", -] - -[[package]] -name = "tokio-io-timeout" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30b74022ada614a1b4834de765f9bb43877f910cc8ce4be40e89042c9223a8bf" -dependencies = [ - "pin-project-lite", - "tokio", -] - -[[package]] -name = "tokio-macros" -version = "2.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "tokio-stream" -version = "0.1.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "267ac89e0bec6e691e5813911606935d77c476ff49024f98abcea3e7b15e37af" -dependencies = [ - "futures-core", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "tokio-util" -version = "0.7.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cf6b47b3771c49ac75ad09a6162f53ad4b8088b76ac60e8ec1455b31a189fe1" -dependencies = [ - "bytes", - "futures-core", - "futures-sink", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "tonic" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76c4eb7a4e9ef9d4763600161f12f5070b92a578e1b634db88a6887844c91a13" -dependencies = [ - "async-stream", - "async-trait", - "axum", - "base64 0.21.7", - "bytes", - "h2", - "http", - "http-body", - "hyper", - "hyper-timeout", - "percent-encoding", - "pin-project", - "prost", - "tokio", - "tokio-stream", - "tower", - "tower-layer", - "tower-service", - "tracing", -] - 
-[[package]] -name = "tonic-build" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be4ef6dd70a610078cb4e338a0f79d06bc759ff1b22d2120c2ff02ae264ba9c2" -dependencies = [ - "prettyplease", - "proc-macro2", - "prost-build", - "quote", - "syn", -] - -[[package]] -name = "tower" -version = "0.4.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" -dependencies = [ - "futures-core", - "futures-util", - "indexmap 1.9.3", - "pin-project", - "pin-project-lite", - "rand", - "slab", - "tokio", - "tokio-util", - "tower-layer", - "tower-service", - "tracing", -] - -[[package]] -name = "tower-layer" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0" - -[[package]] -name = "tower-service" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" - -[[package]] -name = "tracing" -version = "0.1.40" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" -dependencies = [ - "pin-project-lite", - "tracing-attributes", - "tracing-core", -] - -[[package]] -name = "tracing-attributes" -version = "0.1.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "tracing-core" -version = "0.1.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" -dependencies = [ - "once_cell", -] - -[[package]] -name = "try-lock" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" - -[[package]] -name = "unicode-ident" -version = "1.0.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" - -[[package]] -name = "uuid" -version = "1.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5de17fd2f7da591098415cff336e12965a28061ddace43b59cb3c430179c9439" -dependencies = [ - "getrandom", -] - -[[package]] -name = "version_check" -version = "0.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" - -[[package]] -name = "want" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" -dependencies = [ - "try-lock", -] - -[[package]] -name = "wasi" -version = "0.11.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" - -[[package]] -name = "wasm-bindgen" -version = "0.2.92" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" -dependencies = [ - "cfg-if", - "wasm-bindgen-macro", -] - -[[package]] -name = "wasm-bindgen-backend" -version = "0.2.92" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" -dependencies = [ - "bumpalo", - "log", - "once_cell", - "proc-macro2", - "quote", - "syn", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-macro" -version = "0.2.92" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" -dependencies = [ - "quote", - "wasm-bindgen-macro-support", -] - -[[package]] -name = "wasm-bindgen-macro-support" -version = "0.2.92" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" -dependencies = [ - "proc-macro2", - "quote", - "syn", - "wasm-bindgen-backend", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-shared" -version = "0.2.92" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" - -[[package]] -name = "windows-core" -version = "0.52.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" -dependencies = [ - "windows-targets 0.52.5", -] - -[[package]] -name = "windows-sys" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" -dependencies = [ - "windows-targets 0.48.5", -] - -[[package]] -name = "windows-sys" -version = "0.52.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" -dependencies = [ - "windows-targets 0.52.5", -] - -[[package]] -name = "windows-targets" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" -dependencies = [ - "windows_aarch64_gnullvm 0.48.5", - "windows_aarch64_msvc 0.48.5", - "windows_i686_gnu 0.48.5", - "windows_i686_msvc 0.48.5", - "windows_x86_64_gnu 0.48.5", - "windows_x86_64_gnullvm 0.48.5", - "windows_x86_64_msvc 0.48.5", -] - -[[package]] -name = "windows-targets" -version = "0.52.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" -dependencies = [ - "windows_aarch64_gnullvm 0.52.5", - "windows_aarch64_msvc 0.52.5", - "windows_i686_gnu 0.52.5", - "windows_i686_gnullvm", - "windows_i686_msvc 0.52.5", - "windows_x86_64_gnu 0.52.5", - "windows_x86_64_gnullvm 0.52.5", - "windows_x86_64_msvc 0.52.5", -] - -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" - -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.52.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" - -[[package]] -name = "windows_aarch64_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" - -[[package]] -name = "windows_aarch64_msvc" -version = "0.52.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" - -[[package]] -name = "windows_i686_gnu" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" - -[[package]] -name = "windows_i686_gnu" -version = "0.52.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" - -[[package]] -name = "windows_i686_gnullvm" -version = "0.52.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" - -[[package]] -name = "windows_i686_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" - -[[package]] -name = "windows_i686_msvc" -version = "0.52.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" - -[[package]] -name = "windows_x86_64_gnu" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" - -[[package]] -name = "windows_x86_64_gnu" -version = "0.52.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" - -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" - -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.52.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" - -[[package]] -name = "windows_x86_64_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" - -[[package]] -name = "windows_x86_64_msvc" -version = "0.52.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" - -[[package]] -name = "zerocopy" -version = "0.7.34" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae87e3fcd617500e5d106f0380cf7b77f3c6092aae37191433159dda23cfb087" -dependencies = [ - "zerocopy-derive", -] - -[[package]] -name = "zerocopy-derive" -version = "0.7.34" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15e934569e47891f7d9411f1a451d947a60e000ab3bd24fbb970f000387d1b3b" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] diff --git a/Cargo.toml b/Cargo.toml index 032d179b..08d123f3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,28 +15,7 @@ repository = "https://github.com/oasysai/oasysdb" keywords = ["vector", "database", "anns", "search", "simd"] categories = ["database", "algorithms", "data-structures"] -[dependencies.tokio] -version = "1.38.0" -features = ["macros", "rt-multi-thread"] - [dependencies] -uuid = { version = "1.9.1", features = ["v4", "rng"] } -arrow-schema = { version = "52.0.0", features = ["serde"] } -arrow = "52.0.0" -rayon = "1.10.0" -regex = "1.10.5" -rand = "0.8.5" - -# gRPC stuff. -prost = "0.12.6" -tonic = "0.11.0" - -# Serialization. 
-serde = { version = "1.0.203", features = ["derive"] }
-bincode = "1.3.3"
-
-[build-dependencies]
-tonic-build = "0.11.0"
 
 [profile.release]
 lto = true
diff --git a/build.rs b/build.rs
deleted file mode 100644
index 5e0f7ef3..00000000
--- a/build.rs
+++ /dev/null
@@ -1,4 +0,0 @@
-fn main() -> Result<(), Box<dyn std::error::Error>> {
-    tonic_build::compile_protos("protos/database.proto")?;
-    Ok(())
-}
diff --git a/docs/blog/posts/overhauling_oasysdb.md b/docs/blog/posts/overhauling_oasysdb.md
deleted file mode 100644
index 26e905f3..00000000
--- a/docs/blog/posts/overhauling_oasysdb.md
+++ /dev/null
@@ -1,65 +0,0 @@
----
-date: 2024-06-22
-
-authors:
-  - edwinkys
-
-categories:
-  - Log
----
-
-# DevLog #1: OasysDB Overhaul
-
-OasysDB is a project that I started in January of this year, and honestly, it
-has been an incredible learning experience. With it, I've gained extensive
-experience in databases, machine learning algorithms, and low-level programming
-concepts. But with this knowledge, I realize that the current design of OasysDB
-is not enough for production use.
-
-<!-- more -->
-
-After careful consideration, I've decided to rewrite OasysDB from the ground up.
-The new version will be designed to incorporate all the essential features
-needed for a production-ready vector database system.
-
-This includes, but is not limited to:
-
-- Transitioning from an embedded to a client-server model for better scalability
-  and isolation.
-- Designing an efficient storage engine tailored for analytical production
-  workloads.
-- Implementing concurrent query processing to improve throughput and reduce
-  latency.
-- Utilizing advanced vector indexing algorithms for enhanced recall performance,
-  especially in hybrid search scenarios.
-- Incorporating an industry-standard query planner and optimizer to enhance
-  query performance.
-- Enhancing documentation and testing to ensure the system's robustness and
-  reliability.
-
-Here's a high-level overview of the new architecture:
-
-![OasysDB Architecture](https://i.postimg.cc/QdVVSs3M/Infrastructure.png)
-
-## Progress Update
-
-Today, I started working on the new version of OasysDB. I've established the
-project structure, implemented the foundational data structures for the
-collection and storage engine, and set up the initial framework for
-client-server communication.
-
-I will be posting regular updates (once or twice a week) on my progress, which
-may include in-depth explorations of the system's technical aspects. If you want
-to follow along with the development process, you can find the project on
-GitHub: [OasysDB](https://github.com/oasysai/oasysdb).
-
-## Conclusion
-
-I'm really excited about the potential of the new OasysDB and the challenges
-that lie ahead. I believe this overhaul will lead to a robust and scalable
-vector database system perfect for a wide range of AI applications.
-
-If you're into databases and AI, I encourage you to follow along with the
-development process as I share my insights, challenges, and victories in this
-DevLog series. If you have experience in this field, your feedback and
-suggestions would be greatly appreciated.
diff --git a/docs/code_of_conduct.md b/docs/code_of_conduct.md
index 66f76607..5037ebf8 100644
--- a/docs/code_of_conduct.md
+++ b/docs/code_of_conduct.md
@@ -116,17 +116,17 @@ community.
 
 This Code of Conduct is adapted from the [Contributor Covenant][homepage],
 version 2.1, available at
-[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
+[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][source].
 
 Community Impact Guidelines were inspired by [Mozilla's code of conduct
-enforcement ladder][Mozilla CoC].
+enforcement ladder][mozilla_coc].
 
 For answers to common questions about this code of conduct, see the FAQ at
-[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
+[https://www.contributor-covenant.org/faq][faq]. Translations are available at
 [https://www.contributor-covenant.org/translations][translations].
 
 [homepage]: https://www.contributor-covenant.org
-[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
-[Mozilla CoC]: https://github.com/mozilla/diversity
-[FAQ]: https://www.contributor-covenant.org/faq
+[source]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
+[mozilla_coc]: https://github.com/mozilla/diversity
+[faq]: https://www.contributor-covenant.org/faq
 [translations]: https://www.contributor-covenant.org/translations
diff --git a/docs/concepts/terms.md b/docs/concepts/terms.md
index 652d6d55..89c353f5 100644
--- a/docs/concepts/terms.md
+++ b/docs/concepts/terms.md
@@ -75,19 +75,21 @@ points based on the distances between their embeddings:
 4 -> 3, 2
 ```
 
-### ANNS
+## ANNS
 
 ANNS is a technique for efficiently finding the nearest data points to a given
 query, albeit approximately. While it may not always return the exact nearest
 data points, ANNS provides results that are close enough. This probabilistic
 approach balances accuracy with efficiency.
 
-Imagine we have a query with specific constraints:
+Let's take the index we have created in the previous section as an example.
+Imagine we have a query with these specific constraints:
 
 - Find the closest data to [0.0, 0.9].
 - Calculate a maximum of 2 distances using the Euclidean distance formula.
 
-Here's how we utilize the index created above to find the closest data point:
+Here's how we can utilize the index to find the closest data point based on
+these constraints:
 
 1. We start at a random data point, say 4, which is linked to 3 and 2.
 2. We calculate the distances and find that 2 is closer to [0.0, 0.9] than 3.
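The greedy walk described in the hunk above is straightforward to express in code. Here is a toy sketch of the same idea; the embeddings and the entry point are made up for illustration, and only the `4 -> 3, 2` link structure comes from the docs:

```rust
use std::collections::HashMap;

/// Euclidean distance between two embeddings.
fn euclidean(a: &[f32], b: &[f32]) -> f32 {
    a.iter().zip(b).map(|(x, y)| (x - y).powi(2)).sum::<f32>().sqrt()
}

fn main() {
    // Made-up embeddings; only the graph shape mirrors the example index.
    let data: HashMap<u32, Vec<f32>> = HashMap::from([
        (2, vec![0.1, 0.8]),
        (3, vec![0.9, 0.2]),
        (4, vec![0.5, 0.4]),
    ]);
    let neighbors: HashMap<u32, Vec<u32>> = HashMap::from([(4, vec![3, 2])]);

    let query = [0.0, 0.9];
    let budget = 2; // maximum number of distance computations allowed

    // Start at data point 4 and keep whichever of its neighbors, within
    // the distance budget, lies closest to the query.
    let entry = 4u32;
    let best = neighbors[&entry]
        .iter()
        .copied()
        .take(budget)
        .min_by(|&a, &b| {
            euclidean(&data[&a], &query)
                .partial_cmp(&euclidean(&data[&b], &query))
                .unwrap()
        })
        .unwrap();

    println!("closest candidate to {query:?}: {best}"); // prints 2
}
```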
+[GitHub Issues][gh_issues]. Please include as much information as possible in
+your issue. This includes:
 
 - A description of the bug or feature request.
 - If it's a bug, steps to reproduce the bug. If it's a feature request, include
@@ -57,23 +55,8 @@ We'd love to hear about it!
 
 Getting started with OasysDB development is pretty straightforward. First, you
 will need to have Rust installed on your machine. We recommend using
-[rustup](https://www.rust-lang.org/tools/install) to install Rust. We also
-recommend having rust-analyzer installed for your code editor for a better
-development experience.
-
-OasysDB utilizes many third-party crates to provide its functionality. These are
-some of the most important ones and the resources you can use to learn more
-about them:
-
-- [**Apache Arrow**](https://arrow.apache.org): Arrow is a cross-language
-  development platform for in-memory columnar data format for efficient analytic
-  operations.
-- [**Rayon**](https://github.com/rayon-rs/rayon): Rayon is a data parallelism
-  library for Rust that provides a simple and efficient API for parallelizing
-  computation.
-- [**Tonic**](https://github.com/hyperium/tonic): Tonic is a
-  [gRPC](https://grpc.io/docs/) over HTTP/2 implementation focused on high
-  performance and flexibility built on top of the Tokio asynchronous runtime.
+[rustup][rustup] to install Rust. We also recommend having rust-analyzer
+installed for your code editor for a better development experience.
 
 TODO: Complete the getting started guide.
 
@@ -81,12 +64,10 @@
 
 We mostly use the default linting and style guide for Rust except for some
 linting changes listed in the rustfmt.toml file. For more information about the code
-style, see the
-[Rust Style Guide](https://doc.rust-lang.org/beta/style-guide/index.html).
+style, see the [Rust Style Guide][style_guide].
 
-For commit messages, we use the
-[Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/) format.
-This allows us to maintain consistency and readability in our Git commit
+For commit messages, we use the [Conventional Commits][conventional_commits]
+format. This allows us to maintain consistency and readability in our Git commit
 history.
 
 When commenting your code, please try your best to write comments that are clear
@@ -100,9 +81,8 @@
 
 Once you have made your changes, you can submit a pull request. We will review
 your pull request and provide feedback. If your pull request is accepted, we
 will merge it into the main branch.
 
-For organization purposes, we ask that you use the
-[Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/) format
-for your pull request title in lowercase:
+For organization purposes, we ask that you use the [Conventional
+Commits][conventional_commits] format for your pull request title in lowercase:
 
 ```
 <type>: <description>
 ```
@@ -119,7 +99,13 @@ fix: fix issue ...
 
 Thank you for taking the time to read this documentation. We look forward to
 your contributions! Another way to support this project is to star the repository,
-share it with your circles, and join us on
-[Discord](https://discord.gg/bDhQrkqNP4).
+share it with your circles, and join us on [Discord][discord].
 
 Best regards,
 Edwin Kys
+
+[discord]: https://discord.gg/bDhQrkqNP4
+[gh_issues]: https://github.com/oasysai/oasysdb/issues
+[gh_discussions]: https://github.com/oasysai/oasysdb/discussions
+[rustup]: https://www.rust-lang.org/tools/install
+[style_guide]: https://doc.rust-lang.org/beta/style-guide/index.html
+[conventional_commits]: https://www.conventionalcommits.org/en/v1.0.0/
diff --git a/protos/database.proto b/protos/database.proto
deleted file mode 100644
index 0fb30e6d..00000000
--- a/protos/database.proto
+++ /dev/null
@@ -1,72 +0,0 @@
-syntax = "proto3";
-package database;
-
-import "google/protobuf/empty.proto";
-
-service Database {
-    rpc CreateCollection(CreateCollectionRequest) returns (google.protobuf.Empty);
-    rpc DeleteCollection(DeleteCollectionRequest) returns (google.protobuf.Empty);
-
-    rpc AddFields(AddFieldsRequest) returns (google.protobuf.Empty);
-    rpc RemoveFields(RemoveFieldsRequest) returns (google.protobuf.Empty);
-
-    rpc InsertRecords(InsertRecordsRequest) returns (google.protobuf.Empty);
-}
-
-// region CreateCollection
-    message CreateCollectionRequest {
-        string name = 1;
-    }
-
-// region DeleteCollection
-    message DeleteCollectionRequest {
-        string name = 1;
-    }
-
-// region AddFields
-    message Field {
-        string name = 1;
-        string datatype = 2;
-        bool nullable = 3;
-    }
-
-    message AddFieldsRequest {
-        string collection_name = 1;
-        repeated Field fields = 2;
-    }
-
-// region RemoveFields
-    message RemoveFieldsRequest {
-        string collection_name = 1;
-        repeated string field_names = 2;
-    }
-
-// region InsertRecords
-    message Record {
-        repeated Data data = 1;
-    }
-
-    // The goal is to simulate a batch insert operation in SQL.
-    message InsertRecordsRequest {
-        string collection_name = 1;
-        repeated string field_names = 2;
-        repeated Record records = 3;
-    }
-
-// Custom reusable data types.
-
-message Vector {
-    repeated float values = 1;
-}
-
-message Data {
-    // This value type should match the data types supported
-    // by OasysDB in the types/metadata.rs file.
-    oneof value {
-        string string_value = 1;
-        int32 integer_value = 2;
-        bool boolean_value = 3;
-        float float_value = 4;
-        Vector vector_value = 5;
-    }
-}
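Since `InsertRecordsRequest` ships row-oriented records while the server stores columns, it may help to see what a client-side request looks like. Below is a minimal sketch assuming the prost-generated Rust types for the proto above (as produced by `tonic_build`); the collection name, field names, and values are hypothetical:

```rust
// A minimal sketch of building an InsertRecordsRequest on the client side,
// assuming the prost-generated `proto` module for the file above.
use proto::data::Value;
use proto::{Data, InsertRecordsRequest, Record, Vector};

fn build_request() -> InsertRecordsRequest {
    // Roughly equivalent to: INSERT INTO movies (vector, title) VALUES (...).
    let row = Record {
        data: vec![
            Data { value: Some(Value::VectorValue(Vector { values: vec![0.1, 0.9] })) },
            Data { value: Some(Value::StringValue("The Matrix".into())) },
        ],
    };

    InsertRecordsRequest {
        collection_name: "movies".into(),
        field_names: vec!["vector".into(), "title".into()],
        records: vec![row],
    }
}
```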
diff --git a/src/db/collection.rs b/src/db/collection.rs
deleted file mode 100644
index 93e683dc..00000000
--- a/src/db/collection.rs
+++ /dev/null
@@ -1,347 +0,0 @@
-use super::*;
-use array::downcast_array;
-use arrow::compute::concat_batches;
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct CollectionState {
-    pub batch_size: usize,
-    pub count: usize,
-    pub dimension: usize,
-    pub schema: Schema,
-    pub dir: Directory,
-    /// Tracker of the next internal ID to assign to a record.
-    next_id: u32,
-}
-
-impl CollectionState {
-    /// Creates a new collection state.
-    /// - `root`: Root directory for the collection.
-    fn new(root: PathBuf) -> Result<Self, Error> {
-        let field_id = Field::new("internal_id", DataType::Int32, false);
-
-        let vector_type = MetadataType::Vector.into();
-        let field_vector = Field::new("vector", vector_type, false);
-
-        let mut state = Self {
-            schema: Schema::new(vec![field_id, field_vector]),
-            dir: Directory::new(root),
-            batch_size: 1000,
-            count: 0,
-            dimension: 0,
-            next_id: 1,
-        };
-
-        state.create_data_file()?;
-        Ok(state)
-    }
-
-    fn create_data_file(&mut self) -> Result<PathBuf, Error> {
-        // The filename would be something like: cdata0000001.
-        let index = self.dir.data_files.len() + 1;
-        let filename = format!("cdata{index:0>7}");
-        let data_file = self.dir.root.join(filename);
-
-        let schema_ref = Arc::new(self.schema.clone());
-
-        // Create a new data file with an empty record batch.
-
-        let file = OpenOptions::new()
-            .write(true)
-            .create(true)
-            .truncate(true)
-            .open(&data_file)?;
-
-        let writer = BufWriter::new(file);
-        let mut file_writer = FileWriter::try_new(writer, &schema_ref)?;
-
-        let record = RecordBatch::new_empty(schema_ref);
-        file_writer.write(&record)?;
-        file_writer.finish()?;
-
-        self.dir.data_files.push(data_file.clone());
-        Ok(data_file)
-    }
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct Directory {
-    pub root: PathBuf,
-    pub state_file: PathBuf,
-    pub data_files: Vec<PathBuf>,
-}
-
-impl Directory {
-    fn new(root: PathBuf) -> Self {
-        let state_file = root.join("cstate");
-        Self { root, state_file, data_files: vec![] }
-    }
-}
-
-pub struct Collection {
-    state: Lock<CollectionState>,
-}
-
-impl Collection {
-    pub fn open(dir: PathBuf) -> Result<Self, Error> {
-        if !dir.try_exists()? {
-            fs::create_dir_all(&dir)?;
-        }
-
-        let state_file = dir.join("cstate");
-        let state = if !state_file.try_exists()? {
-            Self::initialize_state(&dir)?
-        } else {
-            Self::read_state(&state_file)?
-        };
-
-        let state = Lock::new(state);
-        let collection = Self { state };
-        Ok(collection)
-    }
-
-    pub fn add_fields(&self, fields: impl Into<Fields>) -> Result<(), Error> {
-        let mut state = self.state.write()?;
-
-        // OasysDB doesn't support adding fields to a non-empty
-        // collection due to the nature of the indexing system.
-        if state.count > 0 {
-            let code = ErrorCode::ClientError;
-            let message = "Unable to add fields to a non-empty collection";
-            return Err(Error::new(&code, message));
-        }
-
-        // Create a new schema with the new field.
-        let schema = &state.schema;
-        let schemas = vec![schema.clone(), Schema::new(fields)];
-        let new_schema = Schema::try_merge(schemas)?;
-
-        // Update the state and data.
-        state.schema = new_schema;
-        *state = state.clone();
-
-        drop(state);
-        self.persist_state()?;
-        Ok(())
-    }
-
-    pub fn remove_fields(&self, field_names: &[String]) -> Result<(), Error> {
-        let mut state = self.state.write()?;
-        let schema = &state.schema;
-
-        // Just like adding fields, removing fields from a non-empty
-        // collection is not supported in OasysDB.
-        if state.count > 0 {
-            let code = ErrorCode::ClientError;
-            let message = "Unable to remove fields from a non-empty collection";
-            return Err(Error::new(&code, message));
-        }
-
-        // OasysDB has 2 default fields which can't be removed:
-        // internal_id and vector.
-        let default = ["internal_id", "vector"];
-        if field_names.iter().any(|name| default.contains(&name.as_str())) {
-            let code = ErrorCode::ClientError;
-            let message = "Unable to remove default fields";
-            return Err(Error::new(&code, message));
-        }
-
-        // Check if all the fields to be removed exist in the schema.
-        // Abort if any of the fields do not exist.
-        if field_names.iter().any(|name| schema.fields.find(name).is_none()) {
-            let code = ErrorCode::ClientError;
-            let message = "One or more fields do not exist in the schema.";
-            return Err(Error::new(&code, message));
-        }
-
-        let fields = schema
-            .all_fields()
-            .into_iter()
-            .filter(|field| !field_names.contains(field.name()))
-            .cloned()
-            .collect::<Vec<_>>();
-
-        // Create a new schema without the specified fields.
-        let new_schema = Schema::new(fields);
-
-        // Update the state and data.
-        state.schema = new_schema;
-        *state = state.clone();
-
-        drop(state);
-        self.persist_state()?;
-        Ok(())
-    }
-
-    pub fn insert_records(
-        &self,
-        field_names: &[String],
-        records: &[Arc<dyn Array>],
-    ) -> Result<(), Error> {
-        let mut state = self.state.write()?;
-
-        let mut record_map: HashMap<String, Arc<dyn Array>> = field_names
-            .iter()
-            .enumerate()
-            .map(|(i, name)| (name.clone(), records[i].clone()))
-            .collect();
-
-        // It's safe to unwrap here because the vector field has been checked in
-        // the database service before calling this method.
-        let vector_array = record_map.get("vector").unwrap();
-
-        let data_size = vector_array.len();
-        let dimension = {
-            let array: ListArray = downcast_array(vector_array.as_ref());
-            let vector: Float32Array = downcast_array(array.value(0).as_ref());
-            vector.len()
-        };
-
-        if dimension == 0 {
-            let code = ErrorCode::ClientError;
-            let message = "Vector cannot be empty";
-            return Err(Error::new(&code, message));
-        }
-
-        // If it's the first record, we need to update the dimension.
-        if state.count == 0 && state.dimension == 0 {
-            state.dimension = dimension;
-        }
-
-        // Ensure all vectors have the same dimension.
-        self.validate_vectors(vector_array, dimension)?;
-
-        let schema = state.schema.clone();
-        let fields = schema.all_fields();
-
-        // Create a column array for internal_id.
-        let internal_id: Vec<Option<i32>> = (state.next_id..)
-            .take(data_size)
-            .map(|id| Some(id as i32))
-            .collect();
-        let internal_id_array = Arc::new(Int32Array::from(internal_id));
-
-        record_map.insert("internal_id".to_string(), internal_id_array);
-
-        // Check for missing fields in the record and create a
-        // column array for each missing field with null values.
-        // This is necessary to ensure that all fields are present.
-        let create_missing_array = |field: &Field| {
-            let data_type = field.data_type().clone().into();
-            let array = match data_type {
-                MetadataType::Integer => Int32Array::null_array(data_size),
-                MetadataType::Float => Float32Array::null_array(data_size),
-                MetadataType::String => StringArray::null_array(data_size),
-                MetadataType::Boolean => BooleanArray::null_array(data_size),
-                MetadataType::Vector => ListArray::null_array(data_size),
-            };
-
-            (field.name().to_string(), array as Arc<dyn Array>)
-        };
-
-        let missing_fields: HashMap<String, Arc<dyn Array>> = fields
-            .into_iter()
-            .filter(|field| !record_map.contains_key(field.name()))
-            .map(create_missing_array)
-            .collect();
-
-        // Merge the missing fields with the record map.
-        record_map.extend(missing_fields);
-
-        // Convert the record map to columns in order based on the schema.
-        let extract_array = |field: &Arc<Field>| {
-            let name = field.name();
-            let array = record_map.get(name).unwrap();
-            array.clone()
-        };
-
-        let columns = schema.fields.iter().map(extract_array).collect();
-
-        // Create a record batch from the record map.
-        let schemaref = Arc::new(schema.clone());
-        let record_batch = RecordBatch::try_new(schemaref.clone(), columns)?;
-
-        // OasysDB limits the number of record batches in a data file to 1.
-        // Per record batch, there can be a maximum of 1000 records by default.
-
-        // The behavior is as follows:
-        // 1. If the last data file is empty, write the record batch to it.
-        // 2. If the last data file is not empty, combine the last record batch
-        //    with the new record batch and write the combined record batch to
-        //    the last data file until it reaches the batch size.
-
-        let data_files = &mut state.dir.data_files;
-        let file_ops = FileOps::default();
-
-        // Also, we can unwrap here because the data files won't be None.
-        let last_data_file = data_files.last().unwrap();
-        let last_record_batch = file_ops.read_ipc_file(last_data_file)?;
-
-        let record_batch = if last_record_batch.num_rows() != 0 {
-            let batches = vec![&last_record_batch, &record_batch];
-            concat_batches(&schemaref, batches)?
-        } else {
-            record_batch
-        };
-
-        let mut files_to_write = vec![last_data_file.clone()];
-
-        // This determines the number of new files to create.
-        // Let's say the batch size is 1000 and the combined record batch
-        // has 1500 records. This means we need to create 1 new file because
-        // the first 1000 records will be written to the last data file and
-        // the remaining 500 records will be written to the new file.
-        let num_new_file = {
-            let size = record_batch.num_rows();
-            let remain = size.saturating_sub(state.batch_size) as f32;
-            let div = remain / state.batch_size as f32;
-            div.ceil() as usize
-        };
-
-        for _ in 0..num_new_file {
-            let data_file = state.create_data_file()?;
-            files_to_write.push(data_file);
-        }
-
-        FileOps::default().write_ipc_files(
-            &files_to_write,
-            &record_batch,
-            state.batch_size,
-        )?;
-
-        // Update and persist the state.
-        state.count += data_size;
-        state.next_id += data_size as u32;
-        *state = state.clone();
-
-        // Drop the state lock before persisting the state.
-        // This prevents deadlocks since persist_state also requires the lock.
-        drop(state);
-        self.persist_state()?;
-
-        Ok(())
-    }
-}
-
-impl StateMachine<CollectionState> for Collection {
-    fn initialize_state(
-        path: impl Into<PathBuf>,
-    ) -> Result<CollectionState, Error> {
-        let state = CollectionState::new(path.into())?;
-        FileOps::default().write_binary_file(&state.dir.state_file, &state)?;
-        Ok(state)
-    }
-
-    fn read_state(path: impl Into<PathBuf>) -> Result<CollectionState, Error> {
-        FileOps::default().read_binary_file(&path.into())
-    }
-
-    fn state(&self) -> Result<CollectionState, Error> {
-        Ok(self.state.read()?.clone())
-    }
-
-    fn persist_state(&self) -> Result<(), Error> {
-        let state = self.state.read()?.clone();
-        let file_ops = FileOps::default();
-        file_ops.write_binary_file(&state.dir.state_file, &state)
-    }
-}
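The `num_new_file` calculation in `insert_records` above is easy to misread, so here is the same math as a standalone sketch (the function name is mine, not part of the codebase), using the 1000/1500 example from the comments:

```rust
// A standalone sketch of the data-file math used by insert_records: given a
// batch size and the combined record count, how many new files are needed
// beyond the current last data file?
fn new_files_needed(total_rows: usize, batch_size: usize) -> usize {
    // Rows that do not fit into the current (last) data file.
    let overflow = total_rows.saturating_sub(batch_size) as f32;
    (overflow / batch_size as f32).ceil() as usize
}

fn main() {
    // 1500 combined rows with a batch size of 1000: the first 1000 rows stay
    // in the last data file and the remaining 500 go into 1 new file.
    assert_eq!(new_files_needed(1500, 1000), 1);
    assert_eq!(new_files_needed(1000, 1000), 0);
    assert_eq!(new_files_needed(2500, 1000), 2);
}
```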
diff --git a/src/db/collection_utils.rs b/src/db/collection_utils.rs
deleted file mode 100644
index 07fe3852..00000000
--- a/src/db/collection_utils.rs
+++ /dev/null
@@ -1,54 +0,0 @@
-use super::*;
-use array::downcast_array;
-use regex::Regex;
-
-impl Collection {
-    /// Validates the name of collections or fields.
-    pub fn validate_name(name: &str) -> Result<(), Error> {
-        if name.is_empty() {
-            let code = ErrorCode::ClientError;
-            let message = "Name cannot be empty";
-            return Err(Error::new(&code, message));
-        }
-
-        // We only allow lowercase letters and underscores in the names.
-        // Also, we can unwrap here because the regex pattern is hardcoded.
-        let re = Regex::new(r"^[a-z0-9_]+$").unwrap();
-        if !re.is_match(name) {
-            return Err(Error::new(
-                &ErrorCode::ClientError,
-                "Name must be lowercase letters with underscores.",
-            ));
-        }
-
-        Ok(())
-    }
-
-    /// Validates the vectors given a column array consisting of vectors.
-    /// This ensures that all vectors provided have the same dimension.
-    pub fn validate_vectors(
-        &self,
-        vectors: &Arc<dyn Array>,
-        dimension: usize,
-    ) -> Result<(), Error> {
-        let vector_array: ListArray = downcast_array(vectors.as_ref());
-
-        let is_dimension_mismatch = |array: Arc<dyn Array>| {
-            let vector: Float32Array = downcast_array(array.as_ref());
-            vector.len() != dimension
-        };
-
-        let dimension_mismatch = vector_array.iter().any(|array| match array {
-            Some(array) => is_dimension_mismatch(array),
-            None => true,
-        });
-
-        if dimension_mismatch {
-            let code = ErrorCode::ClientError;
-            let message = "Vectors must have the same dimension.";
-            return Err(Error::new(&code, message));
-        }
-
-        Ok(())
-    }
-}
diff --git a/src/db/database.rs b/src/db/database.rs
deleted file mode 100644
index 07c345c0..00000000
--- a/src/db/database.rs
+++ /dev/null
@@ -1,176 +0,0 @@
-use super::*;
-use uuid::Uuid;
-
-#[derive(Debug, Clone, Serialize, Deserialize, Default)]
-pub struct DatabaseState {
-    pub collection_refs: HashMap<String, PathBuf>,
-}
-
-struct Directory {
-    pub collections_dir: PathBuf,
-    pub state_file: PathBuf,
-}
-
-impl Directory {
-    fn new(root: PathBuf) -> Self {
-        let collections_dir = root.join("collections");
-        let state_file = root.join("dbstate");
-        Self { collections_dir, state_file }
-    }
-}
-
-pub struct Database {
-    dir: Directory,
-    state: Lock<DatabaseState>,
-}
-
-impl Database {
-    pub fn open(dir: PathBuf) -> Result<Self, Error> {
-        let dir = Directory::new(dir);
-
-        let state_file = &dir.state_file;
-        let state = if !state_file.try_exists()? {
-            // Creating a collection directory will create the root directory.
-            fs::create_dir_all(&dir.collections_dir)?;
-            Self::initialize_state(state_file)?
-        } else {
-            Self::read_state(state_file)?
-        };
-
-        let state = Lock::new(state);
-        let db = Self { dir, state };
-        Ok(db)
-    }
-}
-
-// This implementation block contains methods used by the gRPC server.
-// We do this to make it easier to test the database logic.
-impl Database {
-    pub fn _create_collection(&self, name: &str) -> Result<(), Error> {
-        Collection::validate_name(name)?;
-
-        // Check if the collection already exists.
-        let mut state = self.state.write()?;
-        if state.collection_refs.contains_key(name) {
-            let code = ErrorCode::ClientError;
-            let message = format!("Collection already exists: {name}");
-            return Err(Error::new(&code, &message));
-        }
-
-        // Create the collection directory.
-        let uuid = Uuid::new_v4().to_string();
-        let collection_dir = self.dir.collections_dir.join(uuid);
-
-        // Initialize the collection.
-        Collection::open(collection_dir.to_path_buf())?;
-
-        // Update the database state.
-        state.collection_refs.insert(name.to_string(), collection_dir);
-        *state = state.clone();
-
-        // Drop the lock to prevent deadlocks since
-        // persist_state also requires the lock.
-        drop(state);
-
-        self.persist_state()?;
-        Ok(())
-    }
-
-    pub fn _get_collection(&self, name: &str) -> Result<Collection, Error> {
-        let state = self.state.read()?;
-
-        if name.is_empty() {
-            let code = ErrorCode::ClientError;
-            let message = "Collection name cannot be empty";
-            return Err(Error::new(&code, message));
-        }
-
-        // Get the directory where the collection is
-        // persisted from the database state.
-        let dir = match state.collection_refs.get(name) {
-            Some(dir) => dir.clone(),
-            None => {
-                let code = ErrorCode::NotFoundError;
-                let message = format!("Collection not found: {name}");
-                return Err(Error::new(&code, &message));
-            }
-        };
-
-        Collection::open(dir)
-    }
-
-    pub fn _delete_collection(&self, name: &str) -> Result<(), Error> {
-        let mut state = self.state.write()?;
-
-        // This makes the method idempotent.
-        if !state.collection_refs.contains_key(name) {
-            return Ok(());
-        }
-
-        // Delete the collection directory.
-        // We can unwrap here because we checked if the collection exists.
-        let collection_dir = state.collection_refs.remove(name).unwrap();
-        fs::remove_dir_all(collection_dir)?;
-
-        // Update the database state.
-        *state = state.clone();
-        drop(state);
-
-        self.persist_state()?;
-        Ok(())
-    }
-
-    pub fn _add_fields(
-        &self,
-        collection_name: &str,
-        fields: impl Into<Fields>,
-    ) -> Result<(), Error> {
-        let collection = self._get_collection(collection_name)?;
-        collection.add_fields(fields)?;
-        Ok(())
-    }
-
-    pub fn _remove_fields(
-        &self,
-        collection_name: &str,
-        field_names: &[String],
-    ) -> Result<(), Error> {
-        let collection = self._get_collection(collection_name)?;
-        collection.remove_fields(field_names)?;
-        Ok(())
-    }
-
-    pub fn _insert_records(
-        &self,
-        collection_name: &str,
-        field_names: &[String],
-        records: &[Arc<dyn Array>],
-    ) -> Result<(), Error> {
-        let collection = self._get_collection(collection_name)?;
-        collection.insert_records(field_names, records)?;
-        Ok(())
-    }
-}
-
-impl StateMachine<DatabaseState> for Database {
-    fn initialize_state(
-        path: impl Into<PathBuf>,
-    ) -> Result<DatabaseState, Error> {
-        let state = DatabaseState::default();
-        FileOps::default().write_binary_file(&path.into(), &state)?;
-        Ok(state)
-    }
-
-    fn read_state(path: impl Into<PathBuf>) -> Result<DatabaseState, Error> {
-        FileOps::default().read_binary_file(&path.into())
-    }
-
-    fn state(&self) -> Result<DatabaseState, Error> {
-        Ok(self.state.read()?.clone())
-    }
-
-    fn persist_state(&self) -> Result<(), Error> {
-        let state = self.state.read()?.clone();
-        FileOps::default().write_binary_file(&self.dir.state_file, &state)
-    }
-}
diff --git a/src/db/database_service.rs b/src/db/database_service.rs
deleted file mode 100644
index 36cac474..00000000
--- a/src/db/database_service.rs
+++ /dev/null
@@ -1,131 +0,0 @@
-use super::*;
-use proto::database_server::Database as ProtoDatabase;
-
-#[tonic::async_trait]
-impl ProtoDatabase for Arc<Database> {
-    async fn create_collection(
-        &self,
-        request: Request<proto::CreateCollectionRequest>,
-    ) -> Result<Response<()>, Status> {
-        let request = request.into_inner();
-        self._create_collection(&request.name)?;
-        Ok(Response::new(()))
-    }
-
-    async fn delete_collection(
-        &self,
-        request: Request<proto::DeleteCollectionRequest>,
-    ) -> Result<Response<()>, Status> {
-        let request = request.into_inner();
-        self._delete_collection(&request.name)?;
-        Ok(Response::new(()))
-    }
-
-    async fn add_fields(
-        &self,
-        request: Request<proto::AddFieldsRequest>,
-    ) -> Result<Response<()>, Status> {
-        let request = request.into_inner();
-
-        // Construct Arrow fields from the request fields.
-        let mut fields = vec![];
-        for field in request.fields {
-            Collection::validate_name(&field.name)?;
-
-            // Use the MetadataType as a proxy to convert string to DataType.
-            let metadata_type: MetadataType = field.datatype.into();
-            let datatype: DataType = metadata_type.into();
-
-            let new_field = Field::new(&field.name, datatype, true);
-            fields.push(new_field);
-        }
-
-        self._add_fields(&request.collection_name, fields)?;
-        Ok(Response::new(()))
-    }
-
-    async fn remove_fields(
-        &self,
-        request: Request<proto::RemoveFieldsRequest>,
-    ) -> Result<Response<()>, Status> {
-        let request = request.into_inner();
-        self._remove_fields(&request.collection_name, &request.field_names)?;
-        Ok(Response::new(()))
-    }
-
-    async fn insert_records(
-        &self,
-        request: Request<proto::InsertRecordsRequest>,
-    ) -> Result<Response<()>, Status> {
-        let proto::InsertRecordsRequest {
-            collection_name,
-            field_names,
-            records,
-        } = request.into_inner();
-
-        if field_names.is_empty() {
-            return Err(Status::invalid_argument(
-                "At least one field name must be specified.",
-            ));
-        }
-
-        if !field_names.contains(&"vector".to_string()) {
-            return Err(Status::invalid_argument(
-                "The vector field must be specified.",
-            ));
-        }
-
-        // Check if the records provided match the number of fields.
-        // This is required since we try to simulate a batch insert like:
-        // INSERT INTO collection_name (field1, field2)
-        // VALUES
-        // (x1, y1),
-        // (x2, y2, z2) <- We should catch this error.
-        if records
-            .par_iter()
-            .any(|record| record.data.len() != field_names.len())
-        {
-            let message = "The number of values must match the fields.";
-            return Err(Status::invalid_argument(message));
-        }
-
-        let collection = self._get_collection(&collection_name)?;
-        let schema = collection.state()?.schema;
-        let fields = schema.fields;
-
-        // Check if the fields specified in the request exist in the schema.
-        if field_names.par_iter().any(|name| fields.find(name).is_none()) {
-            return Err(Status::invalid_argument(
-                "One or more fields specified do not exist in the schema.",
-            ));
-        }
-
-        // Convert records from row format to column format.
-        let mut columns = vec![vec![]; field_names.len()];
-        for record in records {
-            for (i, column) in columns.iter_mut().enumerate() {
-                let value = record.data[i].value.clone();
-                column.push(value);
-            }
-        }
-
-        // Convert columns to Arrow arrays.
-        let mut arrays = vec![];
-        for i in 0..field_names.len() {
-            let field = fields.find(&field_names[i]).unwrap().1;
-            let column = columns[i].clone();
-            let array = match field.data_type().clone().into() {
-                MetadataType::Boolean => BooleanArray::from_values(column)?,
-                MetadataType::Integer => Int32Array::from_values(column)?,
-                MetadataType::Float => Float32Array::from_values(column)?,
-                MetadataType::String => StringArray::from_values(column)?,
-                MetadataType::Vector => ListArray::from_values(column)?,
-            };
-
-            arrays.push(array);
-        }
-
-        self._insert_records(&collection_name, &field_names, &arrays)?;
-        Ok(Response::new(()))
-    }
-}
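The row-to-column pivot in `insert_records` is the core of the simulated batch insert. Here is a simplified standalone sketch of the same idea, using plain strings instead of the proto value types:

```rust
// A simplified sketch of the row-to-column pivot performed by the service
// above. Rows arrive ordered by field; columns leave ordered by field.
fn rows_to_columns(rows: Vec<Vec<String>>, num_fields: usize) -> Vec<Vec<String>> {
    let mut columns = vec![vec![]; num_fields];
    for row in rows {
        for (i, column) in columns.iter_mut().enumerate() {
            column.push(row[i].clone());
        }
    }
    columns
}

fn main() {
    // Two rows over the fields (title, year) become two columns.
    let rows = vec![
        vec!["The Matrix".to_string(), "1999".to_string()],
        vec!["Avatar".to_string(), "2009".to_string()],
    ];
    let columns = rows_to_columns(rows, 2);
    assert_eq!(columns[0], vec!["The Matrix", "Avatar"]);
    assert_eq!(columns[1], vec!["1999", "2009"]);
}
```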
diff --git a/src/db/mod.rs b/src/db/mod.rs
index 52c6b85e..8b137891 100644
--- a/src/db/mod.rs
+++ b/src/db/mod.rs
@@ -1,216 +1 @@
-use crate::proto;
-use crate::types::*;
-use array::{BooleanArray, Float32Array, Int32Array, ListArray, StringArray};
-use arrow::array::{self, Array};
-use arrow::datatypes::DataType;
-use arrow::ipc::writer::FileWriter;
-use arrow::record_batch::RecordBatch;
-use arrow_schema::{Field, Fields, Schema};
-use rayon::prelude::*;
-use serde::{Deserialize, Serialize};
-use std::collections::HashMap;
-use std::fs::{self, OpenOptions};
-use std::io::BufWriter;
-use std::path::PathBuf;
-use std::sync::{Arc, RwLock as Lock};
-use tonic::{Request, Response, Status};
-
-mod collection;
-mod collection_utils;
-mod database;
-mod database_service;
-
-pub use collection::*;
-pub use database::*;
-
-type ProtoValue = proto::data::Value;
-
-/// A trait for objects that own a state that should be persisted to disk.
-/// - `T`: Type of the state object.
-///
-/// Please refer to the implementation of the StateMachine trait for
-/// Database and Collection for more details.
-pub trait StateMachine<T> {
-    /// Initializes the state object and persists it to a file.
-    /// This method should be called only once when the object is created.
-    fn initialize_state(path: impl Into<PathBuf>) -> Result<T, Error>;
-
-    /// Reads the state object from a file.
-    fn read_state(path: impl Into<PathBuf>) -> Result<T, Error>;
-
-    /// Returns a reference to the state object.
-    fn state(&self) -> Result<T, Error>;
-
-    /// Persists the state object to a file.
-    fn persist_state(&self) -> Result<(), Error>;
-}
-
-pub trait ArrayUtils {
-    fn from_values(
-        values: Vec<Option<ProtoValue>>,
-    ) -> Result<Arc<dyn Array>, Error>;
-
-    /// Creates an array filled with null values.
-    fn null_array(len: usize) -> Arc<dyn Array>;
-}
-
-pub trait ListArrayUtils {
-    fn from_vectors(values: Vec<Vec<f32>>) -> Arc<ListArray>;
-}
-
-impl ArrayUtils for BooleanArray {
-    fn from_values(
-        values: Vec<Option<ProtoValue>>,
-    ) -> Result<Arc<dyn Array>, Error> {
-        let parse_boolean = |value: Option<ProtoValue>| match value {
-            Some(ProtoValue::BooleanValue(value)) => Some(value),
-            _ => None,
-        };
-
-        let values: Vec<Option<bool>> =
-            values.into_par_iter().map(parse_boolean).collect();
-        Ok(Arc::new(BooleanArray::from(values)))
-    }
-
-    fn null_array(len: usize) -> Arc<dyn Array> {
-        Arc::new(BooleanArray::from(vec![None; len]))
-    }
-}
-
-impl ArrayUtils for Float32Array {
-    fn from_values(
-        values: Vec<Option<ProtoValue>>,
-    ) -> Result<Arc<dyn Array>, Error> {
-        let parse_float = |value: Option<ProtoValue>| match value {
-            Some(ProtoValue::FloatValue(value)) => Some(value),
-            _ => None,
-        };
-
-        let values: Vec<Option<f32>> =
-            values.into_par_iter().map(parse_float).collect();
-        Ok(Arc::new(Float32Array::from(values)))
-    }
-
-    fn null_array(len: usize) -> Arc<dyn Array> {
-        Arc::new(Float32Array::from(vec![None; len]))
-    }
-}
-
-impl ArrayUtils for Int32Array {
-    fn from_values(
-        values: Vec<Option<ProtoValue>>,
-    ) -> Result<Arc<dyn Array>, Error> {
-        let parse_int = |value: Option<ProtoValue>| match value {
-            Some(ProtoValue::IntegerValue(value)) => Some(value),
-            _ => None,
-        };
-
-        let values: Vec<Option<i32>> =
-            values.into_par_iter().map(parse_int).collect();
-        Ok(Arc::new(Int32Array::from(values)))
-    }
-
-    fn null_array(len: usize) -> Arc<dyn Array> {
-        Arc::new(Int32Array::from(vec![None; len]))
-    }
-}
-
-impl ArrayUtils for StringArray {
-    fn from_values(
-        values: Vec<Option<ProtoValue>>,
-    ) -> Result<Arc<dyn Array>, Error> {
-        let parse_string = |value: Option<ProtoValue>| match value {
-            Some(ProtoValue::StringValue(value)) => Some(value),
-            _ => None,
-        };
-
-        let values: Vec<Option<String>> =
-            values.into_par_iter().map(parse_string).collect();
-        Ok(Arc::new(StringArray::from(values)))
-    }
-
-    fn null_array(len: usize) -> Arc<dyn Array> {
-        let source: Vec<Option<String>> = vec![None; len];
-        Arc::new(StringArray::from(source))
-    }
-}
-
-impl ArrayUtils for ListArray {
-    fn from_values(
-        values: Vec<Option<ProtoValue>>,
-    ) -> Result<Arc<dyn Array>, Error> {
-        let parse_vector = |value: Option<ProtoValue>| match value {
-            Some(ProtoValue::VectorValue(value)) => Some(value.values),
-            _ => None,
-        };
-
-        let values: Vec<Option<Vec<f32>>> =
-            values.into_par_iter().map(parse_vector).collect();
-
-        // Find the dimension of the vector.
-        let dimension = values
-            .clone()
-            .into_par_iter()
-            .map(|value| value.unwrap_or_default().len())
-            .max()
-            // 1024 is the default capacity for generic array builders.
-            .unwrap_or(1024);
-
-        // Create builders to construct the ListArray.
-        let mut list_builder = {
-            let float_builder = Float32Array::builder(dimension);
-            let field = Field::new("element", DataType::Float32, false);
-            array::ListBuilder::new(float_builder).with_field(field)
-        };
-
-        // Insert values into the builder.
-        for value in values {
-            match value {
-                Some(values) => {
-                    list_builder.values().append_slice(&values);
-                    list_builder.append(true);
-                }
-                None => list_builder.append(false),
-            }
-        }
-
-        let array = list_builder.finish();
-        Ok(Arc::new(array))
-    }
-
-    fn null_array(len: usize) -> Arc<dyn Array> {
-        let mut builder = {
-            // We can use 0 capacity since we are not going to append any values.
-            let float_builder = Float32Array::builder(0);
-            let field = Field::new("element", DataType::Float32, false);
-            array::ListBuilder::new(float_builder).with_field(field)
-        };
-
-        for _ in 0..len {
-            builder.append(false);
-        }
-
-        let array = builder.finish();
-        Arc::new(array)
-    }
-}
-
-impl ListArrayUtils for ListArray {
-    fn from_vectors(values: Vec<Vec<f32>>) -> Arc<ListArray> {
-        let dimension = values[0].len();
-
-        let mut list_builder = {
-            let float_builder = Float32Array::builder(dimension);
-            let field = Field::new("element", DataType::Float32, false);
-            array::ListBuilder::new(float_builder).with_field(field)
-        };
-
-        for value in values {
-            list_builder.values().append_slice(&value);
-            list_builder.append(true);
-        }
-
-        let array = list_builder.finish();
-        Arc::new(array)
-    }
-}
diff --git a/src/indices/mod.rs b/src/indices/mod.rs
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/src/indices/mod.rs
@@ -0,0 +1 @@
+
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 00000000..87c27719
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,2 @@
+#[cfg(test)]
+mod tests;
diff --git a/src/main.rs b/src/main.rs
index f89f5708..f328e4d9 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,32 +1 @@
-#![allow(clippy::enum_variant_names)]
-
-mod db;
-mod proto;
-mod types;
-
-#[cfg(test)]
-mod tests;
-
-use db::*;
-use proto::database_server::DatabaseServer;
-use std::path::PathBuf;
-use std::sync::Arc;
-use tonic::transport::Server;
-
-const HOST: &str = "0.0.0.0";
-const PORT: u16 = 2525;
-
-#[tokio::main]
-async fn main() -> Result<(), Box<dyn std::error::Error>> {
-    let addr = format!("{HOST}:{PORT}").parse()?;
-
-    let path = PathBuf::from("odb_data");
-    let database = Arc::new(Database::open(path)?);
-
-    Server::builder()
-        .add_service(DatabaseServer::new(database))
-        .serve(addr)
-        .await?;
-
-    Ok(())
-}
+fn main() {}
diff --git a/src/proto.rs b/src/proto.rs
deleted file mode 100644
index 3b00fad1..00000000
--- a/src/proto.rs
+++ /dev/null
@@ -1 +0,0 @@
-tonic::include_proto!("database");
diff --git a/src/tests/mod.rs b/src/tests/mod.rs
index 94792833..8b137891 100644
--- a/src/tests/mod.rs
+++ b/src/tests/mod.rs
@@ -1,67 +1 @@
-use crate::db::*;
-use crate::types::*;
-use arrow::array::{self, Array};
-use arrow::datatypes::{DataType, Field};
-use rand::random;
-use std::fs;
-use std::path::PathBuf;
-use std::sync::Arc;
-
-mod stress_test_database;
-mod test_database;
-
-const TEST_DIR: &str = "odb_data";
-const TEST_COLLECTION: &str = "collection";
-
-fn create_test_database() -> Result<Database, Error> {
-    // Reset the database directory for testing.
-    let path = PathBuf::from(TEST_DIR);
-    if path.exists() {
-        fs::remove_dir_all(&path)?;
-    }
-
-    // The database should have some subdirectories.
-    let db = Database::open(path.clone())?;
-    let content = path.read_dir()?;
-    assert!(content.count() == 2);
-
-    // Create a test collection.
-    db._create_collection(TEST_COLLECTION)?;
-
-    // Add a couple of fields to the collection.
-    let field_title = Field::new("title", DataType::Utf8, true);
-    let field_year = Field::new("year", DataType::Int32, true);
-    db._add_fields(TEST_COLLECTION, vec![field_title, field_year])?;
-
-    Ok(db)
-}
-
-fn create_test_database_with_data() -> Result<Database, Error> {
-    let db = create_test_database()?;
-    populate_database(db)
-}
-
-fn generate_random_vectors(dimension: usize, len: usize) -> Vec<Vec<f32>> {
-    (0..len)
-        .map(|_| (0..dimension).map(|_| random::<f32>()).collect())
-        .collect()
-}
-
-fn populate_database(database: Database) -> Result<Database, Error> {
-    let fields = ["vector", "title", "year"];
-    let field_names: Vec<String> =
-        fields.iter().map(|f| f.to_string()).collect();
-
-    let vectors = generate_random_vectors(128, 3);
-    let titles = vec!["The Matrix", "Avatar", "Inception"];
-    let years = vec![1999, 2009, 2010];
-
-    let records = vec![
-        Arc::new(array::ListArray::from_vectors(vectors)) as Arc<dyn Array>,
-        Arc::new(array::StringArray::from(titles)) as Arc<dyn Array>,
-        Arc::new(array::Int32Array::from(years)) as Arc<dyn Array>,
-    ];
-
-    database._insert_records(TEST_COLLECTION, &field_names, &records)?;
-    Ok(database)
-}
diff --git a/src/tests/stress_test_database.rs b/src/tests/stress_test_database.rs
deleted file mode 100644
index 1b5267db..00000000
--- a/src/tests/stress_test_database.rs
+++ /dev/null
@@ -1,32 +0,0 @@
-use super::*;
-
-const RECORDS_LEN: usize = 10_000;
-
-#[test]
-fn test_database_insert_many_records() -> Result<(), Error> {
-    let path = PathBuf::from(TEST_DIR);
-    if path.exists() {
-        fs::remove_dir_all(&path)?;
-    }
-
-    let database = Database::open(path)?;
-
-    let collection_name = "collection";
-    database._create_collection(collection_name)?;
-
-    let fields = vec!["vector".to_string()];
-    let vectors = generate_random_vectors(128, RECORDS_LEN);
-    let records = vec![
-        Arc::new(array::ListArray::from_vectors(vectors)) as Arc<dyn Array>
-    ];
-
-    database._insert_records(collection_name, &fields, &records)?;
-
-    let state = database.state()?;
-    let collection_dir = &state.collection_refs[collection_name];
-    let collection = Collection::open(collection_dir.clone())?;
-    let state = collection.state()?;
-    assert_eq!(state.count, RECORDS_LEN);
-
-    Ok(())
-}
diff --git a/src/tests/test_database.rs b/src/tests/test_database.rs
deleted file mode 100644
index 0525f509..00000000
--- a/src/tests/test_database.rs
+++ /dev/null
@@ -1,90 +0,0 @@
-use super::*;
-
-#[test]
-fn test_database_create_collection() -> Result<(), Error> {
-    let db = create_test_database()?;
-    let name = "new_collection";
-    db._create_collection(name)?;
-
-    let state = db.state()?;
-    assert!(state.collection_refs.contains_key(name));
-    Ok(())
-}
-
-#[test]
-fn test_database_delete_collection() -> Result<(), Error> {
-    let db = create_test_database()?;
-    db._delete_collection(TEST_COLLECTION)?;
-
-    let state = db.state()?;
-    assert!(!state.collection_refs.contains_key(TEST_COLLECTION));
-    Ok(())
-}
-
-#[test]
-fn test_database_add_fields() -> Result<(), Error> {
-    let database = create_test_database()?;
-    let state = database.state()?;
-    let dir = &state.collection_refs[TEST_COLLECTION];
-
-    let field = Field::new("id", DataType::Utf8, false);
-    database._add_fields(TEST_COLLECTION, vec![field])?;
-
-    let collection = Collection::open(dir.clone())?;
-    let schema = collection.state()?.schema;
-    assert!(schema.fields().find("id").is_some());
-
-    Ok(())
-}
-
-#[test]
-#[should_panic]
-fn test_database_remove_default_fields() {
-    let database = create_test_database().unwrap();
-    let fields = ["internal_id".to_string()];
-    database._remove_fields(TEST_COLLECTION, &fields).unwrap();
-}
-
-#[test]
-fn test_database_remove_fields() -> Result<(), Error> {
-    let database = create_test_database()?;
-    let state = database.state()?;
-    let dir = &state.collection_refs[TEST_COLLECTION];
-
-    let fields = ["title".to_string()];
-    database._remove_fields(TEST_COLLECTION, &fields)?;
-
-    let collection = Collection::open(dir.clone())?;
-    let schema = collection.state()?.schema;
-    assert!(schema.fields().find("title").is_none());
-
-    Ok(())
-}
-
-#[test]
-fn test_database_insert_records() -> Result<(), Error> {
-    let database = create_test_database_with_data()?;
-    let state = database.state()?;
-    let dir = &state.collection_refs[TEST_COLLECTION];
-
-    let fields = ["vector", "title", "year"];
-    let fields: Vec<String> = fields.iter().map(|f| f.to_string()).collect();
-
-    let vectors = generate_random_vectors(128, 2);
-    let titles = vec!["Interstellar", "Avengers: Endgame"];
-    let years = vec![2014, 2019];
-
-    let records = vec![
-        Arc::new(array::ListArray::from_vectors(vectors)) as Arc<dyn Array>,
-        Arc::new(array::StringArray::from(titles)) as Arc<dyn Array>,
-        Arc::new(array::Int32Array::from(years)) as Arc<dyn Array>,
-    ];
-
-    database._insert_records(TEST_COLLECTION, &fields, &records)?;
-
-    let collection = Collection::open(dir.clone())?;
-    let state = collection.state()?;
-    assert_eq!(state.count, 5);
-
-    Ok(())
-}
diff --git a/src/types/error.rs b/src/types/error.rs
deleted file mode 100644
index 554878b8..00000000
--- a/src/types/error.rs
+++ /dev/null
@@ -1,94 +0,0 @@
-use std::fmt::{Display, Formatter, Result};
-
-// Other error types.
-use arrow::error::ArrowError;
-use bincode::ErrorKind as BincodeError;
-use std::error::Error as StandardError;
-use std::io::Error as IOError;
-use std::sync::PoisonError;
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum ErrorCode {
-    ArrowError,
-    ConcurrencyError,
-    FileError,
-    SerializationError,
-    StandardError,
-
-    // Tonic-related error codes.
-    ClientError,
-    NotFoundError,
-}
-
-#[derive(Debug)]
-pub struct Error {
-    pub code: ErrorCode,
-    pub message: String,
-}
-
-impl Error {
-    pub fn new(code: &ErrorCode, message: &str) -> Self {
-        Self { code: *code, message: message.to_string() }
-    }
-}
-
-impl Display for Error {
-    fn fmt(&self, f: &mut Formatter) -> Result {
-        let code = &self.code;
-        let message = &self.message;
-        write!(f, "{code:?}: {message}")
-    }
-}
-
-// Implement interoperability FROM other external error types.
-
-impl StandardError for Error {}
-
-impl From<Box<dyn StandardError>> for Error {
-    fn from(err: Box<dyn StandardError>) -> Self {
-        let code = ErrorCode::StandardError;
-        Error::new(&code, &err.to_string())
-    }
-}
-
-impl<T> From<PoisonError<T>> for Error {
-    fn from(err: PoisonError<T>) -> Self {
-        let code = ErrorCode::ConcurrencyError;
-        Error::new(&code, &err.to_string())
-    }
-}
-
-impl From<IOError> for Error {
-    fn from(err: IOError) -> Self {
-        let code = ErrorCode::FileError;
-        Error::new(&code, &err.to_string())
-    }
-}
-
-impl From<ArrowError> for Error {
-    fn from(err: ArrowError) -> Self {
-        let code = ErrorCode::ArrowError;
-        Error::new(&code, &err.to_string())
-    }
-}
-
-impl From<Box<BincodeError>> for Error {
-    fn from(err: Box<BincodeError>) -> Self {
-        let code = ErrorCode::SerializationError;
-        Error::new(&code, &err.to_string())
-    }
-}
-
-// Implement interoperability INTO other external error types.
-
-impl From<Error> for tonic::Status {
-    fn from(err: Error) -> Self {
-        let code = match err.code {
-            ErrorCode::ClientError => tonic::Code::InvalidArgument,
-            ErrorCode::NotFoundError => tonic::Code::NotFound,
-            _ => tonic::Code::Internal,
-        };
-
-        tonic::Status::new(code, err.message)
-    }
-}
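These `From` conversions are what let the gRPC handlers above use `?` directly on database results. A minimal sketch of how they compose, assuming the `Error` and `ErrorCode` types from this file (the functions themselves are illustrative, not part of the codebase):

```rust
// A made-up fallible lookup that produces a domain Error.
fn find_collection(name: &str) -> Result<(), Error> {
    let code = ErrorCode::NotFoundError;
    let message = format!("Collection not found: {name}");
    Err(Error::new(&code, &message))
}

// In a handler, `?` invokes From<Error> for tonic::Status, so the
// NotFoundError above surfaces to clients as tonic::Code::NotFound.
fn handler(name: &str) -> Result<(), tonic::Status> {
    find_collection(name)?;
    Ok(())
}
```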
diff --git a/src/types/file.rs b/src/types/file.rs
deleted file mode 100644
index 145037eb..00000000
--- a/src/types/file.rs
+++ /dev/null
@@ -1,147 +0,0 @@
-use super::error::{Error, ErrorCode};
-use arrow::array::RecordBatch;
-use arrow::ipc::reader::FileReader;
-use arrow::ipc::writer::FileWriter;
-use serde::de::DeserializeOwned;
-use serde::Serialize;
-use std::cmp::min;
-use std::env;
-use std::fs::{self, OpenOptions};
-use std::io::{BufReader, BufWriter};
-use std::path::PathBuf;
-
-/// A utility struct for reading and writing files.
-pub struct FileOps {
-    tmp_dir: PathBuf,
-}
-
-impl Default for FileOps {
-    fn default() -> Self {
-        let tmp_dir = env::temp_dir().join("oasysdb");
-        Self::new(tmp_dir)
-    }
-}
-
-impl FileOps {
-    pub fn new(tmp_dir: PathBuf) -> Self {
-        if !tmp_dir.exists() {
-            fs::create_dir_all(&tmp_dir)
-                .expect("Unable to create a temporary directory.")
-        }
-
-        Self { tmp_dir }
-    }
-
-    /// Reads a binary file and deserializes it into a type.
-    pub fn read_binary_file<T: DeserializeOwned>(
-        &self,
-        path: &PathBuf,
-    ) -> Result<T, Error> {
-        let file = OpenOptions::new().read(true).open(path)?;
-        let reader = BufReader::new(file);
-        bincode::deserialize_from(reader).map_err(Into::into)
-    }
-
-    /// Serializes a type and writes it to a binary file.
-    ///
-    /// The file is written to a temporary file first, then renamed
-    /// to make sure that the file is not corrupted if the operation fails.
-    pub fn write_binary_file<T: Serialize>(
-        &self,
-        path: &PathBuf,
-        data: &T,
-    ) -> Result<(), Error> {
-        let filename = self.parse_file_name(path)?;
-
-        // Write the data to a temporary file first.
-        // If this fails, the original file will not be overwritten.
-        let tmp_path = self.tmp_dir.join(filename);
-        let file = OpenOptions::new()
-            .write(true)
-            .create(true)
-            .truncate(true)
-            .open(&tmp_path)?;
-
-        let writer = BufWriter::new(file);
-        bincode::serialize_into(writer, data)?;
-
-        // If the serialization is successful, rename the temporary file.
-        fs::rename(&tmp_path, path)?;
-        Ok(())
-    }
-
-    pub fn read_ipc_file(&self, path: &PathBuf) -> Result<RecordBatch, Error> {
-        let file = OpenOptions::new().read(true).open(path)?;
-        let reader = BufReader::new(file);
-        let ipc_reader = FileReader::try_new(reader, None)?;
-        let schema = ipc_reader.schema();
-
-        // In OasysDB, there will be only one record batch per file.
-        let record_batch = match ipc_reader.last() {
-            Some(batch) => batch?,
-            _ => RecordBatch::new_empty(schema),
-        };
-
-        Ok(record_batch)
-    }
-
-    pub fn write_ipc_files(
-        &self,
-        paths: &[PathBuf],
-        data: &RecordBatch,
-        batch_size: usize,
-    ) -> Result<(), Error> {
-        let create_tmp_path = |path: &PathBuf| {
-            let filename = self.parse_file_name(path).unwrap();
-            self.tmp_dir.join(filename)
-        };
-
-        let tmp_paths: Vec<PathBuf> =
-            paths.iter().map(create_tmp_path).collect();
-
-        let schema = data.schema();
-
-        for (i, tmp_path) in tmp_paths.iter().enumerate() {
-            let file = OpenOptions::new()
-                .write(true)
-                .create(true)
-                .truncate(true)
-                .open(tmp_path)?;
-
-            let writer = BufWriter::new(file);
-            let mut ipc_writer = FileWriter::try_new(writer, &schema)?;
-
-            // This attempts to write the record batch in chunks.
-            // This is useful when the record batch is larger than
-            // the predefined batch size.
-            let batch = {
-                let offset = i * batch_size;
-                let length = min(batch_size, data.num_rows() - offset);
-                data.slice(offset, length)
-            };
-
-            // Write the record batch to the file.
-            ipc_writer.write(&batch)?;
-            ipc_writer.finish()?;
-        }
-
-        // If the serialization is successful, rename the temporary files.
-        for i in 0..tmp_paths.len() {
-            fs::rename(&tmp_paths[i], &paths[i])?;
-        }
-
-        Ok(())
-    }
-
-    /// Parses a file name from a path.
-    pub fn parse_file_name(&self, path: &PathBuf) -> Result<String, Error> {
-        path.file_name()
-            .and_then(|name| name.to_str())
-            .map(|name| name.to_string())
-            .ok_or_else(|| {
-                let code = ErrorCode::FileError;
-                let message = format!("Invalid file name from path: {path:?}");
-                Error::new(&code, &message)
-            })
-    }
-}
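The write-to-temp-then-rename trick used by `write_binary_file` and `write_ipc_files` is a general durability pattern worth isolating. A minimal standalone sketch under that assumption, with hypothetical paths:

```rust
use std::fs;
use std::io::Write;
use std::path::Path;

// Write the full payload to a sibling temp file, then rename it over the
// destination. If the write fails midway, the destination is never touched.
fn write_atomically(path: &Path, bytes: &[u8]) -> std::io::Result<()> {
    let tmp = path.with_extension("tmp");

    let mut file = fs::File::create(&tmp)?;
    file.write_all(bytes)?;
    file.sync_all()?;

    // fs::rename is atomic on most platforms as long as both paths live on
    // the same filesystem, so readers see either the old or the new content.
    fs::rename(&tmp, path)
}
```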
diff --git a/src/types/metadata.rs b/src/types/metadata.rs
deleted file mode 100644
index a798789f..00000000
--- a/src/types/metadata.rs
+++ /dev/null
@@ -1,66 +0,0 @@
-use arrow_schema::{DataType, Field};
-
-/// Data types supported in OasysDB Arrow fields.
-#[derive(Debug, Clone, PartialEq)]
-pub enum MetadataType {
-    Integer,
-    Float,
-    String,
-    Boolean,
-    Vector,
-}
-
-// Available OasysDB data types in string form.
-// This constant prevents typos in the code.
-const INTEGER: &str = "integer";
-const FLOAT: &str = "float";
-const STRING: &str = "string";
-const BOOLEAN: &str = "boolean";
-const VECTOR: &str = "vector";
-
-// Implement interoperability FROM and INTO other data types.
-
-impl From<&str> for MetadataType {
-    fn from(value: &str) -> Self {
-        match value {
-            INTEGER => MetadataType::Integer,
-            FLOAT => MetadataType::Float,
-            STRING => MetadataType::String,
-            BOOLEAN => MetadataType::Boolean,
-            VECTOR => MetadataType::Vector,
-            _ => panic!("Unsupported metadata type: {value}"),
-        }
-    }
-}
-
-impl From<String> for MetadataType {
-    fn from(value: String) -> Self {
-        MetadataType::from(value.as_str())
-    }
-}
-
-impl From<DataType> for MetadataType {
-    fn from(value: DataType) -> Self {
-        match value {
-            DataType::Int32 => MetadataType::Integer,
-            DataType::Float32 => MetadataType::Float,
-            DataType::Utf8 => MetadataType::String,
-            DataType::Boolean => MetadataType::Boolean,
-            DataType::List(_) => MetadataType::Vector,
-            _ => panic!("Unsupported data type: {value}"),
-        }
-    }
-}
-
-impl From<MetadataType> for DataType {
-    fn from(value: MetadataType) -> Self {
-        let field_float = Field::new("element", DataType::Float32, false);
-        match value {
-            MetadataType::Integer => DataType::Int32,
-            MetadataType::Float => DataType::Float32,
-            MetadataType::String => DataType::Utf8,
-            MetadataType::Boolean => DataType::Boolean,
-            MetadataType::Vector => DataType::List(field_float.into()),
-        }
-    }
-}
diff --git a/src/types/mod.rs b/src/types/mod.rs
index f42eb711..8b137891 100644
--- a/src/types/mod.rs
+++ b/src/types/mod.rs
@@ -1,7 +1 @@
-mod error;
-mod file;
-mod metadata;
-
-pub use error::*;
-pub use file::*;
-pub use metadata::*;

From f0e186b8f4226dfb816ac9ab993521c0f88846e9 Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Thu, 4 Jul 2024 20:50:35 -0500
Subject: [PATCH 39/88] chore: remove terms docs

---
 docs/concepts/terms.md | 179 -----
 mkdocs.yml             |   1 -
 2 files changed, 180 deletions(-)
 delete mode 100644 docs/concepts/terms.md

diff --git a/docs/concepts/terms.md b/docs/concepts/terms.md
deleted file mode 100644
index 89c353f5..00000000
--- a/docs/concepts/terms.md
+++ /dev/null
@@ -1,179 +0,0 @@
-# Terms
-
-If you're new to RAG, vector search, and related concepts, this documentation
-will guide you through the key terms and principles used in modern LLM-based
-applications.
-
-This documentation attempts to provide a very high-level overview of the key
-concepts and terms used in the LLM ecosystem. For a more in-depth understanding,
-we recommend reading other dedicated resources.
-
-With that said, let's get started!
-
-## Embedding
-
-Embedding is a way to represent unstructured data as numbers to capture the
-semantic meaning of the data. In the context of LLMs, embeddings are used to
-represent words, sentences, or documents.
-
-Let's say we have a couple of words that we want to represent as numbers. For
-simplicity, we will only consider 2 aspects of the words: edibility and
-affordability.
-
-| Word   | Edibility | Affordability | Label        |
-| ------ | --------- | ------------- | ------------ |
-| Apple  | 0.9       | 0.8           | Fruit        |
-| Apple  | 0.0       | 0.0           | Tech Company |
-| Banana | 0.8       | 0.8           | ?            |
-
-In the table above, we can roughly deduce that the first apple is a fruit, while
-the second apple refers to a tech company. If we were to deduce whether the
-banana here is a fruit or a tech company we have never heard of, we could
-roughly say that it's a fruit since its edibility and affordability values are
-similar to those of the first apple.
-
-In practice, embeddings are much more complex and have many more dimensions,
-often capturing various semantic properties beyond simple attributes like
-edibility and affordability. For instance, embeddings in models like Word2Vec,
-GloVe, BERT, or GPT-3 can have hundreds or thousands of dimensions. These
-embeddings are learned by neural networks and are used in numerous applications,
-such as search engines, recommendation systems, sentiment analysis, and machine
-translation.
-
-Moreover, modern LLMs use contextual embeddings, meaning the representation of a
-word depends on the context in which it appears. This allows the model to
-distinguish between different meanings of the same word based on its usage in a
-sentence.
-
-Note that embedding and vector are often used interchangeably in the context of
-LLMs.
-
-## Indexing
-
-Indexing is the process of organizing and storing data to optimize search and
-retrieval efficiency. In the context of RAG and vector search, indexing
-organizes data based on their embeddings.
-
-Let's consider 4 data points below with their respective embeddings representing
-features: alive and edible.
-
-| ID  | Embedding  | Data   |
-| --- | ---------- | ------ |
-| 1   | [0.0, 0.8] | Apple  |
-| 2   | [0.0, 0.7] | Banana |
-| 3   | [1.0, 0.4] | Dog    |
-| 4   | [0.0, 0.0] | BMW    |
-
-To illustrate simple indexing, let's use a simplified version of the NSW
-(Navigable Small World) algorithm. This algorithm establishes links between data
-points based on the distances between their embeddings:
-
-```
-1 -> 2, 3
-2 -> 1, 3
-3 -> 2, 4
-4 -> 3, 2
-```
-
-## ANNS
-
-ANNS is a technique for efficiently finding the nearest data points to a given
-query, albeit approximately. While it may not always return the exact nearest
-data points, ANNS provides results that are close enough. This probabilistic
-approach balances accuracy with efficiency.
-
-Let's take the index we have created in the previous section as an example.
-Imagine we have a query with these specific constraints:
-
-- Find the closest data to [0.0, 0.9].
-- Calculate a maximum of 2 distances using the Euclidean distance formula.
-
-Here's how we can utilize the index to find the closest data point based on
-these constraints:
-
-1. We start at a random data point, say 4, which is linked to 3 and 2.
-2. We calculate the distances and find that 2 is closer to [0.0, 0.9] than 3.
-3. We determine that the closest data to [0.0, 0.9] is Banana.
-
-This method isn't perfect; in this case, the actual closest data point to [0.0,
-0.9] is Apple. But under these constraints, linear search would rely heavily on
-chance to find the nearest data point. Indexing mitigates this issue by
-efficiently narrowing down the search based on data embeddings.
-
-In real-world applications with millions of data points, linear search becomes
-impractical. Indexing, however, enables swift retrieval by structuring data
-intelligently according to their embeddings.
-
-Note that for managing billions of data points, sophisticated disk-based
-indexing algorithms may be necessary to ensure efficient data handling.
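The greedy walk in steps 1 to 3 can be expressed in a few lines of code. Here is a small illustrative sketch in Rust using the toy index above; real NSW or HNSW implementations add entry-point heuristics, candidate queues, and backtracking:

```rust
// Euclidean distance between two 2-dimensional embeddings.
fn euclidean(a: [f32; 2], b: [f32; 2]) -> f32 {
    ((a[0] - b[0]).powi(2) + (a[1] - b[1]).powi(2)).sqrt()
}

fn main() {
    // Embeddings for: 1 Apple, 2 Banana, 3 Dog, 4 BMW (0-based indices).
    let embeddings = [[0.0, 0.8], [0.0, 0.7], [1.0, 0.4], [0.0, 0.0]];
    // Links from the toy index above, translated to 0-based indices.
    let links: [&[usize]; 4] = [&[1, 2], &[0, 2], &[1, 3], &[2, 1]];

    let query = [0.0, 0.9];

    // Start at point 4 (index 3) and spend the budget of 2 distance
    // calculations on its neighbors, keeping the closer one.
    let start = 3;
    let best = links[start]
        .iter()
        .copied()
        .min_by(|&a, &b| {
            euclidean(embeddings[a], query)
                .total_cmp(&euclidean(embeddings[b], query))
        })
        .unwrap();

    // Prints 2, i.e. Banana: close, even though the true nearest is Apple.
    println!("closest found: {}", best + 1);
}
```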
-
-## RAG
-
-RAG (Retrieval-Augmented Generation) is a framework that combines information
-retrieval and large language models (LLMs) to generate high-quality,
-contextually relevant responses to user queries. This approach enhances the
-capabilities of LLMs by incorporating relevant information retrieved from
-external sources into the model's input.
-
-In practice, RAG works by retrieving relevant information from a vector
-database, which allows efficient searching for the most relevant data based on
-the user query. This retrieved information is then inserted into the input
-context of the language model, providing it with additional knowledge to
-generate more accurate and informative responses.
-
-Below is an example of a prompt with and without RAG in a simple Q&A scenario:
-
-=== "Without RAG"
-
-    ```text
-    What is the name of my dog?
-    ```
-
-    > LLM: I don't know.
-
-=== "With RAG"
-
-    ```text
-    Based on the context below:
-    I have a dog named Pluto.
-
-    Answer the following question: What is the name of my dog?
-    ```
-
-    > LLM: The name of your dog is Pluto.
-
-By integrating retrieval with generation, RAG significantly improves the
-performance of LLMs in tasks that require specific, up-to-date, or external
-information, making it a powerful tool for various applications such as customer
-support, knowledge management, and content generation.
-
-## Token
-
-A token is a unit of text that AI models use to process and understand natural
-language. Tokens can be words, subwords, or characters, depending on the model's
-architecture. Tokenization is a crucial preprocessing step in natural language
-processing (NLP) and is essential for breaking down text into manageable pieces
-that the model can process.
-
-In this example, we'll use `WordPunctTokenizer` from the NLTK library to
-tokenize the sentence: "OasysDB is awesome."
-
-```py
-from nltk.tokenize import WordPunctTokenizer
-
-tokenizer = WordPunctTokenizer()
-tokens = tokenizer.tokenize("OasysDB is awesome.")
-print(tokens)
-```
-
-```py
-["OasysDB", "is", "awesome", "."]
-```
-
-Tokenization plays a big role in LLMs and embedding models. Understanding
-tokenization can help in various aspects, such as optimizing model performance
-and managing costs.
-
-Many AI service providers charge based on the number of tokens processed, so
-you'll often encounter this term when working with LLMs and embedding models,
-especially when determining the cost of using a specific model.
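To make the pricing point concrete, here is a toy cost estimate; the rate below is hypothetical and not tied to any real provider:

```rust
fn main() {
    let tokens_processed: u64 = 250_000;
    let price_per_1k_tokens = 0.0005; // hypothetical: $0.0005 per 1K tokens

    // Cost scales linearly with the number of tokens processed.
    let cost = (tokens_processed as f64 / 1000.0) * price_per_1k_tokens;
    println!("Estimated cost: ${cost:.4}"); // Estimated cost: $0.1250
}
```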
diff --git a/mkdocs.yml b/mkdocs.yml index 7a2ad4a6..a1e620c9 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -59,7 +59,6 @@ extra_css: nav: - Home: - Introduction: index.md - - Terms: concepts/terms.md - Other: - Changelog: changelog.md From 9b11876c983003c6b23d3b01702c88fa6f79f95a Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Sun, 7 Jul 2024 01:03:25 -0500 Subject: [PATCH 40/88] feat: add initial database and data source connection --- Cargo.lock | 1608 ++++++++++++++++++++++++++++++++++++ Cargo.toml | 15 + src/db/database.rs | 122 +++ src/db/mod.rs | 10 + src/lib.rs | 13 + src/prelude/mod.rs | 2 + src/tests/mod.rs | 17 + src/tests/test_database.rs | 6 + src/types/err.rs | 67 ++ src/types/file.rs | 69 ++ src/types/mod.rs | 5 +- 11 files changed, 1933 insertions(+), 1 deletion(-) create mode 100644 src/db/database.rs create mode 100644 src/prelude/mod.rs create mode 100644 src/tests/test_database.rs create mode 100644 src/types/err.rs create mode 100644 src/types/file.rs diff --git a/Cargo.lock b/Cargo.lock index 7cad23ed..b61f431c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,1614 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "addr2line" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e4503c46a5c0c7844e948c9a4d6acd9f50cccb4de1c48eb9e291ea17470c678" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "ahash" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" +dependencies = [ + "cfg-if", + "getrandom", + "once_cell", + "version_check", + "zerocopy", +] + +[[package]] +name = "allocator-api2" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c6cb57a04249c6480766f7f7cef5467412af1490f8d1e243141daddada3264f" + +[[package]] +name = "atoi" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" +dependencies = [ + "num-traits", +] + +[[package]] +name = "autocfg" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" + +[[package]] +name = "backtrace" +version = "0.3.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cc23269a4f8976d0a4d2e7109211a419fe30e8d88d677cd60b6bc79c5732e0a" +dependencies = [ + "addr2line", + "cc", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + +[[package]] +name = "base64" +version = "0.21.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" + +[[package]] +name = "base64ct" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" + +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + +[[package]] +name = "bitflags" +version = "1.3.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" +dependencies = [ + "serde", +] + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "bytes" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" + +[[package]] +name = "cc" +version = "1.0.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74b6a57f98764a267ff415d50a25e6e166f3831a5071af4995296ea97d210490" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "const-oid" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" + +[[package]] +name = "cpufeatures" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" +dependencies = [ + "libc", +] + +[[package]] +name = "crc" +version = "3.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69e6e4d7b33a94f0991c26729976b10ebde1d34c3ee82408fb536164fa10d636" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" + +[[package]] +name = "crossbeam-queue" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df0346b5d5e76ac2fe4e327c5fd1118d6be7c51dfb18f9b7922923f287471e35" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "der" +version = "0.7.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f55bf8e7b65898637379c1b74eb1551107c8294ed26d855ceb9fd1a09cfc9bc0" +dependencies = [ + "const-oid", + "pem-rfc7468", + "zeroize", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "const-oid", + "crypto-common", + "subtle", +] + +[[package]] +name = "dotenvy" +version = 
"0.15.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b" + +[[package]] +name = "either" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +dependencies = [ + "serde", +] + +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + +[[package]] +name = "errno" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "etcetera" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "136d1b5283a1ab77bd9257427ffd09d8667ced0570b6f938942bc7568ed5b943" +dependencies = [ + "cfg-if", + "home", + "windows-sys 0.48.0", +] + +[[package]] +name = "event-listener" +version = "2.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" + +[[package]] +name = "fastrand" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" + +[[package]] +name = "flume" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55ac459de2512911e4b674ce33cf20befaba382d05b62b008afc1c8b57cbf181" +dependencies = [ + "futures-core", + "futures-sink", + "spin", +] + +[[package]] +name = "form_urlencoded" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "futures-channel" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" + +[[package]] +name = "futures-executor" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-intrusive" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d930c203dd0b6ff06e0201a4a2fe9149b43c684fd4420555b26d21b1a02956f" +dependencies = [ + "futures-core", + "lock_api", + "parking_lot", +] + +[[package]] +name = "futures-io" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" + +[[package]] +name = "futures-sink" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" + +[[package]] +name = "futures-task" +version = "0.3.30" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" + +[[package]] +name = "futures-util" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" +dependencies = [ + "futures-core", + "futures-io", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "gimli" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd" + +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash", + "allocator-api2", +] + +[[package]] +name = "hashlink" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7" +dependencies = [ + "hashbrown", +] + +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +dependencies = [ + "unicode-segmentation", +] + +[[package]] +name = "hermit-abi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "hkdf" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b5f8eb2ad728638ea2c7d47a21db23b7b58a72ed6a38256b8a1849f15fbbdf7" +dependencies = [ + "hmac", +] + +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + +[[package]] +name = "home" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" +dependencies = [ + "windows-sys 0.52.0", +] + +[[package]] +name = "idna" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" +dependencies = [ + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "indexmap" +version = "2.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "itoa" 
+version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +dependencies = [ + "spin", +] + +[[package]] +name = "libc" +version = "0.2.155" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" + +[[package]] +name = "libm" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" + +[[package]] +name = "libsqlite3-sys" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf4e226dcd58b4be396f7bd3c20da8fdee2911400705297ba7d2d7cc2c30f716" +dependencies = [ + "cc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "linux-raw-sys" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" + +[[package]] +name = "lock_api" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" + +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "miniz_oxide" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08" +dependencies = [ + "adler", +] + +[[package]] +name = "mio" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" +dependencies = [ + "libc", + "wasi", + "windows-sys 0.48.0", +] + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "num-bigint-dig" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc84195820f291c7697304f3cbdadd1cb7199c0efc917ff5eafd71225c136151" +dependencies = [ + "byteorder", + "lazy_static", + "libm", + "num-integer", + "num-iter", + "num-traits", + "rand", + "smallvec", + "zeroize", +] + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "num_cpus" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +dependencies = [ + "hermit-abi", + "libc", +] + [[package]] name = "oasysdb" version = "0.7.0" +dependencies = [ + "bincode", + "serde", + "sqlx", + "tokio", + "url", + "uuid", +] + +[[package]] +name = "object" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "081b846d1d56ddfc18fdf1a922e4f6e07a11768ea1b92dec44e42b72712ccfce" +dependencies = [ + "memchr", +] + +[[package]] +name = "once_cell" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" + +[[package]] +name = "parking_lot" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall 0.5.2", + "smallvec", + "windows-targets 0.52.6", +] + +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] +name = "pem-rfc7468" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412" +dependencies = [ + "base64ct", +] + +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + +[[package]] +name = "pin-project-lite" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkcs1" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f" +dependencies = [ + "der", + "pkcs8", + "spki", +] + +[[package]] +name = "pkcs8" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" +dependencies = [ + "der", + "spki", +] + +[[package]] 
+name = "pkg-config" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" + +[[package]] +name = "ppv-lite86" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" + +[[package]] +name = "proc-macro2" +version = "1.0.86" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "redox_syscall" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +dependencies = [ + "bitflags 1.3.2", +] + +[[package]] +name = "redox_syscall" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c82cf8cff14456045f55ec4241383baeff27af886adb72ffb2162f99911de0fd" +dependencies = [ + "bitflags 2.6.0", +] + +[[package]] +name = "rsa" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d0e5124fcb30e76a7e79bfee683a2746db83784b86289f6251b54b7950a0dfc" +dependencies = [ + "const-oid", + "digest", + "num-bigint-dig", + "num-integer", + "num-traits", + "pkcs1", + "pkcs8", + "rand_core", + "signature", + "spki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" + +[[package]] +name = "rustix" +version = "0.38.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" +dependencies = [ + "bitflags 2.6.0", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.52.0", +] + +[[package]] +name = "ryu" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "serde" +version = "1.0.203" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.203" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.68", +] + +[[package]] +name = "serde_json" +version = "1.0.120" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e0d21c9a8cae1235ad58a00c11cb40d4b1e5c784f1ef2c537876ed6ffd8b7c5" +dependencies = [ + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha2" +version = "0.10.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "signature" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" +dependencies = [ + "digest", + "rand_core", +] + +[[package]] +name = "slab" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" +dependencies = [ + "autocfg", +] + +[[package]] +name = "smallvec" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" + +[[package]] +name = "socket2" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "spin" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +dependencies = [ + "lock_api", +] + +[[package]] +name = "spki" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" +dependencies = [ + "base64ct", + "der", +] + +[[package]] +name = "sqlformat" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f895e3734318cc55f1fe66258926c9b910c124d47520339efecbb6c59cec7c1f" +dependencies = [ + "nom", + "unicode_categories", +] + +[[package]] +name = "sqlx" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9a2ccff1a000a5a59cd33da541d9f2fdcd9e6e8229cc200565942bff36d0aaa" +dependencies = [ + "sqlx-core", + "sqlx-macros", + "sqlx-mysql", + "sqlx-postgres", + "sqlx-sqlite", +] + +[[package]] +name = "sqlx-core" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24ba59a9342a3d9bab6c56c118be528b27c9b60e490080e9711a04dccac83ef6" +dependencies = [ + "ahash", + "atoi", + "byteorder", + "bytes", + "crc", + "crossbeam-queue", + "either", + "event-listener", + "futures-channel", + "futures-core", + "futures-intrusive", + "futures-io", + "futures-util", + "hashlink", + "hex", + "indexmap", + "log", 
+ "memchr", + "once_cell", + "paste", + "percent-encoding", + "serde", + "serde_json", + "sha2", + "smallvec", + "sqlformat", + "thiserror", + "tokio", + "tokio-stream", + "tracing", + "url", +] + +[[package]] +name = "sqlx-macros" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ea40e2345eb2faa9e1e5e326db8c34711317d2b5e08d0d5741619048a803127" +dependencies = [ + "proc-macro2", + "quote", + "sqlx-core", + "sqlx-macros-core", + "syn 1.0.109", +] + +[[package]] +name = "sqlx-macros-core" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5833ef53aaa16d860e92123292f1f6a3d53c34ba8b1969f152ef1a7bb803f3c8" +dependencies = [ + "dotenvy", + "either", + "heck", + "hex", + "once_cell", + "proc-macro2", + "quote", + "serde", + "serde_json", + "sha2", + "sqlx-core", + "sqlx-mysql", + "sqlx-postgres", + "sqlx-sqlite", + "syn 1.0.109", + "tempfile", + "tokio", + "url", +] + +[[package]] +name = "sqlx-mysql" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ed31390216d20e538e447a7a9b959e06ed9fc51c37b514b46eb758016ecd418" +dependencies = [ + "atoi", + "base64", + "bitflags 2.6.0", + "byteorder", + "bytes", + "crc", + "digest", + "dotenvy", + "either", + "futures-channel", + "futures-core", + "futures-io", + "futures-util", + "generic-array", + "hex", + "hkdf", + "hmac", + "itoa", + "log", + "md-5", + "memchr", + "once_cell", + "percent-encoding", + "rand", + "rsa", + "serde", + "sha1", + "sha2", + "smallvec", + "sqlx-core", + "stringprep", + "thiserror", + "tracing", + "whoami", +] + +[[package]] +name = "sqlx-postgres" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c824eb80b894f926f89a0b9da0c7f435d27cdd35b8c655b114e58223918577e" +dependencies = [ + "atoi", + "base64", + "bitflags 2.6.0", + "byteorder", + "crc", + "dotenvy", + "etcetera", + "futures-channel", + "futures-core", + "futures-io", + "futures-util", + "hex", + "hkdf", + "hmac", + "home", + "itoa", + "log", + "md-5", + "memchr", + "once_cell", + "rand", + "serde", + "serde_json", + "sha2", + "smallvec", + "sqlx-core", + "stringprep", + "thiserror", + "tracing", + "whoami", +] + +[[package]] +name = "sqlx-sqlite" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b244ef0a8414da0bed4bb1910426e890b19e5e9bccc27ada6b797d05c55ae0aa" +dependencies = [ + "atoi", + "flume", + "futures-channel", + "futures-core", + "futures-executor", + "futures-intrusive", + "futures-util", + "libsqlite3-sys", + "log", + "percent-encoding", + "serde", + "sqlx-core", + "tracing", + "url", + "urlencoding", +] + +[[package]] +name = "stringprep" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b4df3d392d81bd458a8a621b8bffbd2302a12ffe288a9d931670948749463b1" +dependencies = [ + "unicode-bidi", + "unicode-normalization", + "unicode-properties", +] + +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.68" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "901fa70d88b9d6c98022e23b4136f9f3e54e4662c3bc1bd1d84a42a9a0f0c1e9" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tempfile" +version = "3.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" +dependencies = [ + "cfg-if", + "fastrand", + "rustix", + "windows-sys 0.52.0", +] + +[[package]] +name = "thiserror" +version = "1.0.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.68", +] + +[[package]] +name = "tinyvec" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce6b6a2fb3a985e99cebfaefa9faa3024743da73304ca1c683a36429613d3d22" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "tokio" +version = "1.38.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a" +dependencies = [ + "backtrace", + "bytes", + "libc", + "mio", + "num_cpus", + "pin-project-lite", + "socket2", + "tokio-macros", + "windows-sys 0.48.0", +] + +[[package]] +name = "tokio-macros" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.68", +] + +[[package]] +name = "tokio-stream" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "267ac89e0bec6e691e5813911606935d77c476ff49024f98abcea3e7b15e37af" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tracing" +version = "0.1.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +dependencies = [ + "log", + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.68", +] + +[[package]] +name = "tracing-core" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +dependencies = [ + "once_cell", +] + +[[package]] +name = "typenum" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" + +[[package]] +name = "unicode-bidi" +version = "0.3.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" + +[[package]] 
+name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "unicode-normalization" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-properties" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4259d9d4425d9f0661581b804cb85fe66a4c631cadd8f490d1c13a35d5d9291" + +[[package]] +name = "unicode-segmentation" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" + +[[package]] +name = "unicode_categories" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" + +[[package]] +name = "url" +version = "2.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] + +[[package]] +name = "urlencoding" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" + +[[package]] +name = "uuid" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5de17fd2f7da591098415cff336e12965a28061ddace43b59cb3c430179c9439" +dependencies = [ + "getrandom", + "rand", +] + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "wasite" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" + +[[package]] +name = "whoami" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a44ab49fad634e88f55bf8f9bb3abd2f27d7204172a112c7c9987e01c1c94ea9" +dependencies = [ + "redox_syscall 0.4.1", + "wasite", +] + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" 
+dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.68", +] + +[[package]] +name = "zeroize" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" diff --git a/Cargo.toml b/Cargo.toml index 08d123f3..bb16a79a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,21 @@ keywords = ["vector", "database", "anns", "search", "simd"] categories = ["database", "algorithms", "data-structures"] [dependencies] +uuid = { version = "1.9.1", features = ["v4", "fast-rng"] } +url = "2.5.2" + +# Serialization. +serde = { version = "1.0.203", features = ["derive"] } +bincode = "1.3.3" + +[dependencies.sqlx] +version = "0.7.4" +default-features = false +features = ["all-databases", "runtime-tokio", "json"] + +[dependencies.tokio] +version = "1.38.0" +features = ["rt-multi-thread", "macros"] [profile.release] lto = true diff --git a/src/db/database.rs b/src/db/database.rs new file mode 100644 index 00000000..1a48db45 --- /dev/null +++ b/src/db/database.rs @@ -0,0 +1,122 @@ +use super::*; +use sqlx::any::install_default_drivers; +use sqlx::{AnyConnection as SourceConnection, Connection}; +use tokio::runtime::Runtime; +use url::Url; + +type DatabaseURL = String; +type IndexName = String; +type IndexFile = PathBuf; + +/// The vector database interface. +pub struct Database { + root: PathBuf, + state: DatabaseState, + conn: SourceConnection, +} + +impl Database { + /// Opens existing or creates a new vector database. + /// - `root`: Root directory of the database. + /// - `source_url`: URL to SQL database. + /// + /// This will attempt to restore the database state from the file first. + /// If the file does not exist, it will create a new database. + /// When creating a new database, a data source is required. + /// + /// Source URL examples: + /// ```txt + /// sqlite://sqlite.db + /// mysql://user:password@localhost:3306/db + /// postgresql://user:password@localhost:5432/db + /// ``` + pub fn open( + root: impl Into, + source_url: Option>, + ) -> Result { + let root_dir: PathBuf = root.into(); + if !root_dir.try_exists()? { + fs::create_dir_all(&root_dir)?; + } + + let state_file = root_dir.join("odbstate"); + let state = if state_file.try_exists()? { + file::read_binary_file(state_file)? 
+ } else { + let source = source_url.ok_or_else(|| { + let code = ErrorCode::MissingSource; + let message = "Data source is required to create a database."; + Error::new(code, message) + })?; + + let source: String = source.into(); + DatabaseState::validate_source(&source)?; + + let state = DatabaseState { source, indices: HashMap::new() }; + file::write_binary_file(state_file, &state)?; + state + }; + + let conn: SourceConnection = state.connect()?; + Ok(Self { root: root_dir, state, conn }) + } + + /// Returns the state object of the database. + pub fn state(&self) -> DatabaseState { + self.state.clone() + } +} + +/// The state of the vector database. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DatabaseState { + source: DatabaseURL, + indices: HashMap<IndexName, IndexFile>, +} + +impl DatabaseState { + /// Connects to the source SQL database asynchronously. + pub async fn async_connect(&self) -> Result<SourceConnection, Error> { + install_default_drivers(); + Ok(SourceConnection::connect(&self.source).await?) + } + + /// Connects to the source SQL database. + pub fn connect(&self) -> Result<SourceConnection, Error> { + Runtime::new()?.block_on(self.async_connect()) + } + + /// Returns the type of the source database. + /// - sqlite + /// - mysql + /// - postgresql + pub fn source_type(&self) -> String { + // We can safely unwrap here because + // we have already validated the source URL. + let url = self.source.parse::<Url>().unwrap(); + url.scheme().to_owned() + } + + /// Validates the data source URL. + pub fn validate_source(url: impl Into<String>) -> Result<(), Error> { + let url: String = url.into(); + let url = url.parse::<Url>().map_err(|_| { + let code = ErrorCode::InvalidSource; + let message = "Invalid database source URL."; + Error::new(code, message) + })?; + + let valid_schemes = ["sqlite", "mysql", "postgresql"]; + if !valid_schemes.contains(&url.scheme()) { + let code = ErrorCode::InvalidSource; + let message = format!( + "Unsupported database scheme. Choose between: {}.", + valid_schemes.join(", ") + ); + + return Err(Error::new(code, message)); + } + + Ok(()) + } +} diff --git a/src/db/mod.rs b/src/db/mod.rs index 8b137891..0ee6f5b2 100644 --- a/src/db/mod.rs +++ b/src/db/mod.rs @@ -1 +1,11 @@ +use crate::types::err::{Error, ErrorCode}; +use crate::types::file; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::fs; +use std::path::PathBuf; + +mod database; + +// Re-export types for public API below. +pub use database::*; diff --git a/src/lib.rs b/src/lib.rs index 87c27719..b9eabb4d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,2 +1,15 @@ +#![warn(missing_docs)] +#![warn(unused_qualifications)] +#![doc = include_str!("../readme.md")] +#![doc(html_favicon_url = "https://i.postimg.cc/W3T230zk/favicon.png")] +#![doc(html_logo_url = "https://i.postimg.cc/Vv0HPVwB/logo.png")] + #[cfg(test)] mod tests; + +/// Primary module for vector database operations. +pub mod db; +/// Convenience re-exports of the public APIs. +pub mod prelude; +/// Database utility types and functions.
+pub mod types; diff --git a/src/prelude/mod.rs b/src/prelude/mod.rs new file mode 100644 index 00000000..beb6c198 --- /dev/null +++ b/src/prelude/mod.rs @@ -0,0 +1,2 @@ +pub use crate::db::*; +pub use crate::types::err::*; diff --git a/src/tests/mod.rs b/src/tests/mod.rs index 8b137891..5056dc21 100644 --- a/src/tests/mod.rs +++ b/src/tests/mod.rs @@ -1 +1,18 @@ +use crate::prelude::*; +use crate::types::file; +use std::path::PathBuf; + +mod test_database; + +fn create_test_database() -> Result<Database, Error> { + let path = PathBuf::from("odb_data"); + let source_url = { + let db_path = file::get_tmp_dir()?.join("sqlite.db"); + Some(format!("sqlite://{}?mode=rwc", db_path.display())) + }; + + let db = Database::open(path, source_url)?; + let state = db.state(); + assert_eq!(state.source_type(), "sqlite"); + Ok(db) +} diff --git a/src/tests/test_database.rs b/src/tests/test_database.rs new file mode 100644 index 00000000..42c9a5ea --- /dev/null +++ b/src/tests/test_database.rs @@ -0,0 +1,6 @@ +use super::*; + +#[test] +fn test_database_open() { + assert!(create_test_database().is_ok()); +} diff --git a/src/types/err.rs b/src/types/err.rs new file mode 100644 index 00000000..2db7581d --- /dev/null +++ b/src/types/err.rs @@ -0,0 +1,67 @@ +use std::fmt::{Display, Formatter, Result}; + +// External error types. +use bincode::Error as BincodeError; +use sqlx::Error as SQLError; +use std::error::Error as StandardError; +use std::io::Error as IOError; + +#[allow(missing_docs)] +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub enum ErrorCode { + // Native error types. + InvalidSource, + MissingSource, + + // External error types. + FileError, + SerializationError, + SQLError, +} + +/// The native error type for OasysDB operations. +#[derive(Debug)] +pub struct Error { + /// Represents cause or source of the error. + pub code: ErrorCode, + /// Details about the error and why it occurred. + pub message: String, +} + +impl Error { + /// Creates a new error instance. + pub fn new(code: ErrorCode, message: impl Into<String>) -> Self { + Self { code, message: message.into() } + } +} + +impl Display for Error { + fn fmt(&self, f: &mut Formatter) -> Result { + write!(f, "{:?}: {}", self.code, self.message) + } +} + +// Implement interoperability with other error types. + +impl StandardError for Error {} + +impl From<IOError> for Error { + fn from(err: IOError) -> Self { + let code = ErrorCode::FileError; + Error::new(code, err.to_string()) + } +} + +impl From<BincodeError> for Error { + fn from(err: BincodeError) -> Self { + let code = ErrorCode::SerializationError; + Error::new(code, err.to_string()) + } +} + +impl From<SQLError> for Error { + fn from(err: SQLError) -> Self { + let code = ErrorCode::SQLError; + Error::new(code, err.to_string()) + } +} diff --git a/src/types/file.rs b/src/types/file.rs new file mode 100644 index 00000000..82016e67 --- /dev/null +++ b/src/types/file.rs @@ -0,0 +1,69 @@ +use super::err::{Error, ErrorCode}; +use serde::de::DeserializeOwned; +use serde::Serialize; +use std::env; +use std::fs::{self, OpenOptions}; +use std::io::{BufReader, BufWriter}; +use std::path::{Path, PathBuf}; + +/// Reads a binary file and deserializes its contents to a type. +/// - `path`: Path to the binary file. +pub fn read_binary_file<T: DeserializeOwned>( + path: impl AsRef<Path>, +) -> Result<T, Error> { + let file = OpenOptions::new().read(true).open(path)?; + let reader = BufReader::new(file); + let value = bincode::deserialize_from(reader)?; + Ok(value) +} + +/// Serializes the data and writes it to a binary file. +/// - `path`: Path to the binary file. +/// - `data`: Data to write.
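+///
+/// The data is first written to a temporary file which is then renamed
+/// over the target path, so an interrupted write never corrupts an
+/// existing file. Note that this assumes the target path lives on the
+/// same filesystem as the temporary directory, since `fs::rename` does
+/// not work across mount points.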
+pub fn write_binary_file<T: Serialize>( + path: impl AsRef<Path>, + data: &T, +) -> Result<(), Error> { + let file_name = parse_file_name(&path)?; + let tmp_dir = get_tmp_dir()?; + + // To ensure that the target file is not corrupted if + // the operation is interrupted or fails: + // 1. Write the data to a temporary file. + // 2. Rename the temporary file to the target file. + + let tmp_file = tmp_dir.join(file_name); + let file = OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(&tmp_file)?; + + let writer = BufWriter::new(file); + bincode::serialize_into(writer, data)?; + + fs::rename(&tmp_file, &path)?; + Ok(()) +} + +/// Returns the temporary directory path for OasysDB. +pub fn get_tmp_dir() -> Result<PathBuf, Error> { + let tmp_dir = env::temp_dir().join("oasysdb"); + if !tmp_dir.try_exists()? { + fs::create_dir_all(&tmp_dir)?; + } + + Ok(tmp_dir) +} + +/// Parses the file name from a path. +/// - `path`: Path to a file. +pub fn parse_file_name(path: impl AsRef<Path>) -> Result<String, Error> { + let file_name = path.as_ref().file_name().ok_or_else(|| { + let code = ErrorCode::FileError; + let message = "Unable to parse the file name from the path."; + Error::new(code, message) + })?; + + Ok(file_name.to_string_lossy().to_string()) +} diff --git a/src/types/mod.rs b/src/types/mod.rs index 8b137891..35ac3c65 100644 --- a/src/types/mod.rs +++ b/src/types/mod.rs @@ -1 +1,4 @@ - +/// Custom error types of OasysDB. +pub mod err; +/// File operation utilities. +pub mod file; From cad65cd7c9e16e89881d70d56deb9cfed9e327be Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Tue, 9 Jul 2024 16:46:46 -0500 Subject: [PATCH 41/88] feat: add initial index building and types --- Cargo.lock | 19 ++++ Cargo.toml | 3 +- src/db/database.rs | 31 ++++++-- src/db/mod.rs | 1 + src/indices/ix_bruteforce.rs | 8 +++ src/indices/mod.rs | 108 +++++++++++++++++++++++++ src/lib.rs | 2 + src/prelude/mod.rs | 5 +- src/types/err.rs | 4 +- src/types/mod.rs | 4 +- src/types/record.rs | 42 ++++++++++ 11 files changed, 215 insertions(+), 12 deletions(-) create mode 100644 src/indices/ix_bruteforce.rs create mode 100644 src/types/record.rs diff --git a/Cargo.lock b/Cargo.lock index b61f431c..f5a6d329 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -180,6 +180,12 @@ version = "0.8.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + [[package]] name = "crypto-common" version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" dependencies = [ "generic-array", "typenum", ] @@ -386,6 +392,17 @@ version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd" +[[package]] +name = "half" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" +dependencies = [ + "cfg-if", + "crunchy", + "serde", +] + [[package]] name = "hashbrown" version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" dependencies = [ "ahash", "allocator-api2", ] @@ -647,6 +664,7 @@ name = "oasysdb" version = "0.7.0" dependencies = [ "bincode", + "half", "serde", "sqlx", "tokio", "url", "uuid", ] @@ -1413,6 +1431,7 @@ checksum = "5de17fd2f7da591098415cff336e12965a28061ddace43b59cb3c430179c9439" dependencies = [ "getrandom", "rand", + "serde", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index bb16a79a..f37b699c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,7 +16,8 @@
keywords = ["vector", "database", "anns", "search", "simd"] categories = ["database", "algorithms", "data-structures"] [dependencies] -uuid = { version = "1.9.1", features = ["v4", "fast-rng"] } +uuid = { version = "1.9.1", features = ["v4", "fast-rng", "serde"] } +half = { version = "2.4.1", features = ["serde"] } url = "2.5.2" # Serialization. diff --git a/src/db/database.rs b/src/db/database.rs index 1a48db45..ac37ed8e 100644 --- a/src/db/database.rs +++ b/src/db/database.rs @@ -35,8 +35,9 @@ impl Database { source_url: Option<impl Into<String>>, ) -> Result<Self, Error> { let root_dir: PathBuf = root.into(); - if !root_dir.try_exists()? { - fs::create_dir_all(&root_dir)?; + let indices_dir = root_dir.join("indices"); + if !indices_dir.try_exists()? { + fs::create_dir_all(&indices_dir)?; } let state_file = root_dir.join("odbstate"); @@ -61,17 +62,28 @@ impl Database { Ok(Self { root: root_dir, state, conn }) } + /// Creates a new index in the database. + /// - `name`: Name of the index. + /// - `config`: Index data source configuration. + pub fn create_index( + &mut self, + name: impl Into<String>, + config: SourceConfig, + ) -> Result<(), Error> { + unimplemented!() + } + /// Returns the state object of the database. - pub fn state(&self) -> DatabaseState { - self.state.clone() + pub fn state(&self) -> &DatabaseState { + &self.state } } /// The state of the vector database. -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Serialize, Deserialize)] pub struct DatabaseState { source: DatabaseURL, - indices: HashMap<IndexName, IndexFile>, + indices: HashMap<IndexName, IndexRef>, } impl DatabaseState { @@ -120,3 +132,10 @@ impl DatabaseState { Ok(()) } } + +/// Details about the index and where it is stored. +#[derive(Debug, Serialize, Deserialize)] +pub struct IndexRef { + algorithm: IndexAlgorithm, + file: IndexFile, +} diff --git a/src/db/mod.rs b/src/db/mod.rs index 0ee6f5b2..6c5c98b6 100644 --- a/src/db/mod.rs +++ b/src/db/mod.rs @@ -1,3 +1,4 @@ +use crate::indices::*; use crate::types::err::{Error, ErrorCode}; use crate::types::file; use serde::{Deserialize, Serialize}; diff --git a/src/indices/ix_bruteforce.rs b/src/indices/ix_bruteforce.rs new file mode 100644 index 00000000..56fc6157 --- /dev/null +++ b/src/indices/ix_bruteforce.rs @@ -0,0 +1,8 @@ +use super::*; + +#[derive(Debug, Serialize, Deserialize)] +pub struct IndexBruteForce { + config: SourceConfig, + data: HashMap<RecordID, Record>, + hidden: Vec<RecordID>, +} diff --git a/src/indices/mod.rs b/src/indices/mod.rs index 8b137891..4cc9a1b3 100644 --- a/src/indices/mod.rs +++ b/src/indices/mod.rs @@ -1 +1,109 @@ +use crate::types::err::Error; +use crate::types::record::*; +use serde::de::DeserializeOwned; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::fmt::Debug; + +mod ix_bruteforce; + +pub use ix_bruteforce::IndexBruteForce; + +type TableName = String; + +/// Data source configuration for a vector index. +#[derive(Debug, Serialize, Deserialize)] +pub struct SourceConfig { + /// Name of the SQL table to use as data source. + pub table: TableName, + /// Column name of the primary key in the data source. + pub primary_key: ColumnName, + /// Column name storing the vector data. + pub vector: ColumnName, + /// Optional list of column names storing additional metadata. + pub metadata: Option<Vec<ColumnName>>, +} + +impl SourceConfig { + /// Creates a source configuration with mostly default values. + /// - `primary_key`: Column name of the primary key in the data source. + /// - `vector`: Column name storing the vector data. + /// + /// Default configuration: + /// - No metadata columns.
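+    ///
+    /// For example, assuming a `products` table whose `id` column is the
+    /// primary key and whose `embedding` column stores the vectors:
+    ///
+    /// ```text
+    /// let config = SourceConfig::new("products", "id", "embedding")
+    ///     .with_metadata(vec!["name", "price"]);
+    /// ```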
+ pub fn new( + table: impl Into<TableName>, + primary_key: impl Into<ColumnName>, + vector: impl Into<ColumnName>, + ) -> Self { + SourceConfig { + table: table.into(), + primary_key: primary_key.into(), + vector: vector.into(), + metadata: None, + } + } + + /// Adds a list of metadata columns to the source configuration. + /// - `metadata`: List of metadata column names. + /// + /// OasysDB only supports primitive data types for metadata columns such as: + /// - String + /// - Integer + /// - Float + /// - Boolean + pub fn with_metadata(mut self, metadata: Vec<impl Into<ColumnName>>) -> Self { + self.metadata = Some(metadata.into_iter().map(|s| s.into()).collect()); + self + } +} + +/// Algorithm options used to index and search vectors. +#[allow(missing_docs)] +#[derive(Debug, PartialEq, Eq)] +#[derive(Serialize, Deserialize)] +pub enum IndexAlgorithm { + BruteForce, +} + +/// Trait for vector index implementations. +/// +/// For each index algorithm, a separate struct and implementation +/// of this trait is required. Also, here are some fields that +/// should be included in the index struct: +/// +/// ```text
/// struct Index{{ Algorithm }} { +/// config: SourceConfig, +/// data: HashMap<RecordID, Record>, +/// hidden: Vec<RecordID>, +/// // Other fields... +/// } +/// ``` +pub trait VectorIndex: Debug + Serialize + DeserializeOwned { + /// Returns the configuration of the index. + fn config(&self) -> &SourceConfig; + + /// Returns the record IDs hidden from the result. + fn hidden(&self) -> &[RecordID]; + + /// Initializes the index. + fn new(source: SourceConfig) -> Self; + + /// Trains the index based on the new records. + /// + /// If the index has been trained and is not empty, this method + /// will incrementally train the index based on the current fitting. + /// Otherwise, this method will train the index from scratch as normal. + fn fit(&mut self, records: HashMap<RecordID, Record>) -> Result<(), Error>; + + /// Resets the index and re-trains it on the non-hidden records. + /// + /// Incremental fitting is not as optimal as fitting from scratch for + /// some indexing algorithms. This method could be useful to re-balance + /// the index after a certain threshold of incremental fitting. + fn refit(&mut self) -> Result<(), Error>; + + /// Hides certain records from the search result permanently. + fn hide(&mut self, record_ids: Vec<RecordID>) -> Result<(), Error>; +} diff --git a/src/lib.rs b/src/lib.rs index b9eabb4d..f61f8a26 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,6 +9,8 @@ mod tests; /// Primary module for vector database operations. pub mod db; +/// Module for managing database indices and related types. +pub mod indices; /// Convenience re-exports of the public APIs. pub mod prelude; /// Database utility types and functions. diff --git a/src/prelude/mod.rs b/src/prelude/mod.rs index beb6c198..5351c872 100644 --- a/src/prelude/mod.rs +++ b/src/prelude/mod.rs @@ -1,2 +1,3 @@ -pub use crate::db::*; -pub use crate::types::err::*; +pub use crate::db::Database; +pub use crate::indices::SourceConfig; +pub use crate::types::err::{Error, ErrorCode}; diff --git a/src/types/err.rs b/src/types/err.rs index 2db7581d..322e35bb 100644 --- a/src/types/err.rs +++ b/src/types/err.rs @@ -7,9 +7,9 @@ use std::error::Error as StandardError; use std::io::Error as IOError; #[allow(missing_docs)] -#[derive(Debug, Copy, Clone, Eq, PartialEq)] +#[derive(Debug, Eq, PartialEq)] pub enum ErrorCode { - // Native error types. + // Data source related.
InvalidSource, MissingSource, diff --git a/src/types/mod.rs b/src/types/mod.rs index 35ac3c65..47b3284f 100644 --- a/src/types/mod.rs +++ b/src/types/mod.rs @@ -1,4 +1,6 @@ /// Custom error types of OasysDB. pub mod err; /// File operation utilities. -pub mod file; +pub(crate) mod file; +/// Vector record types for indices. +pub(crate) mod record; diff --git a/src/types/record.rs b/src/types/record.rs new file mode 100644 index 00000000..3b406913 --- /dev/null +++ b/src/types/record.rs @@ -0,0 +1,42 @@ +use half::f16; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use uuid::Uuid; + +/// Column name of the SQL data source table. +pub type ColumnName = String; + +/// ID type for records in the index from the data source. +#[allow(clippy::upper_case_acronyms)] +#[derive(Debug, Serialize, Deserialize)] +#[derive(PartialEq, Eq, Hash)] +pub enum RecordID { + /// Auto-incrementing integer ID (Most efficient). + Integer(usize), + /// String as ID (Not efficient). + String(String), + /// Universally Unique ID (Less efficient). + UUID(Uuid), +} + +/// Record type stored in the index based on the +/// configuration and data source. +#[derive(Debug, Serialize, Deserialize)] +pub struct Record { + vector: Vector, + data: Option>, +} + +#[derive(Debug, Serialize, Deserialize)] +/// Vector data type stored in the index. +pub struct Vector(Vec); + +/// Data types supported as metadata in the index. +#[allow(missing_docs)] +#[derive(Debug, Serialize, Deserialize)] +pub enum RecordData { + Boolean(bool), + Float(f32), + Integer(usize), + String(String), +} From e7c8544f5c41ae2c24af606887dd6b0e17959a4a Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Thu, 11 Jul 2024 20:09:15 -0500 Subject: [PATCH 42/88] feat: add initial brute force index impl --- Cargo.lock | 220 ++++++++++---------------- Cargo.toml | 12 +- src/db/database.rs | 89 ++++++++++- src/db/mod.rs | 5 +- src/indices/ix_bruteforce.rs | 129 ++++++++++++++- src/indices/mod.rs | 287 +++++++++++++++++++++++++++++++--- src/indices/type_algorithm.rs | 53 +++++++ src/indices/type_distance.rs | 49 ++++++ src/indices/type_filter.rs | 237 ++++++++++++++++++++++++++++ src/lib.rs | 3 - src/prelude/mod.rs | 5 +- src/tests/mod.rs | 18 --- src/tests/test_database.rs | 6 - src/types/conn.rs | 2 + src/types/err.rs | 16 ++ src/types/mod.rs | 7 +- src/types/record.rs | 188 ++++++++++++++++++++-- 17 files changed, 1102 insertions(+), 224 deletions(-) create mode 100644 src/indices/type_algorithm.rs create mode 100644 src/indices/type_distance.rs create mode 100644 src/indices/type_filter.rs delete mode 100644 src/tests/mod.rs delete mode 100644 src/tests/test_database.rs create mode 100644 src/types/conn.rs diff --git a/Cargo.lock b/Cargo.lock index f5a6d329..c43c6fb6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,21 +2,6 @@ # It is not intended for manual editing. 
version = 3 -[[package]] -name = "addr2line" -version = "0.22.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e4503c46a5c0c7844e948c9a4d6acd9f50cccb4de1c48eb9e291ea17470c678" -dependencies = [ - "gimli", -] - -[[package]] -name = "adler" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" - [[package]] name = "ahash" version = "0.8.11" @@ -51,21 +36,6 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" -[[package]] -name = "backtrace" -version = "0.3.73" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cc23269a4f8976d0a4d2e7109211a419fe30e8d88d677cd60b6bc79c5732e0a" -dependencies = [ - "addr2line", - "cc", - "cfg-if", - "libc", - "miniz_oxide", - "object", - "rustc-demangle", -] - [[package]] name = "base64" version = "0.21.7" @@ -165,6 +135,25 @@ version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" +[[package]] +name = "crossbeam-deque" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-queue" version = "0.3.11" @@ -293,6 +282,21 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "futures" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + [[package]] name = "futures-channel" version = "0.3.30" @@ -337,6 +341,17 @@ version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" +[[package]] +name = "futures-macro" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.68", +] + [[package]] name = "futures-sink" version = "0.3.30" @@ -355,8 +370,10 @@ version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" dependencies = [ + "futures-channel", "futures-core", "futures-io", + "futures-macro", "futures-sink", "futures-task", "memchr", @@ -386,12 +403,6 @@ dependencies = [ "wasi", ] -[[package]] -name = "gimli" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd" - [[package]] name = "half" version = "2.4.1" @@ -431,12 +442,6 @@ dependencies = [ "unicode-segmentation", ] -[[package]] -name = "hermit-abi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" - [[package]] name = "hex" version = "0.4.3" @@ -572,26 +577,6 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" -[[package]] -name = "miniz_oxide" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08" -dependencies = [ - "adler", -] - -[[package]] -name = "mio" -version = "0.8.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" -dependencies = [ - "libc", - "wasi", - "windows-sys 0.48.0", -] - [[package]] name = "nom" version = "7.1.3" @@ -649,38 +634,22 @@ dependencies = [ "libm", ] -[[package]] -name = "num_cpus" -version = "1.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" -dependencies = [ - "hermit-abi", - "libc", -] - [[package]] name = "oasysdb" version = "0.7.0" dependencies = [ "bincode", + "futures", "half", + "rayon", "serde", + "serde_json", + "simsimd", "sqlx", - "tokio", "url", "uuid", ] -[[package]] -name = "object" -version = "0.36.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "081b846d1d56ddfc18fdf1a922e4f6e07a11768ea1b92dec44e42b72712ccfce" -dependencies = [ - "memchr", -] - [[package]] name = "once_cell" version = "1.19.0" @@ -824,6 +793,26 @@ dependencies = [ "getrandom", ] +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + [[package]] name = "redox_syscall" version = "0.4.1" @@ -862,12 +851,6 @@ dependencies = [ "zeroize", ] -[[package]] -name = "rustc-demangle" -version = "0.1.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" - [[package]] name = "rustix" version = "0.38.34" @@ -956,6 +939,15 @@ dependencies = [ "rand_core", ] +[[package]] +name = "simsimd" +version = "4.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efc843bc8f12d9c8e6b734a0fe8918fc497b42f6ae0f347dbfdad5b5138ab9b4" +dependencies = [ + "cc", +] + [[package]] name = "slab" version = "0.4.9" @@ -971,16 +963,6 @@ version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" -[[package]] -name = "socket2" -version = "0.5.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" -dependencies = [ - "libc", - "windows-sys 0.52.0", -] - [[package]] name = "spin" version = "0.9.8" @@ -1056,8 +1038,6 @@ dependencies = [ "smallvec", "sqlformat", "thiserror", - "tokio", - "tokio-stream", "tracing", "url", ] @@ -1097,7 +1077,6 @@ dependencies = [ "sqlx-sqlite", "syn 1.0.109", "tempfile", - "tokio", "url", ] @@ -1290,45 +1269,6 
@@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" -[[package]] -name = "tokio" -version = "1.38.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a" -dependencies = [ - "backtrace", - "bytes", - "libc", - "mio", - "num_cpus", - "pin-project-lite", - "socket2", - "tokio-macros", - "windows-sys 0.48.0", -] - -[[package]] -name = "tokio-macros" -version = "2.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.68", -] - -[[package]] -name = "tokio-stream" -version = "0.1.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "267ac89e0bec6e691e5813911606935d77c476ff49024f98abcea3e7b15e37af" -dependencies = [ - "futures-core", - "pin-project-lite", - "tokio", -] - [[package]] name = "tracing" version = "0.1.40" diff --git a/Cargo.toml b/Cargo.toml index f37b699c..b53873f9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,19 +19,21 @@ categories = ["database", "algorithms", "data-structures"] uuid = { version = "1.9.1", features = ["v4", "fast-rng", "serde"] } half = { version = "2.4.1", features = ["serde"] } url = "2.5.2" +futures = "0.3.30" + +# Parallelism. +rayon = "1.10.0" +simsimd = "4.4.0" # Serialization. serde = { version = "1.0.203", features = ["derive"] } bincode = "1.3.3" +serde_json = "1.0.120" [dependencies.sqlx] version = "0.7.4" default-features = false -features = ["all-databases", "runtime-tokio", "json"] - -[dependencies.tokio] -version = "1.38.0" -features = ["rt-multi-thread", "macros"] +features = ["all-databases"] [profile.release] lto = true diff --git a/src/db/database.rs b/src/db/database.rs index ac37ed8e..6d7681df 100644 --- a/src/db/database.rs +++ b/src/db/database.rs @@ -1,8 +1,10 @@ use super::*; +use futures::executor; +use futures::stream::StreamExt; use sqlx::any::install_default_drivers; -use sqlx::{AnyConnection as SourceConnection, Connection}; -use tokio::runtime::Runtime; +use sqlx::Acquire; use url::Url; +use uuid::Uuid; type DatabaseURL = String; type IndexName = String; @@ -62,21 +64,69 @@ impl Database { Ok(Self { root: root_dir, state, conn }) } - /// Creates a new index in the database. + /// Creates a new index in the database asynchronously. /// - `name`: Name of the index. /// - `config`: Index data source configuration. - pub fn create_index( + pub async fn async_create_index( &mut self, name: impl Into, + algorithm: IndexAlgorithm, + metric: DistanceMetric, config: SourceConfig, ) -> Result<(), Error> { - unimplemented!() + let state_file = self.state_file(); + + // Create a new file where the index will be stored. + let index_file = { + let uuid = Uuid::new_v4().to_string(); + self.indices_dir().join(uuid) + }; + + let query = config.to_query(); + let conn = self.conn.acquire().await?; + let mut stream = sqlx::query(&query).fetch(conn); + + let mut records = HashMap::new(); + while let Some(row) = stream.next().await { + let row = row?; + let (id, record) = config.to_record(&row)?; + records.insert(id, record); + } + + let mut index = algorithm.initialize(config, metric); + index.fit(records)?; + + // Persist the index to the file. + algorithm.persist_index(&index_file, index)?; + + // Update db state with the new index. 
+ let index_ref = IndexRef { algorithm, file: index_file.clone() }; + self.state.indices.insert(name.into(), index_ref); + file::write_binary_file(&state_file, &self.state)?; + + Ok(()) } /// Returns the state object of the database. pub fn state(&self) -> &DatabaseState { &self.state } + + /// Persists the state of the database to the state file. + pub fn persist_state(&self) -> Result<(), Error> { + file::write_binary_file(self.state_file(), &self.state) + } +} + +// Write internal database methods here. +impl Database { + fn state_file(&self) -> PathBuf { + self.root.join("odbstate") + } + + fn indices_dir(&self) -> PathBuf { + self.root.join("indices") + } } /// The state of the vector database. @@ -95,18 +145,18 @@ impl DatabaseState { /// Connects to the source SQL database. pub fn connect(&self) -> Result { - Runtime::new()?.block_on(self.async_connect()) + executor::block_on(self.async_connect()) } /// Returns the type of the source database. /// - sqlite /// - mysql /// - postgresql - pub fn source_type(&self) -> String { + pub fn source_type(&self) -> SourceType { // We can safely unwrap here because // we have already validated the source URL. let url = self.source.parse::().unwrap(); - url.scheme().to_owned() + url.scheme().into() } /// Validates the data source URL. @@ -139,3 +189,26 @@ pub struct IndexRef { algorithm: IndexAlgorithm, file: IndexFile, } + +#[cfg(test)] +mod tests { + use super::*; + + fn create_test_database() -> Result { + let path = PathBuf::from("odb_data"); + let source_url = { + let db_path = file::get_tmp_dir()?.join("sqlite.db"); + Some(format!("sqlite://{}?mode=rwc", db_path.display())) + }; + + let db = Database::open(path, source_url)?; + let state = db.state(); + assert_eq!(state.source_type(), SourceType::SQLITE); + Ok(db) + } + + #[test] + fn test_database_open() { + assert!(create_test_database().is_ok()); + } +} diff --git a/src/db/mod.rs b/src/db/mod.rs index 6c5c98b6..059b34d5 100644 --- a/src/db/mod.rs +++ b/src/db/mod.rs @@ -1,5 +1,6 @@ use crate::indices::*; -use crate::types::err::{Error, ErrorCode}; +use crate::types::conn::*; +use crate::types::err::*; use crate::types::file; use serde::{Deserialize, Serialize}; use std::collections::HashMap; @@ -9,4 +10,4 @@ use std::path::PathBuf; mod database; // Re-export types for public API below. -pub use database::*; +pub use database::Database; diff --git a/src/indices/ix_bruteforce.rs b/src/indices/ix_bruteforce.rs index 56fc6157..5553f205 100644 --- a/src/indices/ix_bruteforce.rs +++ b/src/indices/ix_bruteforce.rs @@ -1,8 +1,135 @@ use super::*; +use std::collections::BinaryHeap; +/// Brute force index implementation. +/// +/// This index stores all records in memory and performs a linear search +/// for the nearest neighbors. It is great for small datasets of less than +/// 10,000 records due to perfect recall and precision. 
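+///
+/// A rough usage sketch (assuming `records` was already fetched from
+/// the data source; the column names are placeholders):
+///
+/// ```text
+/// let config = SourceConfig::new("embeddings", "id", "vector");
+/// let mut index = IndexBruteForce::new(config, DistanceMetric::Euclidean);
+/// index.fit(records)?; // records: HashMap<RecordID, Record>
+/// let nearest = index.search(Vector::from(vec![0.0; 128]), 10)?;
+/// ```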
#[derive(Debug, Serialize, Deserialize)] pub struct IndexBruteForce { config: SourceConfig, + metric: DistanceMetric, + metadata: IndexMetadata, data: HashMap, - hidden: Vec, +} + +impl IndexOps for IndexBruteForce { + fn new(config: SourceConfig, metric: DistanceMetric) -> Self { + Self { + config, + metric, + metadata: IndexMetadata::default(), + data: HashMap::new(), + } + } + + fn config(&self) -> &SourceConfig { + &self.config + } + + fn metric(&self) -> &DistanceMetric { + &self.metric + } + + fn metadata(&self) -> &IndexMetadata { + &self.metadata + } +} + +impl VectorIndex for IndexBruteForce { + fn fit(&mut self, records: HashMap) -> Result<(), Error> { + if records.is_empty() { + return Ok(()); + } + + self.metadata.last_inserted = records.keys().max().copied(); + self.metadata.count += records.len(); + self.data.par_extend(records); + + Ok(()) + } + + /// Refitting doesn't do anything for the brute force index + /// as incremental insertion or deletion will directly update + /// the data store accordingly guaranteeing the index optimal state. + fn refit(&mut self) -> Result<(), Error> { + Ok(()) + } + + /// Removes records from the index data store. + /// - `record_ids`: List of record IDs to remove from the index. + fn hide(&mut self, record_ids: Vec) -> Result<(), Error> { + if self.data.len() < record_ids.len() { + return Ok(()); + } + + self.data.retain(|id, _| !record_ids.contains(id)); + self.metadata.count = self.data.len(); + Ok(()) + } + + fn search( + &self, + query: Vector, + k: usize, + ) -> Result, Error> { + let mut results = BinaryHeap::new(); + for (id, record) in &self.data { + let distance = self.metric.distance(&record.vector, &query); + let data = record.data.clone(); + results.push(SearchResult { id: *id, distance, data }); + + if results.len() > k { + results.pop(); + } + } + + Ok(results.into_sorted_vec()) + } + + fn search_with_filters( + &self, + query: Vector, + k: usize, + filters: Filters, + ) -> Result, Error> { + if filters == Filters::NONE { + return self.search(query, k); + } + + let mut results = BinaryHeap::new(); + for (id, record) in &self.data { + if filters.apply(&record.data) { + let distance = self.metric.distance(&record.vector, &query); + let data = record.data.clone(); + results.push(SearchResult { id: *id, distance, data }); + + if results.len() > k { + results.pop(); + } + } + } + + Ok(results.into_sorted_vec()) + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_bruteforce_index() { + let config = SourceConfig::default(); + let metric = DistanceMetric::Euclidean; + let mut index = IndexBruteForce::new(config, metric); + index_tests::populate_index(&mut index); + index_tests::test_search(&index); + index_tests::test_search_with_filters(&index); + } } diff --git a/src/indices/mod.rs b/src/indices/mod.rs index 4cc9a1b3..9e6c50f7 100644 --- a/src/indices/mod.rs +++ b/src/indices/mod.rs @@ -1,18 +1,49 @@ -use crate::types::err::Error; +use crate::types::err::*; +use crate::types::file; use crate::types::record::*; +use rayon::prelude::*; use serde::de::DeserializeOwned; use serde::{Deserialize, Serialize}; +use sqlx::any::AnyRow; +use std::any::Any; use std::collections::HashMap; use std::fmt::Debug; +use std::path::Path; mod ix_bruteforce; +mod type_algorithm; +mod type_distance; +mod type_filter; pub use ix_bruteforce::IndexBruteForce; +pub use type_algorithm::IndexAlgorithm; +pub use type_distance::DistanceMetric; +pub use type_filter::*; type TableName = String; +/// 
Type of SQL database used as a data source. +#[allow(missing_docs)] +#[derive(Debug, PartialEq, Eq)] +pub enum SourceType { + SQLITE, + POSTGRES, + MYSQL, +} + +impl From<&str> for SourceType { + fn from(value: &str) -> Self { + match value { + "sqlite" => SourceType::SQLITE, + "postgres" | "postgresql" => SourceType::POSTGRES, + "mysql" => SourceType::MYSQL, + _ => panic!("Unsupported database scheme: {value}."), + } + } +} + /// Data source configuration for a vector index. -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Serialize, Deserialize, Default)] pub struct SourceConfig { /// Name of the SQL table to use as data source. pub table: TableName, @@ -22,6 +53,8 @@ pub struct SourceConfig { pub vector: ColumnName, /// Optional list of column names storing additional metadata. pub metadata: Option>, + /// Filter to apply to the SQL query using WHERE clause. + pub filter: Option, } impl SourceConfig { @@ -31,6 +64,7 @@ impl SourceConfig { /// /// Default configuration: /// - No metadata columns. + /// - No query filter. pub fn new( table: impl Into, primary_key: impl Into, @@ -41,6 +75,7 @@ impl SourceConfig { primary_key: primary_key.into(), vector: vector.into(), metadata: None, + filter: None, } } @@ -56,40 +91,158 @@ impl SourceConfig { self.metadata = Some(metadata.into_iter().map(|s| s.into()).collect()); self } + + /// Adds a filter to the source configuration. + /// - `filter`: Filter string without the WHERE keyword. + /// + /// Example: + /// ```text + /// year > 2000 AND genre = 'action' + /// ``` + pub fn with_filter(mut self, filter: impl Into) -> Self { + let filter: String = filter.into(); + self.filter = Some(filter.trim().to_string()); + self + } + + /// Returns the list of columns in the source configuration. + pub fn columns(&self) -> Vec { + let mut columns = vec![&self.primary_key, &self.vector]; + if let Some(metadata) = &self.metadata { + columns.extend(metadata.iter()); + } + + columns.into_iter().map(|s| s.to_string()).collect() + } + + /// Generates a SQL query string based on the configuration. + /// + /// Example: + /// ```sql + /// SELECT id, vector, metadata + /// FROM vectors + /// WHERE metadata > 2000 + /// ``` + pub(crate) fn to_query(&self) -> String { + let table = &self.table; + let columns = self.columns().join(", "); + let filter = match &self.filter { + Some(filter) => format!("WHERE {}", filter), + None => String::new(), + }; + + let query = format!("SELECT {columns} FROM {table} {filter}"); + query.trim().to_string() + } + + /// Creates a tuple of record ID and record data from a row. + pub(crate) fn to_record( + &self, + row: &AnyRow, + ) -> Result<(RecordID, Record), Error> { + let id = RecordID::from_row(&self.primary_key, row)?; + let vector = Vector::from_row(&self.vector, row)?; + + let mut metadata = HashMap::new(); + if let Some(metadata_columns) = &self.metadata { + for column in metadata_columns { + let value = RowOps::from_row(column, row)?; + metadata.insert(column.to_owned(), value); + } + } + + let record = Record { vector, data: metadata }; + Ok((id, record)) + } } -/// Algorithm options used to index and search vectors. -#[allow(missing_docs)] -#[derive(Debug, PartialEq, Eq)] -#[derive(Serialize, Deserialize)] -pub enum IndexAlgorithm { - BruteForce, +/// Metadata about the index for operations and optimizations. +#[derive(Debug, Serialize, Deserialize, Default)] +pub struct IndexMetadata { + /// Hidden records that will not be included in search results. 
+ pub hidden: Vec, + /// Last inserted data reference used for incremental insertion. + pub last_inserted: Option, + /// Number of records in the index. + pub count: usize, +} + +/// Nearest neighbor search result. +#[derive(Debug)] +pub struct SearchResult { + /// ID of the record in the data source. + pub id: RecordID, + /// Record metadata. + pub data: HashMap>, + /// Distance between the query and the record. + pub distance: f32, +} + +impl PartialEq for SearchResult { + fn eq(&self, other: &Self) -> bool { + self.id == other.id + } +} + +impl Eq for SearchResult {} + +impl PartialOrd for SearchResult { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for SearchResult { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.distance.partial_cmp(&other.distance).unwrap() + } +} + +/// Trait for initializing a new index implementation. +/// +/// This will be used by the IndexAlgorithm enum to initialize a new index +/// based on the algorithm and configuration. In addition to this trait, +/// the index struct should implement the VectorIndex trait. +pub trait IndexOps: Debug + Serialize + DeserializeOwned { + /// Initializes an empty index with the given configuration. + fn new(config: SourceConfig, metric: DistanceMetric) -> Self; + + /// Reads and deserializes the index from a file. + fn load(path: impl AsRef) -> Result { + file::read_binary_file(path) + } + + /// Serializes and persists the index to a file. + fn persist(&self, path: impl AsRef) -> Result<(), Error> { + file::write_binary_file(path, self) + } + + /// Returns the configuration of the index. + fn config(&self) -> &SourceConfig; + + /// Returns the distance metric used by the index. + fn metric(&self) -> &DistanceMetric; + + /// Returns metadata about the index. + fn metadata(&self) -> &IndexMetadata; } /// Trait for vector index implementations. /// /// For each index algorithm, a separate struct and implementation -/// of this trait is required. Also, these are some fields that -/// should be included in the Index struct: +/// of this trait is required. Roughly, the index struct should look like: /// /// ```text +/// #[derive(Debug, Serialize, Deserialize)] /// struct Index{{ Algorithm }} { /// config: SourceConfig, +/// metric: DistanceMetric, +/// metadata: IndexMetadata, /// data: HashMap, -/// hidden: Vec, /// // Other fields... /// } /// ``` -pub trait VectorIndex: Debug + Serialize + DeserializeOwned { - /// Returns the configuration of the index. - fn config(&self) -> &SourceConfig; - - /// Returns the record IDs hidden from the result. - fn hidden(&self) -> &[RecordID]; - - /// Initializes the index. - fn new(source: SourceConfig) -> Self; - +pub trait VectorIndex: Debug { /// Trains the index based on the new records. /// /// If the index has been trained and not empty, this method @@ -104,6 +257,98 @@ pub trait VectorIndex: Debug + Serialize + DeserializeOwned { /// the index after a certain threshold of incremental fitting. fn refit(&mut self) -> Result<(), Error>; + /// Searches for the nearest neighbors based on the query vector. + /// - `query`: Query vector. + /// - `k`: Number of nearest neighbors to return. + fn search( + &self, + query: Vector, + k: usize, + ) -> Result, Error>; + + /// Searches the nearest neighbors based on the query vector and filters. + /// - `query`: Query vector. + /// - `k`: Number of nearest neighbors to return. + /// - `filters`: Filters to apply to the search results. 
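+    ///
+    /// Illustrative call, assuming a fitted index and the string
+    /// syntax accepted by `Filters::from`:
+    ///
+    /// ```text
+    /// index.search_with_filters(query, 10, Filters::from("year > 2000"))?;
+    /// ```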
+ fn search_with_filters( + &self, + query: Vector, + k: usize, + filters: Filters, + ) -> Result, Error>; + /// Hides certain records from the search result permanently. fn hide(&mut self, record_ids: Vec) -> Result<(), Error>; + + /// Returns the index as Any type for dynamic casting. This method + /// allows the index to be downcast to a specific index struct to + /// be serialized and stored in a file. + fn as_any(&self) -> &dyn Any; +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_source_config_new() { + let config = SourceConfig::new("table", "id", "embedding"); + let query = config.to_query(); + assert_eq!(query, "SELECT id, embedding FROM table"); + } + + #[test] + fn test_source_config_new_complete() { + let config = SourceConfig::new("table", "id", "embedding") + .with_metadata(vec!["metadata"]) + .with_filter("id > 100"); + + let query = config.to_query(); + let expected = + "SELECT id, embedding, metadata FROM table WHERE id > 100"; + assert_eq!(query, expected); + } +} + +#[cfg(test)] +mod index_tests { + use super::*; + + pub fn populate_index(index: &mut impl VectorIndex) { + let mut records = HashMap::new(); + for i in 0..100 { + let id = RecordID(i as u32); + let vector = Vector::from(vec![i as f32; 128]); + let data = HashMap::from([( + "number".to_string(), + Some(RecordData::Integer(1000 + i)), + )]); + + let record = Record { vector, data }; + records.insert(id, record); + } + + index.fit(records).unwrap(); + } + + pub fn test_search(index: &impl VectorIndex) { + let query = Vector::from(vec![0.0; 128]); + let k = 10; + let results = index.search(query, k).unwrap(); + + assert_eq!(results.len(), k); + assert_eq!(results[0].id, RecordID(0)); + assert_eq!(results[0].distance, 0.0); + assert_eq!(results[9].id, RecordID(9)); + } + + pub fn test_search_with_filters(index: &impl VectorIndex) { + let query = Vector::from(vec![0.0; 128]); + let k = 10; + let filters = Filters::from("number > 1010"); + let results = index.search_with_filters(query, k, filters).unwrap(); + + assert_eq!(results.len(), k); + assert_eq!(results[0].id, RecordID(11)); + } } diff --git a/src/indices/type_algorithm.rs b/src/indices/type_algorithm.rs new file mode 100644 index 00000000..6b771e8e --- /dev/null +++ b/src/indices/type_algorithm.rs @@ -0,0 +1,53 @@ +use super::*; + +/// Algorithm options used to index and search vectors. +#[allow(missing_docs)] +#[derive(Debug, PartialEq, Eq)] +#[derive(Serialize, Deserialize)] +pub enum IndexAlgorithm { + BruteForce, +} + +impl IndexAlgorithm { + /// Initializes a new index based on the algorithm and configuration. + pub(crate) fn initialize( + &self, + config: SourceConfig, + metric: DistanceMetric, + ) -> Box { + let index = match self { + IndexAlgorithm::BruteForce => IndexBruteForce::new(config, metric), + }; + + Box::new(index) + } + + /// Persists the index to a file based on the algorithm. + /// - `path`: Path to the file where the index will be stored. + /// - `index`: Index to persist as a trait object. 
+ pub(crate) fn persist_index( + &self, + path: impl AsRef, + index: Box, + ) -> Result<(), Error> { + match self { + IndexAlgorithm::BruteForce => { + Self::_persist_index::(path, index) + } + } + } + + fn _persist_index( + path: impl AsRef, + index: Box, + ) -> Result<(), Error> { + let index = index.as_any().downcast_ref::().ok_or_else(|| { + let code = ErrorCode::InternalError; + let message = "Failed to downcast index to concrete type."; + Error::new(code, message) + })?; + + index.persist(path)?; + Ok(()) + } +} diff --git a/src/indices/type_distance.rs b/src/indices/type_distance.rs new file mode 100644 index 00000000..56df0787 --- /dev/null +++ b/src/indices/type_distance.rs @@ -0,0 +1,49 @@ +use super::*; +use simsimd::SpatialSimilarity; + +/// Distance metric used to compare vectors in the index. +#[derive(Debug, Serialize, Deserialize)] +pub enum DistanceMetric { + /// Squared [Euclidean distance](https://www.geeksforgeeks.org/euclidean-distance) + /// + /// The squared Euclidean distance is used to avoid the square + /// root operation thus making the computation faster. + Euclidean, + /// Cosine distance (1 - cosine similarity): + /// [Cosine similarity](https://www.geeksforgeeks.org/cosine-similarity/) + Cosine, +} + +impl DistanceMetric { + /// Computes the distance between two vectors. + pub fn distance(&self, a: &Vector, b: &Vector) -> f32 { + let a = &a.to_vec(); + let b = &b.to_vec(); + + let dist = match self { + DistanceMetric::Euclidean => f32::sqeuclidean(a, b), + DistanceMetric::Cosine => f32::cosine(a, b), + }; + + dist.unwrap() as f32 + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_distance_metric() { + let a = Vector::from(vec![1.0, 3.0, 5.0]); + let b = Vector::from(vec![2.0, 4.0, 6.0]); + + let metric = DistanceMetric::Euclidean; + let dist = metric.distance(&a, &b); + assert_eq!(dist, 3.0); + + let metric = DistanceMetric::Cosine; + let dist = metric.distance(&a, &b); + assert!(dist <= 0.01); + } +} diff --git a/src/indices/type_filter.rs b/src/indices/type_filter.rs new file mode 100644 index 00000000..c6a5a000 --- /dev/null +++ b/src/indices/type_filter.rs @@ -0,0 +1,237 @@ +#![allow(missing_docs)] + +use super::*; + +/// Joined multiple filters operation with either AND or OR. +/// +/// The chosen join will be applied to the filters monotonically. +/// So, it's not possible to mix AND and OR filters in the same operation. +#[derive(Debug, PartialEq)] +pub enum Filters { + NONE, + AND(Vec), + OR(Vec), +} + +impl Filters { + pub fn apply( + &self, + data: &HashMap>, + ) -> bool { + match self { + Filters::NONE => true, + Filters::AND(filters) => filters.iter().all(|f| f.apply(data)), + Filters::OR(filters) => filters.iter().any(|f| f.apply(data)), + } + } +} + +impl From<&str> for Filters { + fn from(value: &str) -> Self { + if value.is_empty() { + return Filters::NONE; + } + + const OR: &str = " OR "; + const AND: &str = " AND "; + + // Check which join operator is used. + let or_count = value.matches(OR).count(); + let and_count = value.matches(AND).count(); + + if or_count > 0 && and_count > 0 { + panic!("Mixing AND and OR join operators is not supported."); + } + + let join = if or_count > 0 { OR } else { AND }; + let filters = value.split(join).map(Into::into).collect(); + match join { + OR => Filters::OR(filters), + _ => Filters::AND(filters), + } + } +} + +/// Record metadata filter. 
+///
+/// Using the filter operator, the record metadata can be compared against
+/// a specific value to determine if it should be included in the results.
+#[derive(Debug, PartialEq)]
+pub struct Filter {
+    pub column: ColumnName,
+    pub value: RecordData,
+    pub operator: FilterOperator,
+}
+
+impl Filter {
+    pub fn apply(
+        &self,
+        data: &HashMap<ColumnName, Option<RecordData>>,
+    ) -> bool {
+        let value = match data.get(&self.column).unwrap_or(&None) {
+            Some(value) => value,
+            None => return false,
+        };
+
+        // This alias helps us cut down lines of code.
+        type Type = RecordData;
+        match (value, &self.value) {
+            (Type::Boolean(a), Type::Boolean(b)) => self.match_boolean(*a, *b),
+            (Type::Float(a), Type::Float(b)) => self.match_number(a, b),
+            (Type::Integer(a), Type::Integer(b)) => self.match_number(a, b),
+            (Type::String(a), Type::String(b)) => self.match_string(a, b),
+            _ => false,
+        }
+    }
+
+    fn match_boolean(&self, a: bool, b: bool) -> bool {
+        match self.operator {
+            FilterOperator::Equal => a == b,
+            FilterOperator::NotEqual => a != b,
+            _ => false,
+        }
+    }
+
+    fn match_number<T: PartialOrd>(&self, a: T, b: T) -> bool {
+        match self.operator {
+            FilterOperator::Equal => a == b,
+            FilterOperator::NotEqual => a != b,
+            FilterOperator::GreaterThan => a > b,
+            FilterOperator::GreaterThanOrEqual => a >= b,
+            FilterOperator::LessThan => a < b,
+            FilterOperator::LessThanOrEqual => a <= b,
+            _ => false,
+        }
+    }
+
+    fn match_string(&self, a: &str, b: &str) -> bool {
+        match self.operator {
+            FilterOperator::Contain => a.contains(b),
+            FilterOperator::Equal => a == b,
+            FilterOperator::NotEqual => a != b,
+            _ => false,
+        }
+    }
+}
+
+impl From<&str> for Filter {
+    fn from(value: &str) -> Self {
+        if value.is_empty() {
+            panic!("Filter string cannot be empty.");
+        }
+
+        // Split the filter string into EXACTLY 3 parts.
+        let parts: Vec<&str> = value.splitn(3, ' ').collect();
+        let parts: Vec<&str> = parts.into_iter().map(|p| p.trim()).collect();
+
+        let column = parts[0].to_string();
+        let operator = FilterOperator::from(parts[1]);
+        let value = RecordData::from(parts[2]);
+        Filter { column, value, operator }
+    }
+}
+
+/// Filter operator.
+///
+/// Some of the operators are only applicable to specific data types:
+/// - Contain is only applicable to the string data type.
+/// - Equal and NotEqual are applicable to all data types.
+/// - The rest are applicable to the integer and float data types.
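+///
+/// The string forms accepted by the `From<&str>` implementation below
+/// are: `CONTAINS`, `=`, `!=`, `>`, `>=`, `<`, and `<=`.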
+#[derive(Debug, PartialEq, Eq)] +pub enum FilterOperator { + Contain, + Equal, + NotEqual, + GreaterThan, + GreaterThanOrEqual, + LessThan, + LessThanOrEqual, +} + +impl From<&str> for FilterOperator { + fn from(value: &str) -> Self { + match value { + "CONTAINS" => FilterOperator::Contain, + "=" => FilterOperator::Equal, + "!=" => FilterOperator::NotEqual, + ">" => FilterOperator::GreaterThan, + ">=" => FilterOperator::GreaterThanOrEqual, + "<" => FilterOperator::LessThan, + "<=" => FilterOperator::LessThanOrEqual, + _ => panic!("Invalid filter operator: {}", value), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn create_test_data() -> HashMap> { + let columns = vec!["name", "age", "gpa", "active"]; + let values: Vec = vec![ + "Alice".into(), + RecordData::Integer(20), + RecordData::Float(3.5), + RecordData::Boolean(true), + ]; + + let mut data = HashMap::new(); + for (column, value) in columns.into_iter().zip(values.into_iter()) { + data.insert(column.to_string(), Some(value)); + } + + data + } + + #[test] + fn test_filters_from_string() { + let filters = Filters::from("name CONTAINS Ada"); + let expected = Filters::AND(vec![Filter { + column: "name".to_string(), + value: "Ada".into(), + operator: FilterOperator::Contain, + }]); + + assert_eq!(filters, expected); + + let filters = Filters::from("gpa >= 3.0 OR age < 21"); + let expected = { + let filter_gpa = Filter { + column: "gpa".to_string(), + value: RecordData::Float(3.0), + operator: FilterOperator::GreaterThanOrEqual, + }; + + let filter_age = Filter { + column: "age".to_string(), + value: RecordData::Integer(21), + operator: FilterOperator::LessThan, + }; + + Filters::OR(vec![filter_gpa, filter_age]) + }; + + assert_eq!(filters, expected); + } + + #[test] + fn test_filters_apply() { + let data = create_test_data(); + + let filters = Filters::from("name CONTAINS Alice"); + assert!(filters.apply(&data)); + + let filters = Filters::from("name = Bob"); + assert!(!filters.apply(&data)); + + let filters = Filters::from("age >= 20 AND gpa < 4.0"); + assert!(filters.apply(&data)); + + let filters = Filters::from("age >= 20 AND gpa < 3.0"); + assert!(!filters.apply(&data)); + + let filters = Filters::from("active = true"); + assert!(filters.apply(&data)); + } +} diff --git a/src/lib.rs b/src/lib.rs index f61f8a26..76cc96a1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,9 +4,6 @@ #![doc(html_favicon_url = "https://i.postimg.cc/W3T230zk/favicon.png")] #![doc(html_logo_url = "https://i.postimg.cc/Vv0HPVwB/logo.png")] -#[cfg(test)] -mod tests; - /// Primary module for vector database operations. pub mod db; /// Module for managing database indices and related types. 
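The filter grammar from `type_filter.rs` plugs straight into the `search_with_filters` method on the `VectorIndex` trait. A minimal sketch of a filtered search under the APIs introduced in this patch (the 128-dimension query and the `year` metadata column are assumptions for illustration, not part of the patch):

```rust
// Sketch only: filtered k-NN search with the APIs from this patch.
// Assumes `index` is an already-fitted index whose metadata includes
// a "year" column; the dimension and filter string are illustrative.
use oasysdb::prelude::*;

fn filtered_search(index: &IndexBruteForce) -> Result<(), Error> {
    let query = Vector::from(vec![0.25_f32; 128]);
    let filters = Filters::from("year > 2000");
    for hit in index.search_with_filters(query, 10, filters)? {
        // Results arrive sorted by ascending distance.
        println!("{:?} at distance {:.3}", hit.id, hit.distance);
    }
    Ok(())
}
```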
diff --git a/src/prelude/mod.rs b/src/prelude/mod.rs index 5351c872..3cb3f6a8 100644 --- a/src/prelude/mod.rs +++ b/src/prelude/mod.rs @@ -1,3 +1,4 @@ pub use crate::db::Database; -pub use crate::indices::SourceConfig; -pub use crate::types::err::{Error, ErrorCode}; +pub use crate::indices::*; +pub use crate::types::err::*; +pub use crate::types::record::*; diff --git a/src/tests/mod.rs b/src/tests/mod.rs deleted file mode 100644 index 5056dc21..00000000 --- a/src/tests/mod.rs +++ /dev/null @@ -1,18 +0,0 @@ -use crate::prelude::*; -use crate::types::file; -use std::path::PathBuf; - -mod test_database; - -fn create_test_database() -> Result { - let path = PathBuf::from("odb_data"); - let source_url = { - let db_path = file::get_tmp_dir()?.join("sqlite.db"); - Some(format!("sqlite://{}?mode=rwc", db_path.display())) - }; - - let db = Database::open(path, source_url)?; - let state = db.state(); - assert_eq!(state.source_type(), "sqlite"); - Ok(db) -} diff --git a/src/tests/test_database.rs b/src/tests/test_database.rs deleted file mode 100644 index 42c9a5ea..00000000 --- a/src/tests/test_database.rs +++ /dev/null @@ -1,6 +0,0 @@ -use super::*; - -#[test] -fn test_database_open() { - assert!(create_test_database().is_ok()); -} diff --git a/src/types/conn.rs b/src/types/conn.rs new file mode 100644 index 00000000..efdd72c8 --- /dev/null +++ b/src/types/conn.rs @@ -0,0 +1,2 @@ +pub use sqlx::AnyConnection as SourceConnection; +pub use sqlx::Connection; diff --git a/src/types/err.rs b/src/types/err.rs index 322e35bb..5013b6a1 100644 --- a/src/types/err.rs +++ b/src/types/err.rs @@ -2,6 +2,7 @@ use std::fmt::{Display, Formatter, Result}; // External error types. use bincode::Error as BincodeError; +use serde_json::Error as JSONError; use sqlx::Error as SQLError; use std::error::Error as StandardError; use std::io::Error as IOError; @@ -13,6 +14,14 @@ pub enum ErrorCode { InvalidSource, MissingSource, + // Data type related. + InvalidID, + InvalidVector, + InvalidMetadata, + + // Other generic errors. + InternalError, + // External error types. FileError, SerializationError, @@ -65,3 +74,10 @@ impl From for Error { Error::new(code, err.to_string()) } } + +impl From for Error { + fn from(err: JSONError) -> Self { + let code = ErrorCode::SerializationError; + Error::new(code, err.to_string()) + } +} diff --git a/src/types/mod.rs b/src/types/mod.rs index 47b3284f..421c86e9 100644 --- a/src/types/mod.rs +++ b/src/types/mod.rs @@ -1,6 +1,9 @@ /// Custom error types of OasysDB. pub mod err; +/// Vector record types for indices. +pub mod record; + +/// SQL database connection types. +pub(crate) mod conn; /// File operation utilities. pub(crate) mod file; -/// Vector record types for indices. -pub(crate) mod record; diff --git a/src/types/record.rs b/src/types/record.rs index 3b406913..4c6a7ea5 100644 --- a/src/types/record.rs +++ b/src/types/record.rs @@ -1,42 +1,198 @@ +use crate::types::err::{Error, ErrorCode}; use half::f16; use serde::{Deserialize, Serialize}; +use sqlx::any::AnyRow; +use sqlx::database::HasValueRef; +use sqlx::{Database, Decode, Row, Type}; use std::collections::HashMap; -use uuid::Uuid; +use std::error::Error as StandardError; /// Column name of the SQL data source table. pub type ColumnName = String; /// ID type for records in the index from the data source. #[allow(clippy::upper_case_acronyms)] -#[derive(Debug, Serialize, Deserialize)] -#[derive(PartialEq, Eq, Hash)] -pub enum RecordID { - /// Auto-incrementing integer ID (Most efficient). 
- Integer(usize), - /// String as ID (Not efficient). - String(String), - /// Universally Unique ID (Less efficient). - UUID(Uuid), -} +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +#[derive(Hash, PartialEq, Eq, PartialOrd, Ord)] +pub struct RecordID(pub u32); /// Record type stored in the index based on the /// configuration and data source. #[derive(Debug, Serialize, Deserialize)] pub struct Record { - vector: Vector, - data: Option>, + /// Vector embedding. + pub vector: Vector, + /// Additional metadata of the record. + pub data: HashMap>, } -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] /// Vector data type stored in the index. -pub struct Vector(Vec); +pub struct Vector(pub Vec); + +impl Vector { + /// Returns the vector data as a vector of f32. + pub fn to_vec(&self) -> Vec { + self.0.clone().into_iter().map(f16::to_f32).collect() + } +} + +impl From> for Vector { + fn from(value: Vec) -> Self { + Vector(value.into_iter().map(f16::from_f32).collect()) + } +} /// Data types supported as metadata in the index. #[allow(missing_docs)] -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub enum RecordData { Boolean(bool), Float(f32), Integer(usize), String(String), } + +// RecordData interoperability with primitive types. + +impl From for RecordData { + fn from(value: String) -> Self { + RecordData::from(value.as_str()) + } +} + +impl From<&str> for RecordData { + fn from(value: &str) -> Self { + // Parsing integer must be done before float. + // Since integer can be parsed as float but not vice versa. + if let Ok(integer) = value.parse::() { + return integer.into(); + } + + if let Ok(float) = value.parse::() { + return float.into(); + } + + if let Ok(boolean) = value.parse::() { + return boolean.into(); + } + + RecordData::String(value.to_string()) + } +} + +impl From for RecordData { + fn from(value: f32) -> Self { + RecordData::Float(value) + } +} + +impl From for RecordData { + fn from(value: usize) -> Self { + RecordData::Integer(value) + } +} + +impl From for RecordData { + fn from(value: bool) -> Self { + RecordData::Boolean(value) + } +} + +pub(crate) trait RowOps { + /// Retrieves data from the row based on the column name. 
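+    ///
+    /// Implemented below for `RecordID`, `Vector`, and
+    /// `Option<RecordData>`, so `SourceConfig::to_record` can pull each
+    /// configured column out of a generic `AnyRow`.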
+ fn from_row( + column_name: impl Into, + row: &AnyRow, + ) -> Result + where + Self: Sized; +} + +impl RowOps for RecordID { + fn from_row( + column_name: impl Into, + row: &AnyRow, + ) -> Result { + let column_name: String = column_name.into(); + let id = row.try_get::(&column_name).map_err(|_| { + let code = ErrorCode::InvalidID; + let message = "Unable to get integer ID from the row."; + Error::new(code, message) + })?; + + Ok(RecordID(id as u32)) + } +} + +impl<'r, DB: Database> Decode<'r, DB> for Vector +where + &'r str: Decode<'r, DB>, +{ + fn decode( + value: >::ValueRef, + ) -> Result> { + let value = <&str as Decode>::decode(value)?; + let vector: Vec = serde_json::from_str(value)?; + Ok(Vector(vector.into_iter().map(f16::from_f32).collect())) + } +} + +impl Type for Vector +where + DB: Database, + &'static str: Type, +{ + fn type_info() -> DB::TypeInfo { + <&str as Type>::type_info() + } +} + +impl RowOps for Vector { + fn from_row( + column_name: impl Into, + row: &AnyRow, + ) -> Result { + let column: String = column_name.into(); + let vector = row.try_get::(&column).map_err(|_| { + let code = ErrorCode::InvalidVector; + let message = "Unable to get vector from the row."; + Error::new(code, message) + })?; + + Ok(vector) + } +} + +impl<'r, DB: Database> Decode<'r, DB> for RecordData +where + &'r str: Decode<'r, DB>, +{ + fn decode( + value: >::ValueRef, + ) -> Result> + { + let value = <&str as Decode>::decode(value)?; + Ok(RecordData::from(value)) + } +} + +impl Type for RecordData +where + DB: Database, + &'static str: Type, +{ + fn type_info() -> DB::TypeInfo { + <&str as Type>::type_info() + } +} + +impl RowOps for Option { + fn from_row( + column_name: impl Into, + row: &AnyRow, + ) -> Result { + let column: String = column_name.into(); + Ok(row.try_get::(&column).unwrap_or_default()) + } +} From c590e38f944f597c691eee5deb5c65250ab7ba97 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Fri, 12 Jul 2024 17:02:49 -0500 Subject: [PATCH 43/88] feat(test): add test create index database --- src/db/database.rs | 86 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 78 insertions(+), 8 deletions(-) diff --git a/src/db/database.rs b/src/db/database.rs index 6d7681df..4f59342b 100644 --- a/src/db/database.rs +++ b/src/db/database.rs @@ -66,6 +66,8 @@ impl Database { /// Creates a new index in the database asynchronously. /// - `name`: Name of the index. + /// - `algorithm`: Indexing algorithm to use. + /// - `metric`: Distance metric for the index. /// - `config`: Index data source configuration. pub async fn async_create_index( &mut self, @@ -107,6 +109,19 @@ impl Database { Ok(()) } + /// Creates a new index in the database synchronously. + pub fn create_index( + &mut self, + name: impl Into, + algorithm: IndexAlgorithm, + metric: DistanceMetric, + config: SourceConfig, + ) -> Result<(), Error> { + executor::block_on( + self.async_create_index(name, algorithm, metric, config), + ) + } + /// Returns the state object of the database. 
pub fn state(&self) -> &DatabaseState { &self.state @@ -193,22 +208,77 @@ pub struct IndexRef { #[cfg(test)] mod tests { use super::*; + use sqlx::{Executor, Row}; + + #[test] + fn test_database_open() { + assert!(create_test_database().is_ok()); + } + + #[test] + fn test_database_create_index() { + let mut db = create_test_database().unwrap(); + + let name = "test_index"; + let algorithm = IndexAlgorithm::BruteForce; + let metric = DistanceMetric::Euclidean; + let config = SourceConfig::new("embeddings", "id", "vector") + .with_metadata(vec!["data"]); + + assert!(db.create_index(name, algorithm, metric, config).is_ok()); + } fn create_test_database() -> Result { let path = PathBuf::from("odb_data"); - let source_url = { - let db_path = file::get_tmp_dir()?.join("sqlite.db"); - Some(format!("sqlite://{}?mode=rwc", db_path.display())) - }; + if path.try_exists().is_ok() { + fs::remove_dir_all(&path)?; + } - let db = Database::open(path, source_url)?; + let db_path = file::get_tmp_dir()?.join("sqlite.db"); + let db_url = format!("sqlite://{}?mode=rwc", db_path.display()); + + let db = Database::open(path, Some(db_url.clone()))?; let state = db.state(); assert_eq!(state.source_type(), SourceType::SQLITE); + + executor::block_on(setup_test_source(db_url)).unwrap(); Ok(db) } - #[test] - fn test_database_open() { - assert!(create_test_database().is_ok()); + async fn setup_test_source(url: impl Into) -> Result<(), Error> { + let url: String = url.into(); + let mut conn = SourceConnection::connect(&url).await?; + + let create_table = "CREATE TABLE IF NOT EXISTS embeddings ( + id INTEGER PRIMARY KEY, + vector JSON NOT NULL, + data INTEGER NOT NULL + )"; + + let mut values = vec![]; + for i in 0..100 { + let vector = vec![i as f32; 128]; + let vector = serde_json::to_string(&vector)?; + let data = 1000 + i; + values.push(format!("({vector:?}, {data})")); + } + + let values = values.join(",\n"); + let insert_records = format!( + "INSERT INTO embeddings (vector, data) + VALUES {values}" + ); + + conn.execute("DROP TABLE IF EXISTS embeddings").await?; + conn.execute(create_table).await?; + conn.execute(insert_records.as_str()).await?; + + let count = conn + .fetch_one("SELECT COUNT(*) FROM embeddings") + .await? + .get::(0); + + assert_eq!(count, 100); + Ok(()) } } From 05ef824a8fc9b73560d5b6cb17460cab63b360d3 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Fri, 12 Jul 2024 17:56:31 -0500 Subject: [PATCH 44/88] feat: add create index test --- src/db/database.rs | 54 ++++++++++++++++++++++++++++------- src/indices/type_algorithm.rs | 2 +- src/types/record.rs | 10 +++++++ 3 files changed, 55 insertions(+), 11 deletions(-) diff --git a/src/db/database.rs b/src/db/database.rs index 4f59342b..be551482 100644 --- a/src/db/database.rs +++ b/src/db/database.rs @@ -122,6 +122,14 @@ impl Database { ) } + /// Returns an index reference by name. + /// + /// This method is useful for deserializing and accessing + /// the index directly from the file based on the algorithm type. + pub fn get_index(&self, name: impl AsRef) -> Option<&IndexRef> { + self.state.indices.get(name.as_ref()) + } + /// Returns the state object of the database. 
pub fn state(&self) -> &DatabaseState { &self.state @@ -205,9 +213,20 @@ pub struct IndexRef { file: IndexFile, } +impl IndexRef { + pub fn algorithm(&self) -> &IndexAlgorithm { + &self.algorithm + } + + pub fn file(&self) -> &IndexFile { + &self.file + } +} + #[cfg(test)] mod tests { use super::*; + use crate::prelude::RecordID; use sqlx::{Executor, Row}; #[test] @@ -217,15 +236,13 @@ mod tests { #[test] fn test_database_create_index() { - let mut db = create_test_database().unwrap(); - - let name = "test_index"; - let algorithm = IndexAlgorithm::BruteForce; - let metric = DistanceMetric::Euclidean; - let config = SourceConfig::new("embeddings", "id", "vector") - .with_metadata(vec!["data"]); + let db = create_test_database().unwrap(); + let index_ref = db.get_index("test_index").unwrap(); + let index = IndexBruteForce::load(&index_ref.file()).unwrap(); - assert!(db.create_index(name, algorithm, metric, config).is_ok()); + let metadata = index.metadata(); + assert_eq!(metadata.count, 100); + assert_eq!(metadata.last_inserted, Some(RecordID(100))); } fn create_test_database() -> Result { @@ -237,14 +254,31 @@ mod tests { let db_path = file::get_tmp_dir()?.join("sqlite.db"); let db_url = format!("sqlite://{}?mode=rwc", db_path.display()); - let db = Database::open(path, Some(db_url.clone()))?; + let mut db = Database::open(path, Some(db_url.clone()))?; let state = db.state(); assert_eq!(state.source_type(), SourceType::SQLITE); - executor::block_on(setup_test_source(db_url)).unwrap(); + executor::block_on(setup_test_source(db_url))?; + create_test_index(&mut db)?; Ok(db) } + fn create_test_index(db: &mut Database) -> Result<(), Error> { + let config = SourceConfig::new("embeddings", "id", "vector") + .with_metadata(vec!["data"]); + + db.create_index( + "test_index", + IndexAlgorithm::BruteForce, + DistanceMetric::Euclidean, + config, + )?; + + let index = db.get_index("test_index").unwrap(); + assert_eq!(index.algorithm(), &IndexAlgorithm::BruteForce); + Ok(()) + } + async fn setup_test_source(url: impl Into) -> Result<(), Error> { let url: String = url.into(); let mut conn = SourceConnection::connect(&url).await?; diff --git a/src/indices/type_algorithm.rs b/src/indices/type_algorithm.rs index 6b771e8e..3937ba44 100644 --- a/src/indices/type_algorithm.rs +++ b/src/indices/type_algorithm.rs @@ -5,7 +5,7 @@ use super::*; #[derive(Debug, PartialEq, Eq)] #[derive(Serialize, Deserialize)] pub enum IndexAlgorithm { - BruteForce, + BruteForce, // -> IndexBruteForce } impl IndexAlgorithm { diff --git a/src/types/record.rs b/src/types/record.rs index 4c6a7ea5..7a360c53 100644 --- a/src/types/record.rs +++ b/src/types/record.rs @@ -35,6 +35,16 @@ impl Vector { pub fn to_vec(&self) -> Vec { self.0.clone().into_iter().map(f16::to_f32).collect() } + + /// Returns the dimension of the vector. + pub fn len(&self) -> usize { + self.0.len() + } + + /// Checks if the vector is empty. 
+ pub fn is_empty(&self) -> bool { + self.0.is_empty() + } } impl From> for Vector { From b9458e923e932f05b6fc620f415a021f487ce778 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Fri, 12 Jul 2024 20:39:06 -0500 Subject: [PATCH 45/88] feat: add refresh index method --- src/db/database.rs | 144 ++++++++++++++++++++++++++++------ src/indices/ix_bruteforce.rs | 4 +- src/indices/mod.rs | 35 ++++++--- src/indices/type_algorithm.rs | 21 ++++- src/types/err.rs | 1 + src/types/record.rs | 2 +- 6 files changed, 172 insertions(+), 35 deletions(-) diff --git a/src/db/database.rs b/src/db/database.rs index be551482..b78ec0a3 100644 --- a/src/db/database.rs +++ b/src/db/database.rs @@ -3,6 +3,7 @@ use futures::executor; use futures::stream::StreamExt; use sqlx::any::install_default_drivers; use sqlx::Acquire; +use sqlx::Executor; use url::Url; use uuid::Uuid; @@ -126,10 +127,73 @@ impl Database { /// /// This method is useful for deserializing and accessing /// the index directly from the file based on the algorithm type. - pub fn get_index(&self, name: impl AsRef) -> Option<&IndexRef> { + pub fn get_index_ref(&self, name: impl AsRef) -> Option<&IndexRef> { self.state.indices.get(name.as_ref()) } + /// Retrieves an index from the file and returns it as a trait object. + /// - `name`: Index name. + pub fn get_index( + &self, + name: impl AsRef, + ) -> Option> { + let IndexRef { algorithm, file } = self.get_index_ref(name)?; + algorithm.load_index(file).ok() + } + + /// Updates the index with new records from the source asynchronously. + /// - `name`: Index name. + /// + /// This method checks the index for the last inserted record and queries + /// the source database for new records after that checkpoint. It then + /// updates the index with the new records. + pub async fn async_refresh_index( + &mut self, + name: impl AsRef, + ) -> Result<(), Error> { + let name = name.as_ref(); + let index_ref = self.get_index_ref(name).ok_or_else(|| { + let code = ErrorCode::NotFound; + let message = format!("Index not found: {name}."); + Error::new(code, message) + })?; + + // Cloning is necessary here to avoid borrowing issues. + let IndexRef { algorithm, file } = index_ref.to_owned(); + + // It's safe to unwrap here because we validated that index exists by + // calling get_index_ref method above. + let mut index = self.get_index(name).unwrap(); + + let query = { + let meta = index.metadata(); + let checkpoint = meta.last_inserted.unwrap_or_default(); + index.config().to_query_after(&checkpoint) + }; + + let conn = self.conn.acquire().await?; + let mut stream = sqlx::query(&query).fetch(conn); + + let mut records = HashMap::new(); + while let Some(row) = stream.next().await { + let row = row?; + let (id, record) = index.config().to_record(&row)?; + records.insert(id, record); + } + + index.fit(records)?; + algorithm.persist_index(file, index)?; + Ok(()) + } + + /// Updates the index with new records from the source synchronously. + pub fn refresh_index( + &mut self, + name: impl AsRef, + ) -> Result<(), Error> { + executor::block_on(self.async_refresh_index(name)) + } + /// Returns the state object of the database. pub fn state(&self) -> &DatabaseState { &self.state @@ -150,6 +214,16 @@ impl Database { fn indices_dir(&self) -> PathBuf { self.root.join("indices") } + + #[allow(dead_code)] + async fn async_execute_sql( + &mut self, + query: impl AsRef, + ) -> Result<(), Error> { + let conn = self.conn.acquire().await?; + conn.execute(query.as_ref()).await?; + Ok(()) + } } /// The state of the vector database. 
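Because `async_refresh_index` builds its query with `to_query_after`, only rows whose primary key lies beyond the `last_inserted` checkpoint are fetched and fitted. A hedged sketch of the synchronous wrapper in use (the index name matches the tests below):

```rust
// Sketch: after new rows land in the source table, pull just the
// records past the last_inserted checkpoint into the index.
db.refresh_index("test_index")?;
```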
@@ -207,7 +281,7 @@ impl DatabaseState { } /// Details about the index and where it is stored. -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct IndexRef { algorithm: IndexAlgorithm, file: IndexFile, @@ -229,6 +303,9 @@ mod tests { use crate::prelude::RecordID; use sqlx::{Executor, Row}; + const TABLE: &str = "embeddings"; + const TEST_INDEX: &str = "test_index"; + #[test] fn test_database_open() { assert!(create_test_database().is_ok()); @@ -237,14 +314,29 @@ mod tests { #[test] fn test_database_create_index() { let db = create_test_database().unwrap(); - let index_ref = db.get_index("test_index").unwrap(); - let index = IndexBruteForce::load(&index_ref.file()).unwrap(); + let index = db.get_index(TEST_INDEX).unwrap(); let metadata = index.metadata(); + assert_eq!(metadata.count, 100); assert_eq!(metadata.last_inserted, Some(RecordID(100))); } + #[test] + fn test_database_refresh_index() { + let mut db = create_test_database().unwrap(); + let query = generate_insert_query(100, 10); + executor::block_on(db.async_execute_sql(query)).unwrap(); + + db.refresh_index(TEST_INDEX).unwrap(); + + let index = db.get_index(TEST_INDEX).unwrap(); + let metadata = index.metadata(); + + assert_eq!(metadata.count, 110); + assert_eq!(metadata.last_inserted, Some(RecordID(110))); + } + fn create_test_database() -> Result { let path = PathBuf::from("odb_data"); if path.try_exists().is_ok() { @@ -264,47 +356,55 @@ mod tests { } fn create_test_index(db: &mut Database) -> Result<(), Error> { - let config = SourceConfig::new("embeddings", "id", "vector") + let config = SourceConfig::new(TABLE, "id", "vector") .with_metadata(vec!["data"]); db.create_index( - "test_index", + TEST_INDEX, IndexAlgorithm::BruteForce, DistanceMetric::Euclidean, config, )?; - let index = db.get_index("test_index").unwrap(); - assert_eq!(index.algorithm(), &IndexAlgorithm::BruteForce); + let index_ref = db.get_index_ref(TEST_INDEX).unwrap(); + assert_eq!(index_ref.algorithm(), &IndexAlgorithm::BruteForce); Ok(()) } - async fn setup_test_source(url: impl Into) -> Result<(), Error> { - let url: String = url.into(); - let mut conn = SourceConnection::connect(&url).await?; - - let create_table = "CREATE TABLE IF NOT EXISTS embeddings ( - id INTEGER PRIMARY KEY, - vector JSON NOT NULL, - data INTEGER NOT NULL - )"; + fn generate_insert_query(start: u8, count: u8) -> String { + let start = start as u16; + let end = start + count as u16; let mut values = vec![]; - for i in 0..100 { + for i in start..end { let vector = vec![i as f32; 128]; - let vector = serde_json::to_string(&vector)?; + let vector = serde_json::to_string(&vector).unwrap(); let data = 1000 + i; values.push(format!("({vector:?}, {data})")); } let values = values.join(",\n"); - let insert_records = format!( - "INSERT INTO embeddings (vector, data) + format!( + "INSERT INTO {TABLE} (vector, data) VALUES {values}" + ) + } + + async fn setup_test_source(url: impl Into) -> Result<(), Error> { + let url: String = url.into(); + let mut conn = SourceConnection::connect(&url).await?; + + let create_table = format!( + "CREATE TABLE IF NOT EXISTS {TABLE} ( + id INTEGER PRIMARY KEY, + vector JSON NOT NULL, + data INTEGER NOT NULL)" ); + let insert_records = generate_insert_query(0, 100); + conn.execute("DROP TABLE IF EXISTS embeddings").await?; - conn.execute(create_table).await?; + conn.execute(create_table.as_str()).await?; conn.execute(insert_records.as_str()).await?; let count = conn diff --git a/src/indices/ix_bruteforce.rs 
b/src/indices/ix_bruteforce.rs index 5553f205..c417272e 100644 --- a/src/indices/ix_bruteforce.rs +++ b/src/indices/ix_bruteforce.rs @@ -23,7 +23,9 @@ impl IndexOps for IndexBruteForce { data: HashMap::new(), } } +} +impl VectorIndex for IndexBruteForce { fn config(&self) -> &SourceConfig { &self.config } @@ -35,9 +37,7 @@ impl IndexOps for IndexBruteForce { fn metadata(&self) -> &IndexMetadata { &self.metadata } -} -impl VectorIndex for IndexBruteForce { fn fit(&mut self, records: HashMap) -> Result<(), Error> { if records.is_empty() { return Ok(()); diff --git a/src/indices/mod.rs b/src/indices/mod.rs index 9e6c50f7..f498d9cc 100644 --- a/src/indices/mod.rs +++ b/src/indices/mod.rs @@ -135,6 +135,23 @@ impl SourceConfig { query.trim().to_string() } + /// Generates a SQL query string based on the configuration and checkpoint. + /// Instead of returning a query to fetch all records, this method returns + /// a query to fetch records from a specific RecordID. + /// - `checkpoint`: Record ID to start the query from. + pub(crate) fn to_query_after(&self, checkpoint: &RecordID) -> String { + let table = &self.table; + let columns = self.columns().join(", "); + + let mut filter = format!("WHERE id > {}", checkpoint.0); + if let Some(string) = &self.filter { + filter.push_str(&format!(" AND ({string})")); + } + + let query = format!("SELECT {columns} FROM {table} {filter}"); + query.trim().to_string() + } + /// Creates a tuple of record ID and record data from a row. pub(crate) fn to_record( &self, @@ -216,15 +233,6 @@ pub trait IndexOps: Debug + Serialize + DeserializeOwned { fn persist(&self, path: impl AsRef) -> Result<(), Error> { file::write_binary_file(path, self) } - - /// Returns the configuration of the index. - fn config(&self) -> &SourceConfig; - - /// Returns the distance metric used by the index. - fn metric(&self) -> &DistanceMetric; - - /// Returns metadata about the index. - fn metadata(&self) -> &IndexMetadata; } /// Trait for vector index implementations. @@ -243,6 +251,15 @@ pub trait IndexOps: Debug + Serialize + DeserializeOwned { /// } /// ``` pub trait VectorIndex: Debug { + /// Returns the configuration of the index. + fn config(&self) -> &SourceConfig; + + /// Returns the distance metric used by the index. + fn metric(&self) -> &DistanceMetric; + + /// Returns metadata about the index. + fn metadata(&self) -> &IndexMetadata; + /// Trains the index based on the new records. /// /// If the index has been trained and not empty, this method diff --git a/src/indices/type_algorithm.rs b/src/indices/type_algorithm.rs index 3937ba44..31fa3d13 100644 --- a/src/indices/type_algorithm.rs +++ b/src/indices/type_algorithm.rs @@ -2,7 +2,7 @@ use super::*; /// Algorithm options used to index and search vectors. #[allow(missing_docs)] -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, PartialEq, Eq, Clone)] #[derive(Serialize, Deserialize)] pub enum IndexAlgorithm { BruteForce, // -> IndexBruteForce @@ -22,6 +22,18 @@ impl IndexAlgorithm { Box::new(index) } + pub(crate) fn load_index( + &self, + path: impl AsRef, + ) -> Result, Error> { + match self { + IndexAlgorithm::BruteForce => { + let index = Self::_load_index::(path)?; + Ok(Box::new(index)) + } + } + } + /// Persists the index to a file based on the algorithm. /// - `path`: Path to the file where the index will be stored. /// - `index`: Index to persist as a trait object. 
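The `_load_index` and `_persist_index` helpers in this patch lean on a common Rust pattern: the enum variant picks the concrete type, and `as_any` downcasting recovers that type from the `dyn` trait object before (de)serialization. A self-contained sketch of the same pattern, with illustrative stand-in names rather than OasysDB's own types:

```rust
use std::any::Any;
use std::fmt::Debug;

// Stand-ins for the crate's VectorIndex trait and a concrete index.
trait AnyIndex: Debug {
    fn as_any(&self) -> &dyn Any;
}

#[derive(Debug)]
struct BruteForce {
    count: usize,
}

impl AnyIndex for BruteForce {
    fn as_any(&self) -> &dyn Any {
        self
    }
}

// Recover the concrete type from the trait object so a serde-based
// writer could serialize it; downcast_ref returns None if the trait
// object actually holds a different concrete type.
fn persist(index: &dyn AnyIndex) -> Result<(), String> {
    let index: &BruteForce = index
        .as_any()
        .downcast_ref::<BruteForce>()
        .ok_or("failed to downcast index to concrete type")?;
    println!("persisting index with {} records", index.count);
    Ok(())
}

fn main() {
    let index = BruteForce { count: 100 };
    persist(&index).unwrap();
}
```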
@@ -37,6 +49,13 @@ impl IndexAlgorithm { } } + fn _load_index( + path: impl AsRef, + ) -> Result { + let index = T::load(path)?; + Ok(index) + } + fn _persist_index( path: impl AsRef, index: Box, diff --git a/src/types/err.rs b/src/types/err.rs index 5013b6a1..76321e80 100644 --- a/src/types/err.rs +++ b/src/types/err.rs @@ -21,6 +21,7 @@ pub enum ErrorCode { // Other generic errors. InternalError, + NotFound, // External error types. FileError, diff --git a/src/types/record.rs b/src/types/record.rs index 7a360c53..57dbf0d4 100644 --- a/src/types/record.rs +++ b/src/types/record.rs @@ -13,7 +13,7 @@ pub type ColumnName = String; /// ID type for records in the index from the data source. #[allow(clippy::upper_case_acronyms)] #[derive(Debug, Clone, Copy, Serialize, Deserialize)] -#[derive(Hash, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Hash, PartialEq, Eq, PartialOrd, Ord, Default)] pub struct RecordID(pub u32); /// Record type stored in the index based on the From 61f08a83ec0ab6e8d9f8c6fb38354124f1daaf81 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Sat, 13 Jul 2024 18:14:06 -0500 Subject: [PATCH 46/88] feat: add search index method --- src/db/database.rs | 40 ++++++++++++++++++++++++++++++++++++++++ src/db/mod.rs | 1 + 2 files changed, 41 insertions(+) diff --git a/src/db/database.rs b/src/db/database.rs index b78ec0a3..47df4ed5 100644 --- a/src/db/database.rs +++ b/src/db/database.rs @@ -141,6 +141,21 @@ impl Database { algorithm.load_index(file).ok() } + /// Retrieves an index from the file and if found, returns it as a + /// trait object. Otherwise, returns a not found error. + /// - `name`: Index name. + pub fn try_get_index( + &self, + name: impl AsRef, + ) -> Result, Error> { + let name = name.as_ref(); + self.get_index(name).ok_or_else(|| { + let code = ErrorCode::NotFound; + let message = format!("Index not found in database: {name}."); + Error::new(code, message) + }) + } + /// Updates the index with new records from the source asynchronously. /// - `name`: Index name. /// @@ -194,6 +209,20 @@ impl Database { executor::block_on(self.async_refresh_index(name)) } + /// Searches the index for the nearest vectors to the query vector. + /// - `name`: Index name. + /// - `query`: Query vector. + /// - `k`: Number of nearest neighbors to return. + pub fn search_index( + &self, + name: impl AsRef, + query: impl Into, + k: usize, + ) -> Result, Error> { + let index = self.try_get_index(name)?; + index.search(query.into(), k) + } + /// Returns the state object of the database. 
pub fn state(&self) -> &DatabaseState { &self.state @@ -337,6 +366,17 @@ mod tests { assert_eq!(metadata.last_inserted, Some(RecordID(110))); } + #[test] + fn test_database_search_index() { + let db = create_test_database().unwrap(); + let query = vec![0.0; 128]; + let results = db.search_index(TEST_INDEX, query, 5).unwrap(); + + assert_eq!(results.len(), 5); + assert_eq!(results[0].id, RecordID(1)); + assert_eq!(results[0].distance, 0.0); + } + fn create_test_database() -> Result { let path = PathBuf::from("odb_data"); if path.try_exists().is_ok() { diff --git a/src/db/mod.rs b/src/db/mod.rs index 059b34d5..fc90ec40 100644 --- a/src/db/mod.rs +++ b/src/db/mod.rs @@ -2,6 +2,7 @@ use crate::indices::*; use crate::types::conn::*; use crate::types::err::*; use crate::types::file; +use crate::types::record::Vector; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::fs; From 88edf1228d3be39c73a21daa5fabc779a4a196a3 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Sun, 14 Jul 2024 17:41:22 -0500 Subject: [PATCH 47/88] feat: add search with filters and sql to metadata type improvement --- src/db/database.rs | 29 ++++++++++ src/types/record.rs | 128 ++++++++++++++++++++++++-------------------- 2 files changed, 98 insertions(+), 59 deletions(-) diff --git a/src/db/database.rs b/src/db/database.rs index 47df4ed5..4ba1575a 100644 --- a/src/db/database.rs +++ b/src/db/database.rs @@ -223,6 +223,22 @@ impl Database { index.search(query.into(), k) } + /// Searches the index for nearest neighbors with post-search filters. + /// - `name`: Index name. + /// - `query`: Query vector. + /// - `k`: Number of nearest neighbors to return. + /// - `filters`: SQL-like filters to apply. + pub fn search_index_with_filters( + &self, + name: impl AsRef, + query: impl Into, + k: usize, + filters: impl Into, + ) -> Result, Error> { + let index = self.try_get_index(name)?; + index.search_with_filters(query.into(), k, filters.into()) + } + /// Returns the state object of the database. pub fn state(&self) -> &DatabaseState { &self.state @@ -377,6 +393,19 @@ mod tests { assert_eq!(results[0].distance, 0.0); } + #[test] + fn test_database_search_index_with_filters() { + let db = create_test_database().unwrap(); + let query = vec![0.0; 128]; + let filters = Filters::from("data >= 1050"); + let results = db + .search_index_with_filters(TEST_INDEX, query, 5, filters) + .unwrap(); + + assert_eq!(results.len(), 5); + assert_eq!(results[0].id, RecordID(51)); + } + fn create_test_database() -> Result { let path = PathBuf::from("odb_data"); if path.try_exists().is_ok() { diff --git a/src/types/record.rs b/src/types/record.rs index 57dbf0d4..c59bc9a3 100644 --- a/src/types/record.rs +++ b/src/types/record.rs @@ -2,10 +2,9 @@ use crate::types::err::{Error, ErrorCode}; use half::f16; use serde::{Deserialize, Serialize}; use sqlx::any::AnyRow; -use sqlx::database::HasValueRef; -use sqlx::{Database, Decode, Row, Type}; +use sqlx::postgres::any::AnyTypeInfoKind as SQLType; +use sqlx::{Row, ValueRef}; use std::collections::HashMap; -use std::error::Error as StandardError; /// Column name of the SQL data source table. pub type ColumnName = String; @@ -59,7 +58,7 @@ impl From> for Vector { pub enum RecordData { Boolean(bool), Float(f32), - Integer(usize), + Integer(isize), String(String), } @@ -75,7 +74,7 @@ impl From<&str> for RecordData { fn from(value: &str) -> Self { // Parsing integer must be done before float. // Since integer can be parsed as float but not vice versa. 
-        if let Ok(integer) = value.parse::<usize>() {
+        if let Ok(integer) = value.parse::<isize>() {
             return integer.into();
         }
 
@@ -97,8 +96,8 @@ impl From<f32> for RecordData {
     }
 }
 
-impl From<usize> for RecordData {
-    fn from(value: usize) -> Self {
+impl From<isize> for RecordData {
+    fn from(value: isize) -> Self {
         RecordData::Integer(value)
     }
 }
@@ -135,65 +134,38 @@ impl RowOps for RecordID {
     }
 }
 
-impl<'r, DB: Database> Decode<'r, DB> for Vector
-where
-    &'r str: Decode<'r, DB>,
-{
-    fn decode(
-        value: <DB as HasValueRef<'r>>::ValueRef,
-    ) -> Result<Self, Box<dyn StandardError + Send + Sync>> {
-        let value = <&str as Decode<DB>>::decode(value)?;
-        let vector: Vec<f32> = serde_json::from_str(value)?;
-        Ok(Vector(vector.into_iter().map(f16::from_f32).collect()))
-    }
-}
-
-impl<DB> Type<DB> for Vector
-where
-    DB: Database,
-    &'static str: Type<DB>,
-{
-    fn type_info() -> DB::TypeInfo {
-        <&str as Type<DB>>::type_info()
-    }
-}
-
 impl RowOps for Vector {
     fn from_row(
         column_name: impl Into<String>,
         row: &AnyRow,
     ) -> Result<Self, Error> {
         let column: String = column_name.into();
-        let vector = row.try_get::<Vector, &str>(&column).map_err(|_| {
-            let code = ErrorCode::InvalidVector;
-            let message = "Unable to get vector from the row.";
-            Error::new(code, message)
-        })?;
-
-        Ok(vector)
-    }
-}
+        let value = row.try_get_raw::<&str>(&column)?;
+        let value_type = value.type_info().kind();
 
-impl<'r, DB: Database> Decode<'r, DB> for RecordData
-where
-    &'r str: Decode<'r, DB>,
-{
-    fn decode(
-        value: <DB as HasValueRef<'r>>::ValueRef,
-    ) -> Result<Self, Box<dyn StandardError + Send + Sync>>
-    {
-        let value = <&str as Decode<DB>>::decode(value)?;
-        Ok(RecordData::from(value))
-    }
-}
+        if value_type == SQLType::Null {
+            let code = ErrorCode::InvalidVector;
+            let message = "Vector must not be empty or null.";
+            return Err(Error::new(code, message));
+        }
 
-impl<DB> Type<DB> for RecordData
-where
-    DB: Database,
-    &'static str: Type<DB>,
-{
-    fn type_info() -> DB::TypeInfo {
-        <&str as Type<DB>>::type_info()
+        match value_type {
+            SQLType::Text => {
+                let value = row.try_get::<String, &str>(&column)?;
+                let vector: Vec<f32> = serde_json::from_str(&value)?;
+                Ok(Vector::from(vector))
+            }
+            SQLType::Blob => {
+                let value = row.try_get::<Vec<u8>, &str>(&column)?;
+                let vector: Vec<f32> = bincode::deserialize(&value)?;
+                Ok(Vector::from(vector))
+            }
+            _ => {
+                let code = ErrorCode::InvalidVector;
+                let message = "Vector must be stored as JSON string or blob.";
+                Err(Error::new(code, message))
+            }
+        }
     }
 }
 
@@ -203,6 +175,44 @@ impl RowOps for Option<RecordData> {
         row: &AnyRow,
     ) -> Result<Self, Error> {
         let column: String = column_name.into();
-        Ok(row.try_get::<Self, &str>(&column).unwrap_or_default())
+        let value = row.try_get_raw::<&str>(&column)?;
+        let value_type = value.type_info().kind();
+
+        if value_type == SQLType::Null {
+            return Ok(None);
+        }
+
+        if value_type.is_integer() {
+            let value: i64 = row.try_get::<i64, &str>(&column)?;
+            return Ok(Some(RecordData::Integer(value as isize)));
+        }
+
+        // Handle types other than null and integer below.
+
+        let data = match value_type {
+            SQLType::Text => {
+                let value = row.try_get::<String, &str>(&column)?;
+                RecordData::String(value.to_string())
+            }
+            SQLType::Bool => {
+                let value: bool = row.try_get::<bool, &str>(&column)?;
+                RecordData::Boolean(value)
+            }
+            SQLType::Real => {
+                let value: f32 = row.try_get::<f32, &str>(&column)?;
+                RecordData::Float(value)
+            }
+            SQLType::Double => {
+                let value: f64 = row.try_get::<f64, &str>(&column)?;
+                RecordData::Float(value as f32)
+            }
+            _ => {
+                let code = ErrorCode::InvalidMetadata;
+                let message = "Unsupported type for OasysDB metadata.";
+                return Err(Error::new(code, message));
+            }
+        };
+
+        Ok(Some(data))
     }
 }

From 323c74d457e3695a6300274b036af4fe912be55b Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Sun, 14 Jul 2024 20:00:53 -0500
Subject: [PATCH 48/88] feat: add method to rebuild index

---
 src/db/database.rs | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/src/db/database.rs b/src/db/database.rs
index 4ba1575a..993a38a2 100644
--- a/src/db/database.rs
+++ b/src/db/database.rs
@@ -239,6 +239,26 @@ impl Database {
         index.search_with_filters(query.into(), k, filters.into())
     }
 
+    /// Rebuilds the index from the existing records in the index.
+    /// - `name`: Index name.
+    ///
+    /// Some indexing algorithms may not support perfect incremental updates.
+    /// This method can be useful to rebalance the index.
+    pub fn rebuild_index(
+        &mut self,
+        name: impl AsRef<str>,
+    ) -> Result<(), Error> {
+        let name = name.as_ref();
+        let mut index = self.try_get_index(name)?;
+        index.refit()?;
+
+        // Unwrap is safe here because we validated that the index exists above.
+        let IndexRef { algorithm, file } = self.get_index_ref(name).unwrap();
+        algorithm.persist_index(file, index)?;
+
+        Ok(())
+    }
+
     /// Returns the state object of the database.
     pub fn state(&self) -> &DatabaseState {
         &self.state
@@ -406,6 +426,15 @@ mod tests {
         assert_eq!(results[0].id, RecordID(51));
     }
 
+    #[test]
+    fn test_database_rebuild_index() {
+        let mut db = create_test_database().unwrap();
+        db.rebuild_index(TEST_INDEX).unwrap();
+
+        let index = db.get_index(TEST_INDEX).unwrap();
+        assert_eq!(index.metadata().count, 100);
+    }
+
     fn create_test_database() -> Result<Database, Error> {
         let path = PathBuf::from("odb_data");
         if path.try_exists().is_ok() {

From fc94474af1a6c56a755a8bcb2bbeedb682aca6d2 Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Mon, 15 Jul 2024 14:07:10 -0500
Subject: [PATCH 49/88] feat: add delete index method

---
 src/db/database.rs | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/src/db/database.rs b/src/db/database.rs
index 993a38a2..b2dd323d 100644
--- a/src/db/database.rs
+++ b/src/db/database.rs
@@ -259,6 +259,19 @@ impl Database {
         Ok(())
     }
 
+    /// Deletes an index from the database given its name.
+    pub fn delete_index(&mut self, name: impl AsRef<str>) -> Result<(), Error> {
+        let name = name.as_ref();
+        let index_ref = self.state.indices.remove(name).ok_or_else(|| {
+            let code = ErrorCode::NotFound;
+            let message = format!("Index doesn't exist: {name}.");
+            Error::new(code, message)
+        })?;
+
+        fs::remove_file(index_ref.file())?;
+        file::write_binary_file(self.state_file(), &self.state)
+    }
+
     /// Returns the state object of the database.
pub fn state(&self) -> &DatabaseState { &self.state @@ -435,6 +448,15 @@ mod tests { assert_eq!(index.metadata().count, 100); } + #[test] + fn test_database_delete_index() { + let mut db = create_test_database().unwrap(); + db.delete_index(TEST_INDEX).unwrap(); + + let state = db.state(); + assert!(!state.indices.contains_key(TEST_INDEX)); + } + fn create_test_database() -> Result { let path = PathBuf::from("odb_data"); if path.try_exists().is_ok() { From 020f24514c71c125243fc61180236e7b7565bc55 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Mon, 15 Jul 2024 19:51:43 -0500 Subject: [PATCH 50/88] feat: change db state with mutex --- src/db/database.rs | 105 ++++++++++++++++++++++++++++----------------- src/types/err.rs | 11 ++++- 2 files changed, 76 insertions(+), 40 deletions(-) diff --git a/src/db/database.rs b/src/db/database.rs index b2dd323d..42fdabc4 100644 --- a/src/db/database.rs +++ b/src/db/database.rs @@ -4,6 +4,7 @@ use futures::stream::StreamExt; use sqlx::any::install_default_drivers; use sqlx::Acquire; use sqlx::Executor; +use std::sync::Mutex; use url::Url; use uuid::Uuid; @@ -11,11 +12,14 @@ type DatabaseURL = String; type IndexName = String; type IndexFile = PathBuf; +/// A pool of indices loaded to the database memory. +type IndicesPool = HashMap>; + /// The vector database interface. pub struct Database { root: PathBuf, - state: DatabaseState, - conn: SourceConnection, + state: Mutex, + pool: IndicesPool, } impl Database { @@ -61,8 +65,10 @@ impl Database { state }; - let conn: SourceConnection = state.connect()?; - Ok(Self { root: root_dir, state, conn }) + state.validate_connection()?; + let state = Mutex::new(state); + let pool: IndicesPool = HashMap::new(); + Ok(Self { root: root_dir, state, pool }) } /// Creates a new index in the database asynchronously. @@ -71,14 +77,12 @@ impl Database { /// - `metric`: Distance metric for the index. /// - `config`: Index data source configuration. pub async fn async_create_index( - &mut self, + &self, name: impl Into, algorithm: IndexAlgorithm, metric: DistanceMetric, config: SourceConfig, ) -> Result<(), Error> { - let state_file = self.state_file(); - // Create a new file where the index will be stored. let index_file = { let uuid = Uuid::new_v4().to_string(); @@ -86,8 +90,8 @@ impl Database { }; let query = config.to_query(); - let conn = self.conn.acquire().await?; - let mut stream = sqlx::query(&query).fetch(conn); + let mut conn = self.state()?.async_connect().await?; + let mut stream = sqlx::query(&query).fetch(conn.acquire().await?); let mut records = HashMap::new(); while let Some(row) = stream.next().await { @@ -104,15 +108,18 @@ impl Database { // Update db state with the new index. let index_ref = IndexRef { algorithm, file: index_file.clone() }; - self.state.indices.insert(name.into(), index_ref); - file::write_binary_file(&state_file, &self.state)?; + let mut state = self.state.lock()?; + state.indices.insert(name.into(), index_ref); + + drop(state); + self.persist_state()?; Ok(()) } /// Creates a new index in the database synchronously. pub fn create_index( - &mut self, + &self, name: impl Into, algorithm: IndexAlgorithm, metric: DistanceMetric, @@ -127,8 +134,10 @@ impl Database { /// /// This method is useful for deserializing and accessing /// the index directly from the file based on the algorithm type. 
- pub fn get_index_ref(&self, name: impl AsRef) -> Option<&IndexRef> { - self.state.indices.get(name.as_ref()) + pub fn get_index_ref(&self, name: impl AsRef) -> Option { + let state = self.state.lock().ok()?; + let index_ref = state.indices.get(name.as_ref())?; + Some(index_ref.to_owned()) } /// Retrieves an index from the file and returns it as a trait object. @@ -163,7 +172,7 @@ impl Database { /// the source database for new records after that checkpoint. It then /// updates the index with the new records. pub async fn async_refresh_index( - &mut self, + &self, name: impl AsRef, ) -> Result<(), Error> { let name = name.as_ref(); @@ -186,8 +195,8 @@ impl Database { index.config().to_query_after(&checkpoint) }; - let conn = self.conn.acquire().await?; - let mut stream = sqlx::query(&query).fetch(conn); + let mut conn = self.state()?.async_connect().await?; + let mut stream = sqlx::query(&query).fetch(conn.acquire().await?); let mut records = HashMap::new(); while let Some(row) = stream.next().await { @@ -202,10 +211,7 @@ impl Database { } /// Updates the index with new records from the source synchronously. - pub fn refresh_index( - &mut self, - name: impl AsRef, - ) -> Result<(), Error> { + pub fn refresh_index(&self, name: impl AsRef) -> Result<(), Error> { executor::block_on(self.async_refresh_index(name)) } @@ -244,10 +250,7 @@ impl Database { /// /// Some indexing algorithms may not support perfect incremental updates. /// This method can be useful to rebalance the index. - pub fn rebuild_index( - &mut self, - name: impl AsRef, - ) -> Result<(), Error> { + pub fn rebuild_index(&self, name: impl AsRef) -> Result<(), Error> { let name = name.as_ref(); let mut index = self.try_get_index(name)?; index.refit()?; @@ -260,26 +263,34 @@ impl Database { } /// Deletes an index from the database given its name. - pub fn delete_index(&mut self, name: impl AsRef) -> Result<(), Error> { + pub fn delete_index(&self, name: impl AsRef) -> Result<(), Error> { let name = name.as_ref(); - let index_ref = self.state.indices.remove(name).ok_or_else(|| { + let mut state = self.state.lock()?; + let index_ref = state.indices.remove(name).ok_or_else(|| { let code = ErrorCode::NotFound; let message = format!("Index doesn't exist: {name}."); Error::new(code, message) })?; + drop(state); fs::remove_file(index_ref.file())?; - file::write_binary_file(self.state_file(), &self.state) + self.persist_state() } /// Returns the state object of the database. - pub fn state(&self) -> &DatabaseState { - &self.state + pub fn state(&self) -> Result { + let state = self.state.lock()?; + Ok(state.clone()) } /// Persists the state of the database to the state file. + /// + /// This method requires a Mutex lock to be available. + /// If the lock is not available, this method will be suspended. + /// When running this method with other state lock, drop + /// the lock before calling this method. pub fn persist_state(&self) -> Result<(), Error> { - file::write_binary_file(self.state_file(), &self.state) + file::write_binary_file(self.state_file(), &self.state()?) } } @@ -295,17 +306,17 @@ impl Database { #[allow(dead_code)] async fn async_execute_sql( - &mut self, + &self, query: impl AsRef, ) -> Result<(), Error> { - let conn = self.conn.acquire().await?; + let mut conn = self.state()?.async_connect().await?; conn.execute(query.as_ref()).await?; Ok(()) } } /// The state of the vector database. 
-#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct DatabaseState { source: DatabaseURL, indices: HashMap, @@ -323,6 +334,22 @@ impl DatabaseState { executor::block_on(self.async_connect()) } + /// Disconnects from the source SQL database asynchronously. + pub async fn async_disconnect(conn: SourceConnection) -> Result<(), Error> { + Ok(conn.close().await?) + } + + /// Disconnects from the source SQL database. + pub fn disconnect(conn: SourceConnection) -> Result<(), Error> { + executor::block_on(Self::async_disconnect(conn)) + } + + /// Validates the connection to the source database successful. + pub fn validate_connection(&self) -> Result<(), Error> { + let conn = self.connect()?; + DatabaseState::disconnect(conn) + } + /// Returns the type of the source database. /// - sqlite /// - mysql @@ -402,7 +429,7 @@ mod tests { #[test] fn test_database_refresh_index() { - let mut db = create_test_database().unwrap(); + let db = create_test_database().unwrap(); let query = generate_insert_query(100, 10); executor::block_on(db.async_execute_sql(query)).unwrap(); @@ -441,7 +468,7 @@ mod tests { #[test] fn test_database_rebuild_index() { - let mut db = create_test_database().unwrap(); + let db = create_test_database().unwrap(); db.rebuild_index(TEST_INDEX).unwrap(); let index = db.get_index(TEST_INDEX).unwrap(); @@ -450,10 +477,10 @@ mod tests { #[test] fn test_database_delete_index() { - let mut db = create_test_database().unwrap(); + let db = create_test_database().unwrap(); db.delete_index(TEST_INDEX).unwrap(); - let state = db.state(); + let state = db.state().unwrap(); assert!(!state.indices.contains_key(TEST_INDEX)); } @@ -467,7 +494,7 @@ mod tests { let db_url = format!("sqlite://{}?mode=rwc", db_path.display()); let mut db = Database::open(path, Some(db_url.clone()))?; - let state = db.state(); + let state = db.state()?; assert_eq!(state.source_type(), SourceType::SQLITE); executor::block_on(setup_test_source(db_url))?; diff --git a/src/types/err.rs b/src/types/err.rs index 76321e80..aa408156 100644 --- a/src/types/err.rs +++ b/src/types/err.rs @@ -6,6 +6,7 @@ use serde_json::Error as JSONError; use sqlx::Error as SQLError; use std::error::Error as StandardError; use std::io::Error as IOError; +use std::sync::PoisonError; #[allow(missing_docs)] #[derive(Debug, Eq, PartialEq)] @@ -16,14 +17,15 @@ pub enum ErrorCode { // Data type related. InvalidID, - InvalidVector, InvalidMetadata, + InvalidVector, // Other generic errors. InternalError, NotFound, // External error types. 
+ ConcurrencyError, FileError, SerializationError, SQLError, @@ -82,3 +84,10 @@ impl From for Error { Error::new(code, err.to_string()) } } + +impl From> for Error { + fn from(err: PoisonError) -> Self { + let code = ErrorCode::ConcurrencyError; + Error::new(code, err.to_string()) + } +} From 8a809777463bff6a4dadcde9bc4191890b88ad45 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Tue, 16 Jul 2024 15:51:17 -0500 Subject: [PATCH 51/88] feat: add indices pool to db --- src/db/database.rs | 217 ++++++++++++++++++++++++---------- src/indices/mod.rs | 33 ++++-- src/indices/type_algorithm.rs | 4 +- src/indices/type_filter.rs | 10 +- src/types/record.rs | 19 ++- 5 files changed, 195 insertions(+), 88 deletions(-) diff --git a/src/db/database.rs b/src/db/database.rs index 42fdabc4..1aa25195 100644 --- a/src/db/database.rs +++ b/src/db/database.rs @@ -3,8 +3,7 @@ use futures::executor; use futures::stream::StreamExt; use sqlx::any::install_default_drivers; use sqlx::Acquire; -use sqlx::Executor; -use std::sync::Mutex; +use std::sync::{Arc, Mutex}; use url::Url; use uuid::Uuid; @@ -12,8 +11,8 @@ type DatabaseURL = String; type IndexName = String; type IndexFile = PathBuf; -/// A pool of indices loaded to the database memory. -type IndicesPool = HashMap>; +type Index = Arc>>; +type IndicesPool = Mutex>; /// The vector database interface. pub struct Database { @@ -39,7 +38,7 @@ impl Database { /// ``` pub fn open( root: impl Into, - source_url: Option>, + source_url: Option>, ) -> Result { let root_dir: PathBuf = root.into(); let indices_dir = root_dir.join("indices"); @@ -57,7 +56,7 @@ impl Database { Error::new(code, message) })?; - let source: String = source.into(); + let source = source.into(); DatabaseState::validate_source(&source)?; let state = DatabaseState { source, indices: HashMap::new() }; @@ -67,7 +66,7 @@ impl Database { state.validate_connection()?; let state = Mutex::new(state); - let pool: IndicesPool = HashMap::new(); + let pool: IndicesPool = Mutex::new(HashMap::new()); Ok(Self { root: root_dir, state, pool }) } @@ -78,7 +77,7 @@ impl Database { /// - `config`: Index data source configuration. pub async fn async_create_index( &self, - name: impl Into, + name: impl Into, algorithm: IndexAlgorithm, metric: DistanceMetric, config: SourceConfig, @@ -104,12 +103,16 @@ impl Database { index.fit(records)?; // Persist the index to the file. - algorithm.persist_index(&index_file, index)?; + algorithm.persist_index(&index_file, index.as_ref())?; + + let index_name: IndexName = name.into(); + let mut pool = self.pool.lock()?; + pool.insert(index_name.clone(), Arc::new(Mutex::new(index))); // Update db state with the new index. - let index_ref = IndexRef { algorithm, file: index_file.clone() }; + let index_ref = IndexRef { algorithm, file: index_file }; let mut state = self.state.lock()?; - state.indices.insert(name.into(), index_ref); + state.indices.insert(index_name, index_ref); drop(state); self.persist_state()?; @@ -120,7 +123,7 @@ impl Database { /// Creates a new index in the database synchronously. pub fn create_index( &self, - name: impl Into, + name: impl Into, algorithm: IndexAlgorithm, metric: DistanceMetric, config: SourceConfig, @@ -140,23 +143,31 @@ impl Database { Some(index_ref.to_owned()) } - /// Retrieves an index from the file and returns it as a trait object. + /// Retrieves an index and returns it as a trait object. /// - `name`: Index name. 
- pub fn get_index( - &self, - name: impl AsRef, - ) -> Option> { + /// + /// This method will return the index from the pool if it exists. + /// Otherwise, it will load the index from the file and store it + /// in the pool for future access. + pub fn get_index(&self, name: impl AsRef) -> Option { + let name = name.as_ref(); let IndexRef { algorithm, file } = self.get_index_ref(name)?; - algorithm.load_index(file).ok() + + let mut pool = self.pool.lock().ok()?; + if let Some(index) = pool.get(name).cloned() { + return Some(index); + } + + let index = algorithm.load_index(file).ok()?; + let index: Index = Arc::new(Mutex::new(index)); + pool.insert(name.into(), index.clone()); + Some(index) } - /// Retrieves an index from the file and if found, returns it as a - /// trait object. Otherwise, returns a not found error. + /// Retrieves an index and if found, returns it as a trait object. + /// Otherwise, returns a not found error. /// - `name`: Index name. - pub fn try_get_index( - &self, - name: impl AsRef, - ) -> Result, Error> { + pub fn try_get_index(&self, name: impl AsRef) -> Result { let name = name.as_ref(); self.get_index(name).ok_or_else(|| { let code = ErrorCode::NotFound; @@ -187,12 +198,15 @@ impl Database { // It's safe to unwrap here because we validated that index exists by // calling get_index_ref method above. - let mut index = self.get_index(name).unwrap(); + let index: Index = self.get_index(name).unwrap(); - let query = { + let (config, query) = { + let index = index.lock()?; let meta = index.metadata(); + let config = index.config(); + let checkpoint = meta.last_inserted.unwrap_or_default(); - index.config().to_query_after(&checkpoint) + (config.to_owned(), config.to_query_after(&checkpoint)) }; let mut conn = self.state()?.async_connect().await?; @@ -201,12 +215,13 @@ impl Database { let mut records = HashMap::new(); while let Some(row) = stream.next().await { let row = row?; - let (id, record) = index.config().to_record(&row)?; + let (id, record) = config.to_record(&row)?; records.insert(id, record); } + let mut index = index.lock()?; index.fit(records)?; - algorithm.persist_index(file, index)?; + algorithm.persist_index(file, index.as_ref())?; Ok(()) } @@ -225,7 +240,8 @@ impl Database { query: impl Into, k: usize, ) -> Result, Error> { - let index = self.try_get_index(name)?; + let index: Index = self.try_get_index(name)?; + let index = index.lock()?; index.search(query.into(), k) } @@ -241,7 +257,8 @@ impl Database { k: usize, filters: impl Into, ) -> Result, Error> { - let index = self.try_get_index(name)?; + let index: Index = self.try_get_index(name)?; + let index = index.lock()?; index.search_with_filters(query.into(), k, filters.into()) } @@ -252,12 +269,13 @@ impl Database { /// This method can be useful to rebalance the index. pub fn rebuild_index(&self, name: impl AsRef) -> Result<(), Error> { let name = name.as_ref(); - let mut index = self.try_get_index(name)?; + let index: Index = self.try_get_index(name)?; + let mut index = index.lock()?; index.refit()?; // Unwrap is safe here because we validated that the index exists above. let IndexRef { algorithm, file } = self.get_index_ref(name).unwrap(); - algorithm.persist_index(file, index)?; + algorithm.persist_index(file, index.as_ref())?; Ok(()) } @@ -277,10 +295,52 @@ impl Database { self.persist_state() } + /// Loads indices to the pool if they are not already loaded. + /// - `names`: Names of the indices. 
+ pub fn load_indices( + &self, + names: Vec>, + ) -> Result<(), Error> { + let state = self.state()?; + if names.iter().any(|name| !state.indices.contains_key(name.as_ref())) { + let code = ErrorCode::NotFound; + let message = "Some indices are not found in the database."; + return Err(Error::new(code, message)); + } + + for name in names { + self.get_index(name); + } + + Ok(()) + } + + /// Releases indices from the pool. + /// - `names`: Names of the indices. + /// + /// This method can free up memory by removing indices from the pool. + /// After the indices are released, when they need to be accessed again, + /// they will be loaded from the file. + /// + /// Loading indices from the file might take some time. Therefore, + /// it's recommended to keep the frequently used indices in the pool. + pub fn release_indices( + &self, + names: Vec>, + ) -> Result<(), Error> { + for name in names { + let name = name.as_ref(); + let mut pool = self.pool.lock()?; + pool.remove(name); + } + + Ok(()) + } + /// Returns the state object of the database. pub fn state(&self) -> Result { let state = self.state.lock()?; - Ok(state.clone()) + Ok(state.to_owned()) } /// Persists the state of the database to the state file. @@ -303,16 +363,6 @@ impl Database { fn indices_dir(&self) -> PathBuf { self.root.join("indices") } - - #[allow(dead_code)] - async fn async_execute_sql( - &self, - query: impl AsRef, - ) -> Result<(), Error> { - let mut conn = self.state()?.async_connect().await?; - conn.execute(query.as_ref()).await?; - Ok(()) - } } /// The state of the vector database. @@ -362,8 +412,8 @@ impl DatabaseState { } /// Validates the data source URL. - pub fn validate_source(url: impl Into) -> Result<(), Error> { - let url: String = url.into(); + pub fn validate_source(url: impl Into) -> Result<(), Error> { + let url = url.into(); let url = url.parse::().map_err(|_| { let code = ErrorCode::InvalidSource; let message = "Invalid database source URL."; @@ -407,6 +457,7 @@ mod tests { use super::*; use crate::prelude::RecordID; use sqlx::{Executor, Row}; + use std::sync::MutexGuard; const TABLE: &str = "embeddings"; const TEST_INDEX: &str = "test_index"; @@ -417,29 +468,33 @@ mod tests { } #[test] - fn test_database_create_index() { - let db = create_test_database().unwrap(); + fn test_database_create_index() -> Result<(), Error> { + let db = create_test_database()?; - let index = db.get_index(TEST_INDEX).unwrap(); + let index: Index = db.try_get_index(TEST_INDEX)?; + let index = index.lock()?; let metadata = index.metadata(); assert_eq!(metadata.count, 100); assert_eq!(metadata.last_inserted, Some(RecordID(100))); + Ok(()) } #[test] - fn test_database_refresh_index() { - let db = create_test_database().unwrap(); + fn test_database_refresh_index() -> Result<(), Error> { + let db = create_test_database()?; let query = generate_insert_query(100, 10); - executor::block_on(db.async_execute_sql(query)).unwrap(); + executor::block_on(db.async_execute_sql(query))?; db.refresh_index(TEST_INDEX).unwrap(); - let index = db.get_index(TEST_INDEX).unwrap(); + let index: Index = db.try_get_index(TEST_INDEX)?; + let index = index.lock()?; let metadata = index.metadata(); assert_eq!(metadata.count, 110); assert_eq!(metadata.last_inserted, Some(RecordID(110))); + Ok(()) } #[test] @@ -467,12 +522,14 @@ mod tests { } #[test] - fn test_database_rebuild_index() { - let db = create_test_database().unwrap(); - db.rebuild_index(TEST_INDEX).unwrap(); + fn test_database_rebuild_index() -> Result<(), Error> { + let db = 
create_test_database()?; + db.rebuild_index(TEST_INDEX)?; - let index = db.get_index(TEST_INDEX).unwrap(); + let index: Index = db.try_get_index(TEST_INDEX)?; + let index = index.lock()?; assert_eq!(index.metadata().count, 100); + Ok(()) } #[test] @@ -484,20 +541,39 @@ mod tests { assert!(!state.indices.contains_key(TEST_INDEX)); } + #[test] + fn test_database_indices_pool() -> Result<(), Error> { + let db = create_test_database()?; + + { + db.release_indices(vec![TEST_INDEX])?; + let pool = db.pool()?; + assert!(!pool.contains_key(TEST_INDEX)); + } + + { + db.load_indices(vec![TEST_INDEX])?; + let pool = db.pool()?; + assert!(pool.contains_key(TEST_INDEX)); + } + + Ok(()) + } + fn create_test_database() -> Result { let path = PathBuf::from("odb_data"); - if path.try_exists().is_ok() { + if path.try_exists()? { fs::remove_dir_all(&path)?; } let db_path = file::get_tmp_dir()?.join("sqlite.db"); let db_url = format!("sqlite://{}?mode=rwc", db_path.display()); - let mut db = Database::open(path, Some(db_url.clone()))?; + let mut db = Database::open(path, Some(db_url.to_owned()))?; let state = db.state()?; assert_eq!(state.source_type(), SourceType::SQLITE); - executor::block_on(setup_test_source(db_url))?; + executor::block_on(setup_test_source(&db_url))?; create_test_index(&mut db)?; Ok(db) } @@ -537,8 +613,10 @@ mod tests { ) } - async fn setup_test_source(url: impl Into) -> Result<(), Error> { - let url: String = url.into(); + async fn setup_test_source( + url: impl Into, + ) -> Result<(), Error> { + let url = url.into(); let mut conn = SourceConnection::connect(&url).await?; let create_table = format!( @@ -562,4 +640,19 @@ mod tests { assert_eq!(count, 100); Ok(()) } + + impl Database { + fn pool(&self) -> Result>, Error> { + Ok(self.pool.lock()?) + } + + async fn async_execute_sql( + &self, + query: impl AsRef, + ) -> Result<(), Error> { + let mut conn = self.state()?.async_connect().await?; + conn.execute(query.as_ref()).await?; + Ok(()) + } + } } diff --git a/src/indices/mod.rs b/src/indices/mod.rs index f498d9cc..01dfc9dc 100644 --- a/src/indices/mod.rs +++ b/src/indices/mod.rs @@ -43,7 +43,7 @@ impl From<&str> for SourceType { } /// Data source configuration for a vector index. -#[derive(Debug, Serialize, Deserialize, Default)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct SourceConfig { /// Name of the SQL table to use as data source. pub table: TableName, @@ -57,6 +57,18 @@ pub struct SourceConfig { pub filter: Option, } +impl Default for SourceConfig { + fn default() -> Self { + SourceConfig { + table: "table".into(), + primary_key: "id".into(), + vector: "vector".into(), + metadata: None, + filter: None, + } + } +} + impl SourceConfig { /// Creates a source configuration with mostly default values. /// - `primary_key`: Column name of the primary key in the data source. @@ -66,9 +78,9 @@ impl SourceConfig { /// - No metadata columns. /// - No query filter. 
pub fn new( - table: impl Into, - primary_key: impl Into, - vector: impl Into, + table: impl Into, + primary_key: impl Into, + vector: impl Into, ) -> Self { SourceConfig { table: table.into(), @@ -87,7 +99,10 @@ impl SourceConfig { /// - Integer /// - Float /// - Boolean - pub fn with_metadata(mut self, metadata: Vec>) -> Self { + pub fn with_metadata( + mut self, + metadata: Vec>, + ) -> Self { self.metadata = Some(metadata.into_iter().map(|s| s.into()).collect()); self } @@ -112,7 +127,7 @@ impl SourceConfig { columns.extend(metadata.iter()); } - columns.into_iter().map(|s| s.to_string()).collect() + columns.into_iter().map(|s| s.to_owned()).collect() } /// Generates a SQL query string based on the configuration. @@ -163,7 +178,7 @@ impl SourceConfig { let mut metadata = HashMap::new(); if let Some(metadata_columns) = &self.metadata { for column in metadata_columns { - let value = RowOps::from_row(column, row)?; + let value = RowOps::from_row(column.to_owned(), row)?; metadata.insert(column.to_owned(), value); } } @@ -250,7 +265,7 @@ pub trait IndexOps: Debug + Serialize + DeserializeOwned { /// // Other fields... /// } /// ``` -pub trait VectorIndex: Debug { +pub trait VectorIndex: Debug + Send + Sync { /// Returns the configuration of the index. fn config(&self) -> &SourceConfig; @@ -337,7 +352,7 @@ mod index_tests { let id = RecordID(i as u32); let vector = Vector::from(vec![i as f32; 128]); let data = HashMap::from([( - "number".to_string(), + "number".into(), Some(RecordData::Integer(1000 + i)), )]); diff --git a/src/indices/type_algorithm.rs b/src/indices/type_algorithm.rs index 31fa3d13..eac524cd 100644 --- a/src/indices/type_algorithm.rs +++ b/src/indices/type_algorithm.rs @@ -40,7 +40,7 @@ impl IndexAlgorithm { pub(crate) fn persist_index( &self, path: impl AsRef, - index: Box, + index: &dyn VectorIndex, ) -> Result<(), Error> { match self { IndexAlgorithm::BruteForce => { @@ -58,7 +58,7 @@ impl IndexAlgorithm { fn _persist_index( path: impl AsRef, - index: Box, + index: &dyn VectorIndex, ) -> Result<(), Error> { let index = index.as_any().downcast_ref::().ok_or_else(|| { let code = ErrorCode::InternalError; diff --git a/src/indices/type_filter.rs b/src/indices/type_filter.rs index c6a5a000..1e9302d5 100644 --- a/src/indices/type_filter.rs +++ b/src/indices/type_filter.rs @@ -124,7 +124,7 @@ impl From<&str> for Filter { let parts: Vec<&str> = value.splitn(3, ' ').collect(); let parts: Vec<&str> = parts.into_iter().map(|p| p.trim()).collect(); - let column = parts[0].to_string(); + let column = parts[0].into(); let operator = FilterOperator::from(parts[1]); let value = RecordData::from(parts[2]); Filter { column, value, operator } @@ -178,7 +178,7 @@ mod tests { let mut data = HashMap::new(); for (column, value) in columns.into_iter().zip(values.into_iter()) { - data.insert(column.to_string(), Some(value)); + data.insert(column.into(), Some(value)); } data @@ -188,7 +188,7 @@ mod tests { fn test_filters_from_string() { let filters = Filters::from("name CONTAINS Ada"); let expected = Filters::AND(vec![Filter { - column: "name".to_string(), + column: "name".into(), value: "Ada".into(), operator: FilterOperator::Contain, }]); @@ -198,13 +198,13 @@ mod tests { let filters = Filters::from("gpa >= 3.0 OR age < 21"); let expected = { let filter_gpa = Filter { - column: "gpa".to_string(), + column: "gpa".into(), value: RecordData::Float(3.0), operator: FilterOperator::GreaterThanOrEqual, }; let filter_age = Filter { - column: "age".to_string(), + column: "age".into(), value: 
RecordData::Integer(21), operator: FilterOperator::LessThan, }; diff --git a/src/types/record.rs b/src/types/record.rs index c59bc9a3..3d9f8ce9 100644 --- a/src/types/record.rs +++ b/src/types/record.rs @@ -10,7 +10,6 @@ use std::collections::HashMap; pub type ColumnName = String; /// ID type for records in the index from the data source. -#[allow(clippy::upper_case_acronyms)] #[derive(Debug, Clone, Copy, Serialize, Deserialize)] #[derive(Hash, PartialEq, Eq, PartialOrd, Ord, Default)] pub struct RecordID(pub u32); @@ -27,12 +26,12 @@ pub struct Record { #[derive(Debug, Clone, Serialize, Deserialize)] /// Vector data type stored in the index. -pub struct Vector(pub Vec); +pub struct Vector(pub Box<[f16]>); impl Vector { /// Returns the vector data as a vector of f32. pub fn to_vec(&self) -> Vec { - self.0.clone().into_iter().map(f16::to_f32).collect() + self.0.iter().map(|v| v.to_f32()).collect() } /// Returns the dimension of the vector. @@ -111,7 +110,7 @@ impl From for RecordData { pub(crate) trait RowOps { /// Retrieves data from the row based on the column name. fn from_row( - column_name: impl Into, + column_name: impl Into, row: &AnyRow, ) -> Result where @@ -120,10 +119,10 @@ pub(crate) trait RowOps { impl RowOps for RecordID { fn from_row( - column_name: impl Into, + column_name: impl Into, row: &AnyRow, ) -> Result { - let column_name: String = column_name.into(); + let column_name = column_name.into(); let id = row.try_get::(&column_name).map_err(|_| { let code = ErrorCode::InvalidID; let message = "Unable to get integer ID from the row."; @@ -136,10 +135,10 @@ impl RowOps for RecordID { impl RowOps for Vector { fn from_row( - column_name: impl Into, + column_name: impl Into, row: &AnyRow, ) -> Result { - let column: String = column_name.into(); + let column = column_name.into(); let value = row.try_get_raw::<&str>(&column)?; let value_type = value.type_info().kind(); @@ -171,10 +170,10 @@ impl RowOps for Vector { impl RowOps for Option { fn from_row( - column_name: impl Into, + column_name: impl Into, row: &AnyRow, ) -> Result { - let column: String = column_name.into(); + let column = column_name.into(); let value = row.try_get_raw::<&str>(&column)?; let value_type = value.type_info().kind(); From 26b31fae43bd3cf96639069abe0f1e5102102708 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Tue, 16 Jul 2024 19:00:48 -0500 Subject: [PATCH 52/88] refactor: improve file structure and organization --- src/db/mod.rs | 4 +- .../{ix_bruteforce.rs => idx_bruteforce.rs} | 0 src/indices/mod.rs | 90 ++++++++++++++++--- src/indices/type_algorithm.rs | 72 --------------- src/lib.rs | 2 + src/types/conn.rs | 2 - .../type_distance.rs => types/distance.rs} | 3 +- .../type_filter.rs => types/filter.rs} | 3 +- src/types/mod.rs | 9 +- src/{types => utils}/file.rs | 2 +- src/utils/mod.rs | 1 + 11 files changed, 94 insertions(+), 94 deletions(-) rename src/indices/{ix_bruteforce.rs => idx_bruteforce.rs} (100%) delete mode 100644 src/indices/type_algorithm.rs delete mode 100644 src/types/conn.rs rename src/{indices/type_distance.rs => types/distance.rs} (95%) rename src/{indices/type_filter.rs => types/filter.rs} (99%) rename src/{types => utils}/file.rs (97%) create mode 100644 src/utils/mod.rs diff --git a/src/db/mod.rs b/src/db/mod.rs index fc90ec40..26601801 100644 --- a/src/db/mod.rs +++ b/src/db/mod.rs @@ -1,9 +1,9 @@ use crate::indices::*; -use crate::types::conn::*; use crate::types::err::*; -use crate::types::file; use crate::types::record::Vector; +use crate::utils::file; use 
serde::{Deserialize, Serialize}; +use sqlx::{AnyConnection as SourceConnection, Connection}; use std::collections::HashMap; use std::fs; use std::path::PathBuf; diff --git a/src/indices/ix_bruteforce.rs b/src/indices/idx_bruteforce.rs similarity index 100% rename from src/indices/ix_bruteforce.rs rename to src/indices/idx_bruteforce.rs diff --git a/src/indices/mod.rs b/src/indices/mod.rs index 01dfc9dc..38e81df5 100644 --- a/src/indices/mod.rs +++ b/src/indices/mod.rs @@ -1,6 +1,6 @@ use crate::types::err::*; -use crate::types::file; use crate::types::record::*; +use crate::utils::file; use rayon::prelude::*; use serde::de::DeserializeOwned; use serde::{Deserialize, Serialize}; @@ -10,15 +10,12 @@ use std::collections::HashMap; use std::fmt::Debug; use std::path::Path; -mod ix_bruteforce; -mod type_algorithm; -mod type_distance; -mod type_filter; +mod idx_bruteforce; -pub use ix_bruteforce::IndexBruteForce; -pub use type_algorithm::IndexAlgorithm; -pub use type_distance::DistanceMetric; -pub use type_filter::*; +pub use idx_bruteforce::IndexBruteForce; + +pub use crate::types::distance::DistanceMetric; +pub use crate::types::filter::*; type TableName = String; @@ -188,6 +185,77 @@ impl SourceConfig { } } +/// Algorithm options used to index and search vectors. +#[allow(missing_docs)] +#[derive(Debug, PartialEq, Eq, Clone)] +#[derive(Serialize, Deserialize)] +pub enum IndexAlgorithm { + BruteForce, // -> IndexBruteForce +} + +impl IndexAlgorithm { + /// Initializes a new index based on the algorithm and configuration. + pub(crate) fn initialize( + &self, + config: SourceConfig, + metric: DistanceMetric, + ) -> Box { + let index = match self { + IndexAlgorithm::BruteForce => IndexBruteForce::new(config, metric), + }; + + Box::new(index) + } + + pub(crate) fn load_index( + &self, + path: impl AsRef, + ) -> Result, Error> { + match self { + IndexAlgorithm::BruteForce => { + let index = Self::_load_index::(path)?; + Ok(Box::new(index)) + } + } + } + + /// Persists the index to a file based on the algorithm. + /// - `path`: Path to the file where the index will be stored. + /// - `index`: Index to persist as a trait object. + pub(crate) fn persist_index( + &self, + path: impl AsRef, + index: &dyn VectorIndex, + ) -> Result<(), Error> { + match self { + IndexAlgorithm::BruteForce => { + Self::_persist_index::(path, index) + } + } + } + + fn _load_index( + path: impl AsRef, + ) -> Result { + let index = T::load(path)?; + Ok(index) + } + + fn _persist_index( + path: impl AsRef, + index: &dyn VectorIndex, + ) -> Result<(), Error> { + let index = index.as_any().downcast_ref::().ok_or_else(|| { + let code = ErrorCode::InternalError; + let message = "Failed to downcast index to concrete type."; + Error::new(code, message) + })?; + + index.persist(path)?; + Ok(()) + } +} + /// Metadata about the index for operations and optimizations. #[derive(Debug, Serialize, Deserialize, Default)] pub struct IndexMetadata { @@ -256,8 +324,10 @@ pub trait IndexOps: Debug + Serialize + DeserializeOwned { /// of this trait is required. 
Roughly, the index struct should look like: /// /// ```text +/// use super::*; +/// /// #[derive(Debug, Serialize, Deserialize)] -/// struct Index{{ Algorithm }} { +/// pub struct Index{{ Algorithm }} { /// config: SourceConfig, /// metric: DistanceMetric, /// metadata: IndexMetadata, diff --git a/src/indices/type_algorithm.rs b/src/indices/type_algorithm.rs deleted file mode 100644 index eac524cd..00000000 --- a/src/indices/type_algorithm.rs +++ /dev/null @@ -1,72 +0,0 @@ -use super::*; - -/// Algorithm options used to index and search vectors. -#[allow(missing_docs)] -#[derive(Debug, PartialEq, Eq, Clone)] -#[derive(Serialize, Deserialize)] -pub enum IndexAlgorithm { - BruteForce, // -> IndexBruteForce -} - -impl IndexAlgorithm { - /// Initializes a new index based on the algorithm and configuration. - pub(crate) fn initialize( - &self, - config: SourceConfig, - metric: DistanceMetric, - ) -> Box { - let index = match self { - IndexAlgorithm::BruteForce => IndexBruteForce::new(config, metric), - }; - - Box::new(index) - } - - pub(crate) fn load_index( - &self, - path: impl AsRef, - ) -> Result, Error> { - match self { - IndexAlgorithm::BruteForce => { - let index = Self::_load_index::(path)?; - Ok(Box::new(index)) - } - } - } - - /// Persists the index to a file based on the algorithm. - /// - `path`: Path to the file where the index will be stored. - /// - `index`: Index to persist as a trait object. - pub(crate) fn persist_index( - &self, - path: impl AsRef, - index: &dyn VectorIndex, - ) -> Result<(), Error> { - match self { - IndexAlgorithm::BruteForce => { - Self::_persist_index::(path, index) - } - } - } - - fn _load_index( - path: impl AsRef, - ) -> Result { - let index = T::load(path)?; - Ok(index) - } - - fn _persist_index( - path: impl AsRef, - index: &dyn VectorIndex, - ) -> Result<(), Error> { - let index = index.as_any().downcast_ref::().ok_or_else(|| { - let code = ErrorCode::InternalError; - let message = "Failed to downcast index to concrete type."; - Error::new(code, message) - })?; - - index.persist(path)?; - Ok(()) - } -} diff --git a/src/lib.rs b/src/lib.rs index 76cc96a1..1ba54a13 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,6 +4,8 @@ #![doc(html_favicon_url = "https://i.postimg.cc/W3T230zk/favicon.png")] #![doc(html_logo_url = "https://i.postimg.cc/Vv0HPVwB/logo.png")] +pub(crate) mod utils; + /// Primary module for vector database operations. pub mod db; /// Module for managing database indices and related types. diff --git a/src/types/conn.rs b/src/types/conn.rs deleted file mode 100644 index efdd72c8..00000000 --- a/src/types/conn.rs +++ /dev/null @@ -1,2 +0,0 @@ -pub use sqlx::AnyConnection as SourceConnection; -pub use sqlx::Connection; diff --git a/src/indices/type_distance.rs b/src/types/distance.rs similarity index 95% rename from src/indices/type_distance.rs rename to src/types/distance.rs index 56df0787..1d32caa8 100644 --- a/src/indices/type_distance.rs +++ b/src/types/distance.rs @@ -1,4 +1,5 @@ -use super::*; +use crate::types::record::Vector; +use serde::{Deserialize, Serialize}; use simsimd::SpatialSimilarity; /// Distance metric used to compare vectors in the index. 
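After this reorganization, distance and filter types are imported from `types` rather than `indices`. A quick sketch against the new layout; it assumes `DistanceMetric::distance` and the `From<Vec<f32>>` conversion for `Vector` are public, as their use inside the index implementations suggests:

```rust
use oasysdb::types::distance::DistanceMetric;
use oasysdb::types::record::Vector;

fn main() {
    // Vectors are stored as f16 internally; the From<Vec<f32>>
    // conversion handles the narrowing.
    let a = Vector::from(vec![0.0_f32; 128]);
    let b = Vector::from(vec![1.0_f32; 128]);

    // Compare the two vectors under the Euclidean metric.
    let metric = DistanceMetric::Euclidean;
    println!("distance: {}", metric.distance(&a, &b));
}
```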
diff --git a/src/indices/type_filter.rs b/src/types/filter.rs similarity index 99% rename from src/indices/type_filter.rs rename to src/types/filter.rs index 1e9302d5..744dcf5a 100644 --- a/src/indices/type_filter.rs +++ b/src/types/filter.rs @@ -1,6 +1,7 @@ #![allow(missing_docs)] -use super::*; +use crate::types::record::*; +use std::collections::HashMap; /// Joined multiple filters operation with either AND or OR. /// diff --git a/src/types/mod.rs b/src/types/mod.rs index 421c86e9..fa0b6b83 100644 --- a/src/types/mod.rs +++ b/src/types/mod.rs @@ -1,9 +1,8 @@ +/// Vector indexing distance metrics. +pub mod distance; /// Custom error types of OasysDB. pub mod err; +/// Native index filtering implementation. +pub mod filter; /// Vector record types for indices. pub mod record; - -/// SQL database connection types. -pub(crate) mod conn; -/// File operation utilities. -pub(crate) mod file; diff --git a/src/types/file.rs b/src/utils/file.rs similarity index 97% rename from src/types/file.rs rename to src/utils/file.rs index 82016e67..b4a026e6 100644 --- a/src/types/file.rs +++ b/src/utils/file.rs @@ -1,4 +1,4 @@ -use super::err::{Error, ErrorCode}; +use crate::types::err::{Error, ErrorCode}; use serde::de::DeserializeOwned; use serde::Serialize; use std::env; diff --git a/src/utils/mod.rs b/src/utils/mod.rs new file mode 100644 index 00000000..2e172cd0 --- /dev/null +++ b/src/utils/mod.rs @@ -0,0 +1 @@ +pub mod file; From 599d74f7cad7e8c48edad6a8d81d65a28bee707d Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Tue, 16 Jul 2024 21:39:23 -0500 Subject: [PATCH 53/88] refactor: rename bruteforce index to flat index --- src/db/database.rs | 4 ++-- src/indices/{idx_bruteforce.rs => idx_flat.rs} | 16 ++++++++-------- src/indices/mod.rs | 16 ++++++++-------- 3 files changed, 18 insertions(+), 18 deletions(-) rename src/indices/{idx_bruteforce.rs => idx_flat.rs} (90%) diff --git a/src/db/database.rs b/src/db/database.rs index 1aa25195..9355ad2e 100644 --- a/src/db/database.rs +++ b/src/db/database.rs @@ -584,13 +584,13 @@ mod tests { db.create_index( TEST_INDEX, - IndexAlgorithm::BruteForce, + IndexAlgorithm::Flat, DistanceMetric::Euclidean, config, )?; let index_ref = db.get_index_ref(TEST_INDEX).unwrap(); - assert_eq!(index_ref.algorithm(), &IndexAlgorithm::BruteForce); + assert_eq!(index_ref.algorithm(), &IndexAlgorithm::Flat); Ok(()) } diff --git a/src/indices/idx_bruteforce.rs b/src/indices/idx_flat.rs similarity index 90% rename from src/indices/idx_bruteforce.rs rename to src/indices/idx_flat.rs index c417272e..4a46f9e4 100644 --- a/src/indices/idx_bruteforce.rs +++ b/src/indices/idx_flat.rs @@ -1,20 +1,20 @@ use super::*; use std::collections::BinaryHeap; -/// Brute force index implementation. +/// Flat index implementation. /// /// This index stores all records in memory and performs a linear search /// for the nearest neighbors. It is great for small datasets of less than /// 10,000 records due to perfect recall and precision. 
#[derive(Debug, Serialize, Deserialize)] -pub struct IndexBruteForce { +pub struct IndexFlat { config: SourceConfig, metric: DistanceMetric, metadata: IndexMetadata, data: HashMap, } -impl IndexOps for IndexBruteForce { +impl IndexOps for IndexFlat { fn new(config: SourceConfig, metric: DistanceMetric) -> Self { Self { config, @@ -25,7 +25,7 @@ impl IndexOps for IndexBruteForce { } } -impl VectorIndex for IndexBruteForce { +impl VectorIndex for IndexFlat { fn config(&self) -> &SourceConfig { &self.config } @@ -50,8 +50,8 @@ impl VectorIndex for IndexBruteForce { Ok(()) } - /// Refitting doesn't do anything for the brute force index - /// as incremental insertion or deletion will directly update + /// Refitting doesn't do anything for the flat index as + /// incremental insertion or deletion will directly update /// the data store accordingly guaranteeing the index optimal state. fn refit(&mut self) -> Result<(), Error> { Ok(()) @@ -124,10 +124,10 @@ mod tests { use super::*; #[test] - fn test_bruteforce_index() { + fn test_flat_index() { let config = SourceConfig::default(); let metric = DistanceMetric::Euclidean; - let mut index = IndexBruteForce::new(config, metric); + let mut index = IndexFlat::new(config, metric); index_tests::populate_index(&mut index); index_tests::test_search(&index); index_tests::test_search_with_filters(&index); diff --git a/src/indices/mod.rs b/src/indices/mod.rs index 38e81df5..022b3828 100644 --- a/src/indices/mod.rs +++ b/src/indices/mod.rs @@ -10,9 +10,9 @@ use std::collections::HashMap; use std::fmt::Debug; use std::path::Path; -mod idx_bruteforce; +mod idx_flat; -pub use idx_bruteforce::IndexBruteForce; +pub use idx_flat::IndexFlat; pub use crate::types::distance::DistanceMetric; pub use crate::types::filter::*; @@ -190,7 +190,7 @@ impl SourceConfig { #[derive(Debug, PartialEq, Eq, Clone)] #[derive(Serialize, Deserialize)] pub enum IndexAlgorithm { - BruteForce, // -> IndexBruteForce + Flat, // -> IndexFlat } impl IndexAlgorithm { @@ -201,7 +201,7 @@ impl IndexAlgorithm { metric: DistanceMetric, ) -> Box { let index = match self { - IndexAlgorithm::BruteForce => IndexBruteForce::new(config, metric), + IndexAlgorithm::Flat => IndexFlat::new(config, metric), }; Box::new(index) @@ -212,8 +212,8 @@ impl IndexAlgorithm { path: impl AsRef, ) -> Result, Error> { match self { - IndexAlgorithm::BruteForce => { - let index = Self::_load_index::(path)?; + IndexAlgorithm::Flat => { + let index = Self::_load_index::(path)?; Ok(Box::new(index)) } } @@ -228,8 +228,8 @@ impl IndexAlgorithm { index: &dyn VectorIndex, ) -> Result<(), Error> { match self { - IndexAlgorithm::BruteForce => { - Self::_persist_index::(path, index) + IndexAlgorithm::Flat => { + Self::_persist_index::(path, index) } } } From 8144663beeadc53f157365f7b73e1b6cd1e69f43 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Thu, 18 Jul 2024 12:48:49 -0500 Subject: [PATCH 54/88] chore: add crate random --- Cargo.lock | 1 + Cargo.toml | 1 + 2 files changed, 2 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index c43c6fb6..e983c17b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -641,6 +641,7 @@ dependencies = [ "bincode", "futures", "half", + "rand", "rayon", "serde", "serde_json", diff --git a/Cargo.toml b/Cargo.toml index b53873f9..1d447df8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,6 +20,7 @@ uuid = { version = "1.9.1", features = ["v4", "fast-rng", "serde"] } half = { version = "2.4.1", features = ["serde"] } url = "2.5.2" futures = "0.3.30" +rand = "0.8.5" # Parallelism. 
rayon = "1.10.0" From c26483de9182285408ac8a9aa6ef728ccc03c650 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Thu, 18 Jul 2024 13:03:30 -0500 Subject: [PATCH 55/88] feat: add kmeans and params refactor --- src/db/database.rs | 19 ++---- src/db/mod.rs | 3 +- src/indices/idx_flat.rs | 68 ++++++++++++++----- src/indices/mod.rs | 104 ++++++++++++++++------------- src/prelude/mod.rs | 4 +- src/types/distance.rs | 16 ++++- src/types/err.rs | 11 ++++ src/types/filter.rs | 30 ++++----- src/types/record.rs | 66 +++++++++++++------ src/utils/kmeans.rs | 142 ++++++++++++++++++++++++++++++++++++++++ src/utils/mod.rs | 1 + 11 files changed, 345 insertions(+), 119 deletions(-) create mode 100644 src/utils/kmeans.rs diff --git a/src/db/database.rs b/src/db/database.rs index 9355ad2e..3320bbe8 100644 --- a/src/db/database.rs +++ b/src/db/database.rs @@ -73,13 +73,11 @@ impl Database { /// Creates a new index in the database asynchronously. /// - `name`: Name of the index. /// - `algorithm`: Indexing algorithm to use. - /// - `metric`: Distance metric for the index. /// - `config`: Index data source configuration. pub async fn async_create_index( &self, name: impl Into, algorithm: IndexAlgorithm, - metric: DistanceMetric, config: SourceConfig, ) -> Result<(), Error> { // Create a new file where the index will be stored. @@ -99,7 +97,7 @@ impl Database { records.insert(id, record); } - let mut index = algorithm.initialize(config, metric); + let mut index = algorithm.initialize(config)?; index.fit(records)?; // Persist the index to the file. @@ -125,12 +123,9 @@ impl Database { &self, name: impl Into, algorithm: IndexAlgorithm, - metric: DistanceMetric, config: SourceConfig, ) -> Result<(), Error> { - executor::block_on( - self.async_create_index(name, algorithm, metric, config), - ) + executor::block_on(self.async_create_index(name, algorithm, config)) } /// Returns an index reference by name. 
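With this patch the distance metric moves inside the algorithm's parameter struct, so everything is configured through `IndexAlgorithm`. A minimal sketch of the new call shape (the table and index names are illustrative, not from the patch; the signatures follow the code above and the test below):

```rust
use oasysdb::prelude::*;

fn create_flat_index(db: &Database) -> Result<(), Error> {
    // The metric now travels inside ParamsFlat instead of being
    // a separate argument to create_index.
    let params = ParamsFlat { metric: DistanceMetric::Euclidean };
    let config = SourceConfig::new("embeddings", "id", "vector");
    db.create_index("my_index", IndexAlgorithm::Flat(params), config)
}
```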
@@ -579,18 +574,14 @@ mod tests { } fn create_test_index(db: &mut Database) -> Result<(), Error> { + let algorithm = IndexAlgorithm::Flat(ParamsFlat::default()); let config = SourceConfig::new(TABLE, "id", "vector") .with_metadata(vec!["data"]); - db.create_index( - TEST_INDEX, - IndexAlgorithm::Flat, - DistanceMetric::Euclidean, - config, - )?; + db.create_index(TEST_INDEX, algorithm, config)?; let index_ref = db.get_index_ref(TEST_INDEX).unwrap(); - assert_eq!(index_ref.algorithm(), &IndexAlgorithm::Flat); + assert_eq!(index_ref.algorithm().name(), "FLAT"); Ok(()) } diff --git a/src/db/mod.rs b/src/db/mod.rs index 26601801..3e1223cf 100644 --- a/src/db/mod.rs +++ b/src/db/mod.rs @@ -1,5 +1,6 @@ use crate::indices::*; -use crate::types::err::*; +use crate::types::err::{Error, ErrorCode}; +use crate::types::filter::Filters; use crate::types::record::Vector; use crate::utils::file; use serde::{Deserialize, Serialize}; diff --git a/src/indices/idx_flat.rs b/src/indices/idx_flat.rs index 4a46f9e4..2a6dc3c8 100644 --- a/src/indices/idx_flat.rs +++ b/src/indices/idx_flat.rs @@ -9,19 +9,24 @@ use std::collections::BinaryHeap; #[derive(Debug, Serialize, Deserialize)] pub struct IndexFlat { config: SourceConfig, - metric: DistanceMetric, + params: ParamsFlat, metadata: IndexMetadata, data: HashMap, } impl IndexOps for IndexFlat { - fn new(config: SourceConfig, metric: DistanceMetric) -> Self { - Self { + fn new( + config: SourceConfig, + params: impl IndexParams, + ) -> Result { + let index = IndexFlat { config, - metric, + params: ParamsFlat::from_trait(params)?, metadata: IndexMetadata::default(), data: HashMap::new(), - } + }; + + Ok(index) } } @@ -31,7 +36,7 @@ impl VectorIndex for IndexFlat { } fn metric(&self) -> &DistanceMetric { - &self.metric + &self.params.metric } fn metadata(&self) -> &IndexMetadata { @@ -76,7 +81,7 @@ impl VectorIndex for IndexFlat { ) -> Result, Error> { let mut results = BinaryHeap::new(); for (id, record) in &self.data { - let distance = self.metric.distance(&record.vector, &query); + let distance = self.metric().distance(&record.vector, &query); let data = record.data.clone(); results.push(SearchResult { id: *id, distance, data }); @@ -100,14 +105,16 @@ impl VectorIndex for IndexFlat { let mut results = BinaryHeap::new(); for (id, record) in &self.data { - if filters.apply(&record.data) { - let distance = self.metric.distance(&record.vector, &query); - let data = record.data.clone(); - results.push(SearchResult { id: *id, distance, data }); - - if results.len() > k { - results.pop(); - } + if !filters.apply(&record.data) { + continue; + } + + let distance = self.metric().distance(&record.vector, &query); + let data = record.data.clone(); + + results.push(SearchResult { id: *id, distance, data }); + if results.len() > k { + results.pop(); } } @@ -119,6 +126,32 @@ impl VectorIndex for IndexFlat { } } +/// Flat index parameters. +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct ParamsFlat { + /// Formula used to calculate the distance between vectors. 
+ pub metric: DistanceMetric, +} + +impl IndexParams for ParamsFlat { + fn metric(&self) -> &DistanceMetric { + &self.metric + } + + fn from_trait(params: impl IndexParams) -> Result { + let params = params + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::invalid_params("flat"))?; + + Ok(params.to_owned()) + } + + fn as_any(&self) -> &dyn Any { + self + } +} + #[cfg(test)] mod tests { use super::*; @@ -126,8 +159,9 @@ mod tests { #[test] fn test_flat_index() { let config = SourceConfig::default(); - let metric = DistanceMetric::Euclidean; - let mut index = IndexFlat::new(config, metric); + let params = ParamsFlat::default(); + let mut index = IndexFlat::new(config, params).unwrap(); + index_tests::populate_index(&mut index); index_tests::test_search(&index); index_tests::test_search_with_filters(&index); diff --git a/src/indices/mod.rs b/src/indices/mod.rs index 022b3828..ca8e9cbd 100644 --- a/src/indices/mod.rs +++ b/src/indices/mod.rs @@ -1,4 +1,6 @@ -use crate::types::err::*; +use crate::types::distance::DistanceMetric; +use crate::types::err::{Error, ErrorCode}; +use crate::types::filter::*; use crate::types::record::*; use crate::utils::file; use rayon::prelude::*; @@ -12,12 +14,10 @@ use std::path::Path; mod idx_flat; -pub use idx_flat::IndexFlat; +pub use idx_flat::{IndexFlat, ParamsFlat}; -pub use crate::types::distance::DistanceMetric; -pub use crate::types::filter::*; - -type TableName = String; +/// Name of the SQL table to use as a data source. +pub type TableName = String; /// Type of SQL database used as a data source. #[allow(missing_docs)] @@ -187,32 +187,48 @@ impl SourceConfig { /// Algorithm options used to index and search vectors. #[allow(missing_docs)] -#[derive(Debug, PartialEq, Eq, Clone)] -#[derive(Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub enum IndexAlgorithm { - Flat, // -> IndexFlat + Flat(ParamsFlat), // -> IndexFlat +} + +impl IndexAlgorithm { + /// Returns the name of the algorithm in uppercase. + pub fn name(&self) -> &str { + match self { + Self::Flat(_) => "FLAT", + } + } +} + +impl PartialEq for IndexAlgorithm { + fn eq(&self, other: &Self) -> bool { + self.name() == other.name() + } } impl IndexAlgorithm { /// Initializes a new index based on the algorithm and configuration. + /// - `config`: Source configuration for the index. pub(crate) fn initialize( &self, config: SourceConfig, - metric: DistanceMetric, - ) -> Box { - let index = match self { - IndexAlgorithm::Flat => IndexFlat::new(config, metric), + ) -> Result, Error> { + let index = match self.to_owned() { + Self::Flat(params) => IndexFlat::new(config, params)?, }; - Box::new(index) + Ok(Box::new(index)) } pub(crate) fn load_index( &self, path: impl AsRef, ) -> Result, Error> { + // We can safely ignore the parameter inside of the algorithm here + // since the parameter is stored directly inside of the index. match self { - IndexAlgorithm::Flat => { + Self::Flat(_) => { let index = Self::_load_index::(path)?; Ok(Box::new(index)) } @@ -228,9 +244,7 @@ impl IndexAlgorithm { index: &dyn VectorIndex, ) -> Result<(), Error> { match self { - IndexAlgorithm::Flat => { - Self::_persist_index::(path, index) - } + Self::Flat(_) => Self::_persist_index::(path, index), } } @@ -273,7 +287,7 @@ pub struct SearchResult { /// ID of the record in the data source. pub id: RecordID, /// Record metadata. - pub data: HashMap>, + pub data: HashMap>, /// Distance between the query and the record. 
pub distance: f32, } @@ -298,14 +312,15 @@ impl Ord for SearchResult { } } -/// Trait for initializing a new index implementation. -/// -/// This will be used by the IndexAlgorithm enum to initialize a new index -/// based on the algorithm and configuration. In addition to this trait, -/// the index struct should implement the VectorIndex trait. +/// Trait for a new index implementation. pub trait IndexOps: Debug + Serialize + DeserializeOwned { /// Initializes an empty index with the given configuration. - fn new(config: SourceConfig, metric: DistanceMetric) -> Self; + /// - `config`: Source configuration for the index. + /// - `params`: Index specific parameters. + fn new( + config: SourceConfig, + params: impl IndexParams, + ) -> Result; /// Reads and deserializes the index from a file. fn load(path: impl AsRef) -> Result { @@ -318,23 +333,7 @@ pub trait IndexOps: Debug + Serialize + DeserializeOwned { } } -/// Trait for vector index implementations. -/// -/// For each index algorithm, a separate struct and implementation -/// of this trait is required. Roughly, the index struct should look like: -/// -/// ```text -/// use super::*; -/// -/// #[derive(Debug, Serialize, Deserialize)] -/// pub struct Index{{ Algorithm }} { -/// config: SourceConfig, -/// metric: DistanceMetric, -/// metadata: IndexMetadata, -/// data: HashMap, -/// // Other fields... -/// } -/// ``` +/// Trait for operating vector index implementations. pub trait VectorIndex: Debug + Send + Sync { /// Returns the configuration of the index. fn config(&self) -> &SourceConfig; @@ -382,9 +381,22 @@ pub trait VectorIndex: Debug + Send + Sync { /// Hides certain records from the search result permanently. fn hide(&mut self, record_ids: Vec) -> Result<(), Error>; - /// Returns the index as Any type for dynamic casting. This method - /// allows the index to be downcast to a specific index struct to - /// be serialized and stored in a file. + /// Returns the index as Any type for dynamic casting. + /// + /// This method allows the index trait object to be downcast to a + /// specific index struct to be serialized and stored in a file. + fn as_any(&self) -> &dyn Any; +} + +/// Trait for custom index parameters. +pub trait IndexParams: Debug + Default + Clone { + /// Returns the distance metric set in the parameters. + fn metric(&self) -> &DistanceMetric; + + /// Converts a trait object to a concrete parameter type. + fn from_trait(params: impl IndexParams) -> Result; + + /// Returns the parameters as Any type for dynamic casting. fn as_any(&self) -> &dyn Any; } @@ -423,7 +435,7 @@ mod index_tests { let vector = Vector::from(vec![i as f32; 128]); let data = HashMap::from([( "number".into(), - Some(RecordData::Integer(1000 + i)), + Some(DataValue::Integer(1000 + i)), )]); let record = Record { vector, data }; diff --git a/src/prelude/mod.rs b/src/prelude/mod.rs index 3cb3f6a8..70c12385 100644 --- a/src/prelude/mod.rs +++ b/src/prelude/mod.rs @@ -1,4 +1,6 @@ pub use crate::db::Database; pub use crate::indices::*; -pub use crate::types::err::*; +pub use crate::types::distance::DistanceMetric; +pub use crate::types::err::{Error, ErrorCode}; +pub use crate::types::filter::*; pub use crate::types::record::*; diff --git a/src/types/distance.rs b/src/types/distance.rs index 1d32caa8..9c529ec0 100644 --- a/src/types/distance.rs +++ b/src/types/distance.rs @@ -3,7 +3,8 @@ use serde::{Deserialize, Serialize}; use simsimd::SpatialSimilarity; /// Distance metric used to compare vectors in the index. 
-#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Serialize, Deserialize, Clone, Copy, Hash)] pub enum DistanceMetric { /// Squared [Euclidean distance](https://www.geeksforgeeks.org/euclidean-distance) /// @@ -15,6 +16,12 @@ pub enum DistanceMetric { Cosine, } +impl Default for DistanceMetric { + fn default() -> Self { + DistanceMetric::Euclidean + } +} + impl DistanceMetric { /// Computes the distance between two vectors. pub fn distance(&self, a: &Vector, b: &Vector) -> f32 { @@ -26,6 +33,13 @@ impl DistanceMetric { DistanceMetric::Cosine => f32::cosine(a, b), }; + if dist.is_none() + || dist.unwrap().is_nan() + || dist.unwrap().is_infinite() + { + return f32::MAX; + } + dist.unwrap() as f32 } } diff --git a/src/types/err.rs b/src/types/err.rs index aa408156..7232d165 100644 --- a/src/types/err.rs +++ b/src/types/err.rs @@ -19,6 +19,7 @@ pub enum ErrorCode { InvalidID, InvalidMetadata, InvalidVector, + InvalidParameter, // Other generic errors. InternalError, @@ -45,6 +46,16 @@ impl Error { pub fn new(code: ErrorCode, message: impl Into) -> Self { Self { code, message: message.into() } } + + /// Creates a new error instance when failing to downcast + /// the parameters of a specific indexing algorithm from a trait object. + /// - `algorithm`: The name of the index algorithm in lowercase. + pub(crate) fn invalid_params(algorithm: impl AsRef) -> Self { + let name = algorithm.as_ref().to_uppercase(); + let message = format!("Invalid parameters for {name} index."); + let code = ErrorCode::InvalidParameter; + Self::new(code, message) + } } impl Display for Error { diff --git a/src/types/filter.rs b/src/types/filter.rs index 744dcf5a..ac5ddcf0 100644 --- a/src/types/filter.rs +++ b/src/types/filter.rs @@ -15,10 +15,7 @@ pub enum Filters { } impl Filters { - pub fn apply( - &self, - data: &HashMap>, - ) -> bool { + pub fn apply(&self, data: &HashMap>) -> bool { match self { Filters::NONE => true, Filters::AND(filters) => filters.iter().all(|f| f.apply(data)), @@ -60,22 +57,19 @@ impl From<&str> for Filters { #[derive(Debug, PartialEq)] pub struct Filter { pub column: ColumnName, - pub value: RecordData, + pub value: DataValue, pub operator: FilterOperator, } impl Filter { - pub fn apply( - &self, - data: &HashMap>, - ) -> bool { + pub fn apply(&self, data: &HashMap>) -> bool { let value = match data.get(&self.column).unwrap_or(&None) { Some(value) => value, None => return false, }; // This alias helps us cut down lines of code. 
- type Type = RecordData; + type Type = DataValue; match (value, &self.value) { (Type::Boolean(a), Type::Boolean(b)) => self.match_boolean(*a, *b), (Type::Float(a), Type::Float(b)) => self.match_number(a, b), @@ -127,7 +121,7 @@ impl From<&str> for Filter { let column = parts[0].into(); let operator = FilterOperator::from(parts[1]); - let value = RecordData::from(parts[2]); + let value = DataValue::from(parts[2]); Filter { column, value, operator } } } @@ -168,13 +162,13 @@ impl From<&str> for FilterOperator { mod tests { use super::*; - fn create_test_data() -> HashMap> { + fn create_test_data() -> HashMap> { let columns = vec!["name", "age", "gpa", "active"]; - let values: Vec = vec![ + let values: Vec = vec![ "Alice".into(), - RecordData::Integer(20), - RecordData::Float(3.5), - RecordData::Boolean(true), + DataValue::Integer(20), + DataValue::Float(3.5), + DataValue::Boolean(true), ]; let mut data = HashMap::new(); @@ -200,13 +194,13 @@ mod tests { let expected = { let filter_gpa = Filter { column: "gpa".into(), - value: RecordData::Float(3.0), + value: DataValue::Float(3.0), operator: FilterOperator::GreaterThanOrEqual, }; let filter_age = Filter { column: "age".into(), - value: RecordData::Integer(21), + value: DataValue::Integer(21), operator: FilterOperator::LessThan, }; diff --git a/src/types/record.rs b/src/types/record.rs index 3d9f8ce9..d0631c25 100644 --- a/src/types/record.rs +++ b/src/types/record.rs @@ -21,11 +21,24 @@ pub struct Record { /// Vector embedding. pub vector: Vector, /// Additional metadata of the record. - pub data: HashMap>, + pub data: HashMap>, +} + +/// Record data type stored in PQ-based indices. +/// +/// This data type is very similar to the standard Record type +/// except that the vector stored within is quantized using the +/// Product Quantization (PQ) method. +#[derive(Debug, Serialize, Deserialize)] +pub struct RecordPQ { + /// Product quantized embedding. + pub vector: VectorPQ, + /// Additional metadata of the record. + pub data: HashMap>, } -#[derive(Debug, Clone, Serialize, Deserialize)] /// Vector data type stored in the index. +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct Vector(pub Box<[f16]>); impl Vector { @@ -34,7 +47,7 @@ impl Vector { self.0.iter().map(|v| v.to_f32()).collect() } - /// Returns the dimension of the vector. + /// Returns the length of the vector. pub fn len(&self) -> usize { self.0.len() } @@ -51,25 +64,36 @@ impl From> for Vector { } } +/// Product quantized vector data type stored in the index. +#[derive(Debug, Serialize, Deserialize)] +pub struct VectorPQ(pub Box<[u8]>); + +impl VectorPQ { + /// Returns the vector data as a vector of u8. + pub fn to_vec(&self) -> Vec { + self.0.iter().copied().collect() + } +} + /// Data types supported as metadata in the index. #[allow(missing_docs)] #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] -pub enum RecordData { +pub enum DataValue { Boolean(bool), Float(f32), Integer(isize), String(String), } -// RecordData interoperability with primitive types. +// DataValue interoperability with primitive types. -impl From for RecordData { +impl From for DataValue { fn from(value: String) -> Self { - RecordData::from(value.as_str()) + DataValue::from(value.as_str()) } } -impl From<&str> for RecordData { +impl From<&str> for DataValue { fn from(value: &str) -> Self { // Parsing integer must be done before float. // Since integer can be parsed as float but not vice versa. 
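The ordering called out above matters because Rust's float parser also accepts integer literals: "20" parses as both `isize` and `f32`, so trying floats first would erase the integer type. A standalone sketch of the same dispatch (a hypothetical free function mirroring the `From<&str>` impl, not code from the patch):

```rust
fn parse_metadata(value: &str) -> DataValue {
    if let Ok(int) = value.parse::<isize>() {
        return DataValue::Integer(int); // "20" stops here.
    }
    if let Ok(float) = value.parse::<f32>() {
        return DataValue::Float(float); // "3.5" stops here.
    }
    if let Ok(boolean) = value.parse::<bool>() {
        return DataValue::Boolean(boolean); // "true" stops here.
    }
    DataValue::String(value.to_string())
}
```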
@@ -85,25 +109,25 @@ impl From<&str> for RecordData { return boolean.into(); } - RecordData::String(value.to_string()) + DataValue::String(value.to_string()) } } -impl From for RecordData { +impl From for DataValue { fn from(value: f32) -> Self { - RecordData::Float(value) + DataValue::Float(value) } } -impl From for RecordData { +impl From for DataValue { fn from(value: isize) -> Self { - RecordData::Integer(value) + DataValue::Integer(value) } } -impl From for RecordData { +impl From for DataValue { fn from(value: bool) -> Self { - RecordData::Boolean(value) + DataValue::Boolean(value) } } @@ -168,7 +192,7 @@ impl RowOps for Vector { } } -impl RowOps for Option { +impl RowOps for Option { fn from_row( column_name: impl Into, row: &AnyRow, @@ -183,7 +207,7 @@ impl RowOps for Option { if value_type.is_integer() { let value: i64 = row.try_get::(&column)?; - return Ok(Some(RecordData::Integer(value as isize))); + return Ok(Some(DataValue::Integer(value as isize))); } // Handle types other than null and integer below. @@ -191,19 +215,19 @@ impl RowOps for Option { let data = match value_type { SQLType::Text => { let value = row.try_get::(&column)?; - RecordData::String(value.to_string()) + DataValue::String(value.to_string()) } SQLType::Bool => { let value: bool = row.try_get::(&column)?; - RecordData::Boolean(value) + DataValue::Boolean(value) } SQLType::Real => { let value: f32 = row.try_get::(&column)?; - RecordData::Float(value) + DataValue::Float(value) } SQLType::Double => { let value: f64 = row.try_get::(&column)?; - RecordData::Float(value as f32) + DataValue::Float(value as f32) } _ => { let code = ErrorCode::InvalidMetadata; diff --git a/src/utils/kmeans.rs b/src/utils/kmeans.rs new file mode 100644 index 00000000..1bbe9d99 --- /dev/null +++ b/src/utils/kmeans.rs @@ -0,0 +1,142 @@ +use crate::types::distance::DistanceMetric; +use crate::types::record::Vector; +use rand::Rng; +use rayon::prelude::*; + +#[derive(Debug, Clone, Copy, Default, Hash)] +#[derive(PartialEq, Eq, PartialOrd, Ord)] +pub struct ClusterID(pub u16); + +#[derive(Debug)] +pub struct KMeans { + num_centroids: usize, + num_iterations: u8, + metric: DistanceMetric, + assignment: Vec, // Cluster assignment for each vector. + centroids: Vec, // Centroids of each cluster. +} + +impl KMeans { + /// Creates a new KMeans model. + pub fn new( + num_centroids: usize, + num_iterations: u8, + metric: DistanceMetric, + ) -> Self { + Self { + num_centroids, + num_iterations, + metric, + assignment: vec![], + centroids: vec![], + } + } + + /// Fits the KMeans model to the given vectors. 
+ pub fn fit(&mut self, vectors: &[Vector]) { + self.centroids = self.initialize_centroids(vectors); + + for _ in 0..self.num_iterations { + self.assignment = self.assign_clusters(vectors); + self.centroids = self.update_centroids(vectors); + } + } + + fn initialize_centroids(&self, vectors: &[Vector]) -> Vec { + let mut rng = rand::thread_rng(); + let mut centroids = Vec::with_capacity(self.num_centroids); + for _ in 0..self.num_centroids { + let index = rng.gen_range(0..vectors.len()); + centroids.push(vectors[index].to_owned()); + } + + centroids + } + + fn assign_clusters(&self, vectors: &[Vector]) -> Vec { + let assign = |vector| self.find_closest_centroid(vector); + vectors.par_iter().map(assign).collect() + } + + fn update_centroids(&self, vectors: &[Vector]) -> Vec { + let k = self.num_centroids; + let mut counts = vec![0; k]; + + let mut sums = { + let dimension = vectors[0].len(); + let zeros = vec![0.0; dimension]; + vec![zeros; k] + }; + + for (i, vector) in vectors.iter().enumerate() { + let cluster_id = self.assignment[i].0 as usize; + counts[cluster_id] += 1; + + sums[cluster_id] + .par_iter_mut() + .zip(vector.to_vec().par_iter()) + .for_each(|(sum, v)| { + *sum += v; + }); + } + + for i in 0..self.num_centroids { + sums[i].par_iter_mut().for_each(|sum| { + *sum /= counts[i] as f32; + }); + } + + sums.into_iter().map(|v| v.into()).collect() + } + + /// Finds the closest centroid to a given vector. + /// - `vector`: Vector to compare with the centroids. + pub fn find_closest_centroid(&self, vector: &Vector) -> ClusterID { + self.centroids + .par_iter() + .map(|centroid| self.metric.distance(vector, centroid)) + .enumerate() + .min_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap()) + .map(|(i, _)| ClusterID(i as u16)) + .unwrap_or_default() + } + + /// Returns the cluster assignment for each vector. + pub fn assignments(&self) -> &[ClusterID] { + &self.assignment + } + + /// Returns the centroids of each cluster. 
+    pub fn centroids(&self) -> &[Vector] {
+        &self.centroids
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_kmeans_fit() {
+        let mut vectors = vec![];
+        for i in 0..100 {
+            let vector = Vector::from(vec![i as f32; 2]);
+            vectors.push(vector);
+        }
+
+        let mut kmeans = KMeans::new(5, 20, DistanceMetric::Euclidean);
+        kmeans.fit(&vectors);
+
+        let mut correct_count = 0;
+        for (i, cluster_id) in kmeans.assignments().iter().enumerate() {
+            let vector = &vectors[i];
+            let closest_centroid = kmeans.find_closest_centroid(vector);
+            if cluster_id == &closest_centroid {
+                correct_count += 1;
+            }
+        }
+
+        let accuracy = correct_count as f32 / vectors.len() as f32;
+        assert!(accuracy > 0.95);
+    }
+}
diff --git a/src/utils/mod.rs b/src/utils/mod.rs
index 2e172cd0..eae901e7 100644
--- a/src/utils/mod.rs
+++ b/src/utils/mod.rs
@@ -1 +1,2 @@
 pub mod file;
+pub mod kmeans;

From 32e3d4b3030bc73c2ad4ac535ccb0f1b920aeb50 Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Fri, 19 Jul 2024 18:28:55 -0500
Subject: [PATCH 56/88] feat: add ivfpq

---
 src/db/database.rs       |  34 +---
 src/indices/idx_flat.rs  |  48 +----
 src/indices/idx_ivfpq.rs | 385 +++++++++++++++++++++++++++++++++++++++
 src/indices/mod.rs       |  65 ++++---
 src/types/distance.rs    |   9 +-
 src/types/err.rs         |  10 -
 src/types/record.rs      |  20 +-
 src/utils/kmeans.rs      |  89 ++++++---
 8 files changed, 519 insertions(+), 141 deletions(-)
 create mode 100644 src/indices/idx_ivfpq.rs

diff --git a/src/db/database.rs b/src/db/database.rs
index 3320bbe8..41d766a2 100644
--- a/src/db/database.rs
+++ b/src/db/database.rs
@@ -225,27 +225,12 @@ impl Database {
         executor::block_on(self.async_refresh_index(name))
     }
 
-    /// Searches the index for the nearest vectors to the query vector.
-    /// - `name`: Index name.
-    /// - `query`: Query vector.
-    /// - `k`: Number of nearest neighbors to return.
-    pub fn search_index(
-        &self,
-        name: impl AsRef<str>,
-        query: impl Into<Vector>,
-        k: usize,
-    ) -> Result<Vec<SearchResult>, Error> {
-        let index: Index = self.try_get_index(name)?;
-        let index = index.lock()?;
-        index.search(query.into(), k)
-    }
-
-    /// Searches the index for nearest neighbors with post-search filters.
+    /// Searches the index for nearest neighbors.
     /// - `name`: Index name.
     /// - `query`: Query vector.
     /// - `k`: Number of nearest neighbors to return.
     /// - `filters`: SQL-like filters to apply.
-    pub fn search_index_with_filters(
+    pub fn search_index(
         &self,
         name: impl AsRef<str>,
         query: impl Into<Vector>,
         k: usize,
@@ -254,7 +239,7 @@ impl Database {
     ) -> Result<Vec<SearchResult>, Error> {
         let index: Index = self.try_get_index(name)?;
         let index = index.lock()?;
-        index.search_with_filters(query.into(), k, filters.into())
+        index.search(query.into(), k, filters.into())
     }
 
     /// Rebuilds the index from the existing records in the index.
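After the merge there is a single search entry point: `Filters::NONE` requests an unfiltered search, and anything implementing `Into<Filters>`, including a SQL-like string, narrows the results. A usage sketch mirroring the updated tests below (the index name is illustrative):

```rust
fn query_index(db: &Database) -> Result<(), Error> {
    // Unfiltered search: pass Filters::NONE explicitly.
    let nearest =
        db.search_index("my_index", vec![0.0; 128], 5, Filters::NONE)?;

    // Filtered search: a SQL-like string converts into Filters.
    let filtered =
        db.search_index("my_index", vec![0.0; 128], 5, "data >= 1050")?;

    assert!(nearest.len() <= 5 && filtered.len() <= 5);
    Ok(())
}
```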
@@ -493,10 +478,11 @@ mod tests { } #[test] - fn test_database_search_index() { + fn test_database_search_index_basic() { let db = create_test_database().unwrap(); - let query = vec![0.0; 128]; - let results = db.search_index(TEST_INDEX, query, 5).unwrap(); + let results = db + .search_index(TEST_INDEX, vec![0.0; 128], 5, Filters::NONE) + .unwrap(); assert_eq!(results.len(), 5); assert_eq!(results[0].id, RecordID(1)); @@ -504,12 +490,10 @@ mod tests { } #[test] - fn test_database_search_index_with_filters() { + fn test_database_search_index_advanced() { let db = create_test_database().unwrap(); - let query = vec![0.0; 128]; - let filters = Filters::from("data >= 1050"); let results = db - .search_index_with_filters(TEST_INDEX, query, 5, filters) + .search_index(TEST_INDEX, vec![0.0; 128], 5, "data >= 1050") .unwrap(); assert_eq!(results.len(), 5); diff --git a/src/indices/idx_flat.rs b/src/indices/idx_flat.rs index 2a6dc3c8..25439bb5 100644 --- a/src/indices/idx_flat.rs +++ b/src/indices/idx_flat.rs @@ -1,5 +1,4 @@ use super::*; -use std::collections::BinaryHeap; /// Flat index implementation. /// @@ -21,7 +20,7 @@ impl IndexOps for IndexFlat { ) -> Result { let index = IndexFlat { config, - params: ParamsFlat::from_trait(params)?, + params: downcast_params(params)?, metadata: IndexMetadata::default(), data: HashMap::new(), }; @@ -55,9 +54,9 @@ impl VectorIndex for IndexFlat { Ok(()) } - /// Refitting doesn't do anything for the flat index as - /// incremental insertion or deletion will directly update - /// the data store accordingly guaranteeing the index optimal state. + /// Refitting doesn't do anything for the flat index as incremental + /// insertion or deletion will directly update the data store + /// accordingly which guarantee the optimal state of the index. fn refit(&mut self) -> Result<(), Error> { Ok(()) } @@ -78,41 +77,19 @@ impl VectorIndex for IndexFlat { &self, query: Vector, k: usize, - ) -> Result, Error> { - let mut results = BinaryHeap::new(); - for (id, record) in &self.data { - let distance = self.metric().distance(&record.vector, &query); - let data = record.data.clone(); - results.push(SearchResult { id: *id, distance, data }); - - if results.len() > k { - results.pop(); - } - } - - Ok(results.into_sorted_vec()) - } - - fn search_with_filters( - &self, - query: Vector, - k: usize, filters: Filters, ) -> Result, Error> { - if filters == Filters::NONE { - return self.search(query, k); - } - let mut results = BinaryHeap::new(); for (id, record) in &self.data { + // Skip records that don't pass the filters. 
if !filters.apply(&record.data) { continue; } let distance = self.metric().distance(&record.vector, &query); let data = record.data.clone(); - results.push(SearchResult { id: *id, distance, data }); + if results.len() > k { results.pop(); } @@ -138,15 +115,6 @@ impl IndexParams for ParamsFlat { &self.metric } - fn from_trait(params: impl IndexParams) -> Result { - let params = params - .as_any() - .downcast_ref::() - .ok_or_else(|| Error::invalid_params("flat"))?; - - Ok(params.to_owned()) - } - fn as_any(&self) -> &dyn Any { self } @@ -163,7 +131,7 @@ mod tests { let mut index = IndexFlat::new(config, params).unwrap(); index_tests::populate_index(&mut index); - index_tests::test_search(&index); - index_tests::test_search_with_filters(&index); + index_tests::test_basic_search(&index); + index_tests::test_advanced_search(&index); } } diff --git a/src/indices/idx_ivfpq.rs b/src/indices/idx_ivfpq.rs new file mode 100644 index 00000000..5391df1a --- /dev/null +++ b/src/indices/idx_ivfpq.rs @@ -0,0 +1,385 @@ +use super::*; +use crate::utils::kmeans::{KMeans, Vectors}; +use std::rc::Rc; + +#[derive(Debug, Serialize, Deserialize)] +pub struct IndexIVFPQ { + config: SourceConfig, + params: ParamsIVFPQ, + metadata: IndexMetadata, + data: HashMap, + + // IVFPQ specific data structures. + centroids: Vec, + clusters: Vec>, + codebook: Vec>, +} + +impl IndexIVFPQ { + fn build( + &mut self, + records: HashMap, + ) -> Result<(), Error> { + if records.is_empty() { + return Ok(()); + } + + let vectors = records + .values() + .map(|record| &record.vector) + .collect::>(); + + let vectors: Vectors = Rc::from(vectors.as_slice()); + self.create_codebook(vectors.clone()); + + let (centroids, assignments) = { + let mut kmeans = KMeans::new( + self.params.centroids, + self.params.num_iterations, + self.metric().to_owned(), + ); + + kmeans.fit(vectors.clone()); + (kmeans.centroids().to_vec(), kmeans.assignments().to_vec()) + }; + + self.centroids = centroids; + self.clusters = { + let mut clusters = vec![vec![]; self.params.centroids]; + let ids = records.keys().collect::>(); + for (i, &cluster) in assignments.iter().enumerate() { + clusters[cluster.0 as usize].push(ids[i].to_owned()); + } + + clusters + }; + + self.metadata.count = records.len(); + self.metadata.last_inserted = records.keys().max().copied(); + + self.data = records + .into_iter() + .map(|(id, record)| { + let vector = self.quantize_vector(&record.vector); + let data = record.data; + (id, RecordPQ { vector, data }) + }) + .collect(); + + Ok(()) + } + + fn insert( + &mut self, + records: HashMap, + ) -> Result<(), Error> { + if records.is_empty() { + return Ok(()); + } + + let assignments = records + .values() + .map(|record| self.find_nearest_centroids(&record.vector, 1)[0]) + .collect::>(); + + let ids: Vec<&RecordID> = records.keys().collect(); + for (i, cluster_id) in assignments.iter().enumerate() { + self.clusters[*cluster_id].push(ids[i].to_owned()); + } + + self.metadata.count += records.len(); + self.metadata.last_inserted = records.keys().max().copied(); + + let records: HashMap = records + .into_par_iter() + .map(|(id, record)| { + let vector = self.quantize_vector(&record.vector); + let data = record.data; + (id, RecordPQ { vector, data }) + }) + .collect(); + + self.data.par_extend(records); + Ok(()) + } + + fn create_codebook(&mut self, vectors: Vectors) { + for i in 0..self.params.sub_dimension { + let mut subvectors = Vec::new(); + for vector in vectors.iter() { + let subvector = self.get_subvector(i.into(), vector); + 
subvectors.push(subvector); + } + + let centroids = { + let mut kmeans = KMeans::new( + self.params.sub_centroids as usize, + self.params.num_iterations, + self.params.metric, + ); + + let subvectors: Vec<&Vector> = subvectors.iter().collect(); + kmeans.fit(Rc::from(subvectors)); + kmeans.centroids().to_vec() + }; + + self.codebook[i as usize] = centroids + .par_iter() + .map(|centroid| centroid.clone().into()) + .collect(); + } + } + + fn find_nearest_centroids(&self, vector: &Vector, k: usize) -> Vec { + let mut distances: Vec<(usize, f32)> = self + .centroids + .iter() + .enumerate() + .map(|(i, center)| (i, self.metric().distance(center, vector))) + .collect(); + + distances.sort_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap()); + distances.into_iter().take(k).map(|(i, _)| i).collect() + } + + fn find_nearest_code( + &self, + part_index: usize, + subvector: &Vector, + ) -> usize { + self.codebook[part_index] + .iter() + .map(|centroid| self.metric().distance(centroid, subvector)) + .enumerate() + .min_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap()) + .unwrap_or_default() + .0 + } + + fn quantize_vector(&self, vector: &Vector) -> VectorPQ { + let sub_dimension = self.params.sub_dimension as usize; + let mut pq = Vec::with_capacity(sub_dimension); + + for i in 0..sub_dimension { + let subvector = self.get_subvector(i, vector); + let centroid_id = self.find_nearest_code(i, &subvector); + pq.push(centroid_id as u8); + } + + pq.into() + } + + fn dequantize_vector(&self, vector_pq: &VectorPQ) -> Vector { + let mut vector = vec![]; + for (i, centroid_id) in vector_pq.0.iter().enumerate() { + let centroid = &self.codebook[i][*centroid_id as usize]; + vector.extend(centroid.to_vec()); + } + + vector.into() + } + + fn get_subvector(&self, part_index: usize, vector: &Vector) -> Vector { + let dim = vector.len() / self.params.sub_dimension as usize; + let start = part_index as usize * dim; + let end = (part_index + 1) as usize * dim; + let subvector = vector.0[start..end].to_vec(); + Vector(subvector.into_boxed_slice()) + } +} + +impl IndexOps for IndexIVFPQ { + fn new( + config: SourceConfig, + params: impl IndexParams, + ) -> Result { + let params = downcast_params::(params)?; + let codebook = vec![vec![]; params.sub_dimension as usize]; + let clusters = vec![vec![]; params.centroids]; + + let index = IndexIVFPQ { + config, + params, + metadata: IndexMetadata::default(), + data: HashMap::new(), + + centroids: vec![], + clusters, + codebook, + }; + + Ok(index) + } +} + +impl VectorIndex for IndexIVFPQ { + fn config(&self) -> &SourceConfig { + &self.config + } + + fn metric(&self) -> &DistanceMetric { + &self.params.metric + } + + fn metadata(&self) -> &IndexMetadata { + &self.metadata + } + + fn fit(&mut self, records: HashMap) -> Result<(), Error> { + match self.metadata.count { + 0 => self.build(records), + _ => self.insert(records), + } + } + + fn refit(&mut self) -> Result<(), Error> { + self.data.retain(|id, _| !self.metadata.hidden.contains(id)); + + let records = self + .data + .par_iter() + .map(|(id, record)| { + let vector = self.dequantize_vector(&record.vector); + let data = record.data.clone(); + (*id, Record { vector, data }) + }) + .collect(); + + self.build(records) + } + + fn search( + &self, + query: Vector, + k: usize, + filters: Filters, + ) -> Result, Error> { + let nearest_centroids = { + let nprobes = self.params.num_probes as usize; + self.find_nearest_centroids(&query, nprobes) + }; + + let mut results = BinaryHeap::new(); + for centroid_id in nearest_centroids { + let 
cluster = &self.clusters[centroid_id]; + for &record_id in cluster { + let record = self.data.get(&record_id).unwrap(); + let data = record.data.clone(); + if !filters.apply(&data) { + continue; + } + + let vector = self.dequantize_vector(&record.vector); + let distance = self.metric().distance(&vector, &query); + results.push(SearchResult { id: record_id, distance, data }); + + if results.len() > k { + results.pop(); + } + } + } + + Ok(results.into_sorted_vec()) + } + + fn hide(&mut self, record_ids: Vec) -> Result<(), Error> { + self.metadata.hidden.extend(record_ids); + Ok(()) + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ParamsIVFPQ { + /// Number of centroids in the IVF. + pub centroids: usize, + /// Number of iterations to run the KMeans algorithm. + pub num_iterations: usize, + /// Number of centroids in the PQ sub-space. + pub sub_centroids: u8, + /// Dimension of the vector after PQ encoding. + pub sub_dimension: u8, + /// Number of clusters to explore during search. + pub num_probes: u8, + /// Metric used to compute the distance between vectors. + pub metric: DistanceMetric, +} + +impl Default for ParamsIVFPQ { + fn default() -> Self { + Self { + num_iterations: 100, + centroids: 256, + sub_centroids: 32, + sub_dimension: 16, + num_probes: 4, + metric: DistanceMetric::Euclidean, + } + } +} + +impl IndexParams for ParamsIVFPQ { + fn metric(&self) -> &DistanceMetric { + &self.metric + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_product_quantization() { + let data: Vec = vec![ + vec![1.0, 2.0, 3.0, 4.0].into(), + vec![5.0, 6.0, 7.0, 8.0].into(), + vec![9.0, 10.0, 11.0, 12.0].into(), + vec![13.0, 14.0, 15.0, 16.0].into(), + ]; + + let vectors: Vectors = { + let data = data.iter().collect::>(); + Rc::from(data.as_slice()) + }; + + let params = ParamsIVFPQ { + num_iterations: 10, + sub_centroids: 8, + sub_dimension: 2, + ..Default::default() + }; + + let mut index = create_test_index(params); + index.create_codebook(vectors); + + let encoded = index.quantize_vector(&data[0]); + let decoded = index.dequantize_vector(&encoded); + assert_eq!(decoded.to_vec(), data[0].to_vec()); + } + + #[test] + fn test_ivfpq_index() { + let params = ParamsIVFPQ { + centroids: 5, + num_iterations: 20, + ..Default::default() + }; + + let mut index = create_test_index(params); + index_tests::populate_index(&mut index); + index_tests::test_basic_search(&index); + index_tests::test_advanced_search(&index); + } + + fn create_test_index(params: ParamsIVFPQ) -> IndexIVFPQ { + let config = SourceConfig::default(); + IndexIVFPQ::new(config, params).unwrap() + } +} diff --git a/src/indices/mod.rs b/src/indices/mod.rs index ca8e9cbd..626859bc 100644 --- a/src/indices/mod.rs +++ b/src/indices/mod.rs @@ -8,13 +8,15 @@ use serde::de::DeserializeOwned; use serde::{Deserialize, Serialize}; use sqlx::any::AnyRow; use std::any::Any; -use std::collections::HashMap; +use std::collections::{BinaryHeap, HashMap}; use std::fmt::Debug; use std::path::Path; mod idx_flat; +mod idx_ivfpq; pub use idx_flat::{IndexFlat, ParamsFlat}; +pub use idx_ivfpq::{IndexIVFPQ, ParamsIVFPQ}; /// Name of the SQL table to use as a data source. 
pub type TableName = String; @@ -189,7 +191,8 @@ impl SourceConfig { #[allow(missing_docs)] #[derive(Debug, Clone, Serialize, Deserialize)] pub enum IndexAlgorithm { - Flat(ParamsFlat), // -> IndexFlat + Flat(ParamsFlat), // -> IndexFlat + IVFPQ(ParamsIVFPQ), // -> IndexIVFPQ } impl IndexAlgorithm { @@ -197,6 +200,7 @@ impl IndexAlgorithm { pub fn name(&self) -> &str { match self { Self::Flat(_) => "FLAT", + Self::IVFPQ(_) => "IVFPQ", } } } @@ -214,24 +218,31 @@ impl IndexAlgorithm { &self, config: SourceConfig, ) -> Result, Error> { - let index = match self.to_owned() { - Self::Flat(params) => IndexFlat::new(config, params)?, - }; - - Ok(Box::new(index)) + match self.to_owned() { + Self::Flat(params) => { + let index = IndexFlat::new(config, params)?; + Ok(Box::new(index)) + } + Self::IVFPQ(params) => { + let index = IndexIVFPQ::new(config, params)?; + Ok(Box::new(index)) + } + } } pub(crate) fn load_index( &self, path: impl AsRef, ) -> Result, Error> { - // We can safely ignore the parameter inside of the algorithm here - // since the parameter is stored directly inside of the index. match self { Self::Flat(_) => { let index = Self::_load_index::(path)?; Ok(Box::new(index)) } + Self::IVFPQ(_) => { + let index = Self::_load_index::(path)?; + Ok(Box::new(index)) + } } } @@ -245,6 +256,7 @@ impl IndexAlgorithm { ) -> Result<(), Error> { match self { Self::Flat(_) => Self::_persist_index::(path, index), + Self::IVFPQ(_) => Self::_persist_index::(path, index), } } @@ -358,20 +370,11 @@ pub trait VectorIndex: Debug + Send + Sync { /// the index after a certain threshold of incremental fitting. fn refit(&mut self) -> Result<(), Error>; - /// Searches for the nearest neighbors based on the query vector. - /// - `query`: Query vector. - /// - `k`: Number of nearest neighbors to return. - fn search( - &self, - query: Vector, - k: usize, - ) -> Result, Error>; - - /// Searches the nearest neighbors based on the query vector and filters. + /// Searches for the nearest neighbors of the query vector. /// - `query`: Query vector. /// - `k`: Number of nearest neighbors to return. /// - `filters`: Filters to apply to the search results. - fn search_with_filters( + fn search( &self, query: Vector, k: usize, @@ -393,13 +396,20 @@ pub trait IndexParams: Debug + Default + Clone { /// Returns the distance metric set in the parameters. fn metric(&self) -> &DistanceMetric; - /// Converts a trait object to a concrete parameter type. - fn from_trait(params: impl IndexParams) -> Result; - /// Returns the parameters as Any type for dynamic casting. 
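    // (This Any hook is what lets downcast_params, defined below, recover
    // a concrete params struct from a generic `impl IndexParams` argument;
    // without it the value could not be downcast at runtime.)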
fn as_any(&self) -> &dyn Any; } +pub(crate) fn downcast_params( + params: impl IndexParams, +) -> Result { + params.as_any().downcast_ref::().cloned().ok_or_else(|| { + let code = ErrorCode::InternalError; + let message = "Failed to downcast index parameters to concrete type."; + Error::new(code, message) + }) +} + #[cfg(test)] mod tests { use super::*; @@ -445,22 +455,21 @@ mod index_tests { index.fit(records).unwrap(); } - pub fn test_search(index: &impl VectorIndex) { + pub fn test_basic_search(index: &impl VectorIndex) { let query = Vector::from(vec![0.0; 128]); let k = 10; - let results = index.search(query, k).unwrap(); + let results = index.search(query, k, Filters::NONE).unwrap(); assert_eq!(results.len(), k); assert_eq!(results[0].id, RecordID(0)); - assert_eq!(results[0].distance, 0.0); assert_eq!(results[9].id, RecordID(9)); } - pub fn test_search_with_filters(index: &impl VectorIndex) { + pub fn test_advanced_search(index: &impl VectorIndex) { let query = Vector::from(vec![0.0; 128]); let k = 10; let filters = Filters::from("number > 1010"); - let results = index.search_with_filters(query, k, filters).unwrap(); + let results = index.search(query, k, filters).unwrap(); assert_eq!(results.len(), k); assert_eq!(results[0].id, RecordID(11)); diff --git a/src/types/distance.rs b/src/types/distance.rs index 9c529ec0..fc9e3dec 100644 --- a/src/types/distance.rs +++ b/src/types/distance.rs @@ -3,25 +3,20 @@ use serde::{Deserialize, Serialize}; use simsimd::SpatialSimilarity; /// Distance metric used to compare vectors in the index. -#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Debug, Default, PartialEq, Eq, PartialOrd, Ord)] #[derive(Serialize, Deserialize, Clone, Copy, Hash)] pub enum DistanceMetric { /// Squared [Euclidean distance](https://www.geeksforgeeks.org/euclidean-distance) /// /// The squared Euclidean distance is used to avoid the square /// root operation thus making the computation faster. + #[default] Euclidean, /// Cosine distance (1 - cosine similarity): /// [Cosine similarity](https://www.geeksforgeeks.org/cosine-similarity/) Cosine, } -impl Default for DistanceMetric { - fn default() -> Self { - DistanceMetric::Euclidean - } -} - impl DistanceMetric { /// Computes the distance between two vectors. pub fn distance(&self, a: &Vector, b: &Vector) -> f32 { diff --git a/src/types/err.rs b/src/types/err.rs index 7232d165..f27f70e5 100644 --- a/src/types/err.rs +++ b/src/types/err.rs @@ -46,16 +46,6 @@ impl Error { pub fn new(code: ErrorCode, message: impl Into) -> Self { Self { code, message: message.into() } } - - /// Creates a new error instance when failing to downcast - /// the parameters of a specific indexing algorithm from a trait object. - /// - `algorithm`: The name of the index algorithm in lowercase. - pub(crate) fn invalid_params(algorithm: impl AsRef) -> Self { - let name = algorithm.as_ref().to_uppercase(); - let message = format!("Invalid parameters for {name} index."); - let code = ErrorCode::InvalidParameter; - Self::new(code, message) - } } impl Display for Error { diff --git a/src/types/record.rs b/src/types/record.rs index d0631c25..7108af37 100644 --- a/src/types/record.rs +++ b/src/types/record.rs @@ -39,6 +39,7 @@ pub struct RecordPQ { /// Vector data type stored in the index. #[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(PartialEq, PartialOrd)] pub struct Vector(pub Box<[f16]>); impl Vector { @@ -71,7 +72,24 @@ pub struct VectorPQ(pub Box<[u8]>); impl VectorPQ { /// Returns the vector data as a vector of u8. 
pub fn to_vec(&self) -> Vec { - self.0.iter().copied().collect() + self.0.to_vec() + } +} + +impl From> for VectorPQ { + fn from(value: Vec) -> Self { + VectorPQ(value.into_boxed_slice()) + } +} + +impl From for VectorPQ { + fn from(value: Vector) -> Self { + value + .to_vec() + .iter() + .map(|v| (v * 255.0).round() as u8) + .collect::>() + .into() } } diff --git a/src/utils/kmeans.rs b/src/utils/kmeans.rs index 1bbe9d99..590901eb 100644 --- a/src/utils/kmeans.rs +++ b/src/utils/kmeans.rs @@ -1,7 +1,10 @@ use crate::types::distance::DistanceMetric; use crate::types::record::Vector; -use rand::Rng; +use rand::seq::SliceRandom; use rayon::prelude::*; +use std::rc::Rc; + +pub type Vectors<'v> = Rc<[&'v Vector]>; #[derive(Debug, Clone, Copy, Default, Hash)] #[derive(PartialEq, Eq, PartialOrd, Ord)] @@ -10,7 +13,7 @@ pub struct ClusterID(pub u16); #[derive(Debug)] pub struct KMeans { num_centroids: usize, - num_iterations: u8, + num_iterations: usize, metric: DistanceMetric, assignment: Vec, // Cluster assignment for each vector. centroids: Vec, // Centroids of each cluster. @@ -20,7 +23,7 @@ impl KMeans { /// Creates a new KMeans model. pub fn new( num_centroids: usize, - num_iterations: u8, + num_iterations: usize, metric: DistanceMetric, ) -> Self { Self { @@ -33,36 +36,51 @@ impl KMeans { } /// Fits the KMeans model to the given vectors. - pub fn fit(&mut self, vectors: &[Vector]) { - self.centroids = self.initialize_centroids(vectors); + pub fn fit(&mut self, vectors: Vectors) { + // Cloning the vectors is acceptable because with Rc, we are + // only cloning the references, not the actual data. + self.centroids = self.initialize_centroids(vectors.clone()); + let mut repeat_centroids = 0; for _ in 0..self.num_iterations { - self.assignment = self.assign_clusters(vectors); - self.centroids = self.update_centroids(vectors); + if repeat_centroids > 5 { + break; + } + + self.assignment = self.assign_clusters(vectors.clone()); + let centroids = self.update_centroids(vectors.clone()); + + match self.centroids == centroids { + true => repeat_centroids += 1, + false => { + self.centroids = centroids; + repeat_centroids = 0; + } + } } } - fn initialize_centroids(&self, vectors: &[Vector]) -> Vec { + fn initialize_centroids(&self, vectors: Vectors) -> Vec { let mut rng = rand::thread_rng(); - let mut centroids = Vec::with_capacity(self.num_centroids); - for _ in 0..self.num_centroids { - let index = rng.gen_range(0..vectors.len()); - centroids.push(vectors[index].to_owned()); - } - - centroids + vectors + .choose_multiple(&mut rng, self.num_centroids) + .cloned() + .map(|vector| vector.to_owned()) + .collect() } - fn assign_clusters(&self, vectors: &[Vector]) -> Vec { - let assign = |vector| self.find_closest_centroid(vector); - vectors.par_iter().map(assign).collect() + fn assign_clusters(&self, vectors: Vectors) -> Vec { + vectors + .into_par_iter() + .map(|vector| self.find_nearest_centroid(vector)) + .collect() } - fn update_centroids(&self, vectors: &[Vector]) -> Vec { + fn update_centroids(&self, vectors: Vectors) -> Vec { let k = self.num_centroids; let mut counts = vec![0; k]; - let mut sums = { + let mut centroids = { let dimension = vectors[0].len(); let zeros = vec![0.0; dimension]; vec![zeros; k] @@ -72,7 +90,7 @@ impl KMeans { let cluster_id = self.assignment[i].0 as usize; counts[cluster_id] += 1; - sums[cluster_id] + centroids[cluster_id] .par_iter_mut() .zip(vector.to_vec().par_iter()) .for_each(|(sum, v)| { @@ -80,18 +98,24 @@ impl KMeans { }); } - for i in 0..self.num_centroids { - 
sums[i].par_iter_mut().for_each(|sum| {
+        for i in 0..k {
+            if counts[i] == 0 {
+                let mut rng = rand::thread_rng();
+                centroids[i] = vectors.choose(&mut rng).unwrap().to_vec();
+                continue;
+            }
+
+            centroids[i].par_iter_mut().for_each(|sum| {
                 *sum /= counts[i] as f32;
             });
         }
 
-        sums.into_iter().map(|v| v.into()).collect()
+        centroids.into_iter().map(|v| v.into()).collect()
     }
 
-    /// Finds the closest centroid to a given vector.
+    /// Finds the nearest centroid to a given vector.
     /// - `vector`: Vector to compare with the centroids.
-    pub fn find_closest_centroid(&self, vector: &Vector) -> ClusterID {
+    pub fn find_nearest_centroid(&self, vector: &Vector) -> ClusterID {
         self.centroids
             .par_iter()
             .map(|centroid| self.metric.distance(vector, centroid))
@@ -124,14 +148,19 @@ mod tests {
             vectors.push(vector);
         }
 
+        let vectors: Vectors = {
+            let vectors_ref: Vec<&Vector> = vectors.iter().collect();
+            Rc::from(vectors_ref.as_slice())
+        };
+
         let mut kmeans = KMeans::new(5, 20, DistanceMetric::Euclidean);
-        kmeans.fit(&vectors);
+        kmeans.fit(vectors.clone());
 
         let mut correct_count = 0;
         for (i, cluster_id) in kmeans.assignments().iter().enumerate() {
-            let vector = &vectors[i];
-            let closest_centroid = kmeans.find_closest_centroid(vector);
-            if cluster_id == &closest_centroid {
+            let vector = vectors[i];
+            let nearest_centroid = kmeans.find_nearest_centroid(vector);
+            if cluster_id == &nearest_centroid {
                 correct_count += 1;
             }
         }

From f36840e110d602d04ce6297f74bdb46c34087a4f Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Mon, 22 Jul 2024 20:08:18 -0500
Subject: [PATCH 57/88] refactor: improve docs and optimize code

---
 src/db/database.rs       | 158 +++++++++++++++++++++++++++++----------
 src/db/mod.rs            |   2 +-
 src/indices/idx_flat.rs  |   8 +-
 src/indices/idx_ivfpq.rs | 123 ++++++++++++++++++++++------
 src/indices/mod.rs       | 136 ++++++++++++++++++++++---------
 src/types/distance.rs    |   6 +-
 src/types/err.rs         |   2 +
 src/types/filter.rs      |  35 ++++++---
 src/types/record.rs      |  57 +++++++++-----
 src/utils/kmeans.rs      |  39 +++++++++-
 10 files changed, 426 insertions(+), 140 deletions(-)

diff --git a/src/db/database.rs b/src/db/database.rs
index 41d766a2..6b0a2ad9 100644
--- a/src/db/database.rs
+++ b/src/db/database.rs
@@ -7,14 +7,19 @@ use std::sync::{Arc, Mutex};
 use url::Url;
 use uuid::Uuid;
 
+// Type aliases for better readability.
 type DatabaseURL = String;
 type IndexName = String;
 type IndexFile = PathBuf;
-
 type Index = Arc<Mutex<Box<dyn VectorIndex>>>;
 type IndicesPool = Mutex<HashMap<IndexName, Index>>;
 
 /// The vector database interface.
+///
+/// The database is responsible for managing:
+/// - Data flow between the source database and the indices.
+/// - High-level index operations and management.
+/// - Persistence, retrieval, and in-memory pool of vector indices.
 pub struct Database {
     root: PathBuf,
     state: Mutex<DatabaseState>,
@@ -43,23 +48,35 @@ impl Database {
         let root_dir: PathBuf = root.into();
         let indices_dir = root_dir.join("indices");
         if !indices_dir.try_exists()? {
+            // Creating the indices directory will also create
+            // the root directory if it doesn't exist.
             fs::create_dir_all(&indices_dir)?;
         }
 
         let state_file = root_dir.join("odbstate");
         let state = if state_file.try_exists()? {
-            file::read_binary_file(state_file)?
+            let mut state = DatabaseState::restore(&state_file)?;
+
+            // If the source URL is provided, update the state.
+            // This is useful in case the source URL has changed.
+ if let Some(source) = source_url { + state.with_source(source)?; + } + + state } else { let source = source_url.ok_or_else(|| { let code = ErrorCode::MissingSource; - let message = "Data source is required to create a database."; + let message = "Data source is required for a new database."; Error::new(code, message) })?; + let indices = HashMap::new(); let source = source.into(); DatabaseState::validate_source(&source)?; - let state = DatabaseState { source, indices: HashMap::new() }; + // Persist the new state to the state file. + let state = DatabaseState { source, indices }; file::write_binary_file(state_file, &state)?; state }; @@ -80,16 +97,12 @@ impl Database { algorithm: IndexAlgorithm, config: SourceConfig, ) -> Result<(), Error> { - // Create a new file where the index will be stored. - let index_file = { - let uuid = Uuid::new_v4().to_string(); - self.indices_dir().join(uuid) - }; - + // Query the source database for records. let query = config.to_query(); let mut conn = self.state()?.async_connect().await?; let mut stream = sqlx::query(&query).fetch(conn.acquire().await?); + // Process the rows from the query as records. let mut records = HashMap::new(); while let Some(row) = stream.next().await { let row = row?; @@ -97,28 +110,41 @@ impl Database { records.insert(id, record); } + let index_name: IndexName = name.into(); + let index_file = { + let uuid = Uuid::new_v4().to_string(); + self.indices_dir().join(uuid) + }; + let mut index = algorithm.initialize(config)?; index.fit(records)?; - // Persist the index to the file. + // Persist the index to a file. algorithm.persist_index(&index_file, index.as_ref())?; - let index_name: IndexName = name.into(); - let mut pool = self.pool.lock()?; - pool.insert(index_name.clone(), Arc::new(Mutex::new(index))); + // Insert the index into the pool for easy access. + { + let mut pool = self.pool.lock()?; + pool.insert(index_name.clone(), Arc::new(Mutex::new(index))); + } // Update db state with the new index. - let index_ref = IndexRef { algorithm, file: index_file }; - let mut state = self.state.lock()?; - state.indices.insert(index_name, index_ref); + // This closure is necessary to make sure the lock is dropped + // before persisting the state to the file. + { + let mut state = self.state.lock()?; + let index_ref = IndexRef { algorithm, file: index_file }; + state.indices.insert(index_name, index_ref); + } - drop(state); self.persist_state()?; - Ok(()) } /// Creates a new index in the database synchronously. + /// - `name`: Name of the index. + /// - `algorithm`: Indexing algorithm to use. + /// - `config`: Index data source configuration. pub fn create_index( &self, name: impl Into, @@ -128,10 +154,11 @@ impl Database { executor::block_on(self.async_create_index(name, algorithm, config)) } - /// Returns an index reference by name. + /// Returns an index reference. + /// - `name`: Index name. /// - /// This method is useful for deserializing and accessing - /// the index directly from the file based on the algorithm type. + /// This method can be used to deserialize the index directly from + /// the file and load it into memory as an index object. pub fn get_index_ref(&self, name: impl AsRef) -> Option { let state = self.state.lock().ok()?; let index_ref = state.indices.get(name.as_ref())?; @@ -148,19 +175,21 @@ impl Database { let name = name.as_ref(); let IndexRef { algorithm, file } = self.get_index_ref(name)?; + // If the index is already in the indices pool, return it. 
let mut pool = self.pool.lock().ok()?; if let Some(index) = pool.get(name).cloned() { return Some(index); } + // Load the index from the file and store it in the pool. + // Then, return the index as a trait object. let index = algorithm.load_index(file).ok()?; let index: Index = Arc::new(Mutex::new(index)); pool.insert(name.into(), index.clone()); Some(index) } - /// Retrieves an index and if found, returns it as a trait object. - /// Otherwise, returns a not found error. + /// Retrieves an index and returns it in a result. /// - `name`: Index name. pub fn try_get_index(&self, name: impl AsRef) -> Result { let name = name.as_ref(); @@ -196,6 +225,8 @@ impl Database { let index: Index = self.get_index(name).unwrap(); let (config, query) = { + // We wrap the index lock in a closure to make sure it's dropped + // before async functionalities are called. let index = index.lock()?; let meta = index.metadata(); let config = index.config(); @@ -207,6 +238,7 @@ impl Database { let mut conn = self.state()?.async_connect().await?; let mut stream = sqlx::query(&query).fetch(conn.acquire().await?); + // Process the rows from the database as records. let mut records = HashMap::new(); while let Some(row) = stream.next().await { let row = row?; @@ -214,6 +246,8 @@ impl Database { records.insert(id, record); } + // Update the index with new records and persist it. + // We might want to persist the index after every fit operation. let mut index = index.lock()?; index.fit(records)?; algorithm.persist_index(file, index.as_ref())?; @@ -221,6 +255,7 @@ impl Database { } /// Updates the index with new records from the source synchronously. + /// - `name`: Index name. pub fn refresh_index(&self, name: impl AsRef) -> Result<(), Error> { executor::block_on(self.async_refresh_index(name)) } @@ -230,6 +265,11 @@ impl Database { /// - `query`: Query vector. /// - `k`: Number of nearest neighbors to return. /// - `filters`: SQL-like filters to apply. + /// + /// The performance of this method depends on the indexing + /// algorithm used when creating the index. ANNS algorithms + /// may not return the exact nearest neighbors but perform + /// much faster than linear search. pub fn search_index( &self, name: impl AsRef, @@ -246,7 +286,8 @@ impl Database { /// - `name`: Index name. /// /// Some indexing algorithms may not support perfect incremental updates. - /// This method can be useful to rebalance the index. + /// This method can be useful to rebalance the index after a large number + /// of insertions or deletions. pub fn rebuild_index(&self, name: impl AsRef) -> Result<(), Error> { let name = name.as_ref(); let index: Index = self.try_get_index(name)?; @@ -260,17 +301,24 @@ impl Database { Ok(()) } - /// Deletes an index from the database given its name. + /// Deletes an index from the database. + /// - `name`: Index name. + /// + /// This method will remove the index from the pool and delete + /// the index file from the disk. Returns an error if the index + /// doesn't exist in the database. 
pub fn delete_index(&self, name: impl AsRef) -> Result<(), Error> { let name = name.as_ref(); - let mut state = self.state.lock()?; - let index_ref = state.indices.remove(name).ok_or_else(|| { - let code = ErrorCode::NotFound; - let message = format!("Index doesn't exist: {name}."); - Error::new(code, message) - })?; + let index_ref = { + let mut state = self.state.lock()?; + state.indices.remove(name).ok_or_else(|| { + let code = ErrorCode::NotFound; + let message = format!("Index doesn't exist: {name}."); + Error::new(code, message) + })? + }; - drop(state); + self.release_indices(vec![name])?; fs::remove_file(index_ref.file())?; self.persist_state() } @@ -288,6 +336,7 @@ impl Database { return Err(Error::new(code, message)); } + // Using the get_index method to avoid code duplication. for name in names { self.get_index(name); } @@ -308,9 +357,9 @@ impl Database { &self, names: Vec>, ) -> Result<(), Error> { + let mut pool = self.pool.lock()?; for name in names { let name = name.as_ref(); - let mut pool = self.pool.lock()?; pool.remove(name); } @@ -327,8 +376,8 @@ impl Database { /// /// This method requires a Mutex lock to be available. /// If the lock is not available, this method will be suspended. - /// When running this method with other state lock, drop - /// the lock before calling this method. + /// When running this method with other state lock, make sure + /// to release the lock before calling this method. pub fn persist_state(&self) -> Result<(), Error> { file::write_binary_file(self.state_file(), &self.state()?) } @@ -336,10 +385,12 @@ impl Database { // Write internal database methods here. impl Database { + /// Returns the file path where the state is stored. fn state_file(&self) -> PathBuf { self.root.join("odbstate") } + /// Returns the directory where the indices are stored. fn indices_dir(&self) -> PathBuf { self.root.join("indices") } @@ -353,6 +404,24 @@ pub struct DatabaseState { } impl DatabaseState { + /// Restores the database state from a file. + /// - `path`: Path to the state file. + pub fn restore(path: impl AsRef) -> Result { + file::read_binary_file(path) + } + + /// Updates the source URL of the database state. + /// - `source`: New source URL. + pub fn with_source( + &mut self, + source: impl Into, + ) -> Result<(), Error> { + let source = source.into(); + Self::validate_source(&source)?; + self.source = source; + Ok(()) + } + /// Connects to the source SQL database asynchronously. pub async fn async_connect(&self) -> Result { install_default_drivers(); @@ -365,22 +434,28 @@ impl DatabaseState { } /// Disconnects from the source SQL database asynchronously. + /// - `conn`: Database connection. pub async fn async_disconnect(conn: SourceConnection) -> Result<(), Error> { Ok(conn.close().await?) } /// Disconnects from the source SQL database. + /// - `conn`: Database connection. pub fn disconnect(conn: SourceConnection) -> Result<(), Error> { executor::block_on(Self::async_disconnect(conn)) } - /// Validates the connection to the source database successful. + /// Validates the connection to the source database. + /// + /// This method will try to connect to the source database and + /// disconnect immediately to validate the connection. If this method + /// is unable to connect, it will return an error. pub fn validate_connection(&self) -> Result<(), Error> { let conn = self.connect()?; DatabaseState::disconnect(conn) } - /// Returns the type of the source database. 
+ /// Returns the type of the source database: /// - sqlite /// - mysql /// - postgresql @@ -392,6 +467,11 @@ impl DatabaseState { } /// Validates the data source URL. + /// + /// The source URL scheme must be one of: + /// - sqlite + /// - mysql + /// - postgresql pub fn validate_source(url: impl Into) -> Result<(), Error> { let url = url.into(); let url = url.parse::().map_err(|_| { @@ -423,10 +503,12 @@ pub struct IndexRef { } impl IndexRef { + /// Returns the type of the indexing algorithm of the index. pub fn algorithm(&self) -> &IndexAlgorithm { &self.algorithm } + /// Returns the file path where the index is stored. pub fn file(&self) -> &IndexFile { &self.file } diff --git a/src/db/mod.rs b/src/db/mod.rs index 3e1223cf..a22bd417 100644 --- a/src/db/mod.rs +++ b/src/db/mod.rs @@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize}; use sqlx::{AnyConnection as SourceConnection, Connection}; use std::collections::HashMap; use std::fs; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; mod database; diff --git a/src/indices/idx_flat.rs b/src/indices/idx_flat.rs index 25439bb5..14b4c39c 100644 --- a/src/indices/idx_flat.rs +++ b/src/indices/idx_flat.rs @@ -56,13 +56,17 @@ impl VectorIndex for IndexFlat { /// Refitting doesn't do anything for the flat index as incremental /// insertion or deletion will directly update the data store - /// accordingly which guarantee the optimal state of the index. + /// accordingly and guarantee the optimal state of the index. fn refit(&mut self) -> Result<(), Error> { Ok(()) } /// Removes records from the index data store. /// - `record_ids`: List of record IDs to remove from the index. + /// + /// Instead of hiding the records to prevent them from showing up + /// in search results, this method removes them from the index + /// data store entirely. fn hide(&mut self, record_ids: Vec) -> Result<(), Error> { if self.data.len() < record_ids.len() { return Ok(()); @@ -103,7 +107,7 @@ impl VectorIndex for IndexFlat { } } -/// Flat index parameters. +/// Parameters for IndexFlat. #[derive(Debug, Clone, Default, Serialize, Deserialize)] pub struct ParamsFlat { /// Formula used to calculate the distance between vectors. diff --git a/src/indices/idx_ivfpq.rs b/src/indices/idx_ivfpq.rs index 5391df1a..9851f2e1 100644 --- a/src/indices/idx_ivfpq.rs +++ b/src/indices/idx_ivfpq.rs @@ -2,6 +2,12 @@ use super::*; use crate::utils::kmeans::{KMeans, Vectors}; use std::rc::Rc; +/// Inverted File index with Product Quantization. +/// +/// This index is a composite index that combines the Inverted File +/// algorithm with Product Quantization to achieve a balance between +/// memory usage and search speed. It is a great choice for large +/// datasets with millions of records. #[derive(Debug, Serialize, Deserialize)] pub struct IndexIVFPQ { config: SourceConfig, @@ -16,6 +22,12 @@ pub struct IndexIVFPQ { } impl IndexIVFPQ { + /// Builds the index from scratch. + /// - `records`: Dataset to build the index from. + /// + /// This method should only be called when the index is first + /// initialized or when the index needs to be rebuilt from scratch + /// because this method will overwrite the existing index data. fn build( &mut self, records: HashMap, @@ -29,9 +41,12 @@ impl IndexIVFPQ { .map(|record| &record.vector) .collect::>(); + // We use RC to avoid cloning the entire vector data as it + // can be very large and expensive to clone. 
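The `Rc` trick mentioned in this comment can be shown in isolation (a standalone sketch, not the crate's code):

```rust
use std::rc::Rc;

fn main() {
    let data: Vec<Vec<f32>> = vec![vec![1.0, 2.0], vec![3.0, 4.0]];

    // Collect references, then share them behind a reference-counted slice.
    let refs: Vec<&Vec<f32>> = data.iter().collect();
    let shared: Rc<[&Vec<f32>]> = Rc::from(refs.as_slice());

    // Cloning copies one pointer and bumps a counter; the vector
    // data itself is never duplicated.
    let alias = shared.clone();
    assert_eq!(Rc::strong_count(&shared), 2);
    assert_eq!(alias[1][0], 3.0);
}
```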
let vectors: Vectors = Rc::from(vectors.as_slice()); self.create_codebook(vectors.clone()); + // Run KMeans to find the centroids for the IVF. let (centroids, assignments) = { let mut kmeans = KMeans::new( self.params.centroids, @@ -45,6 +60,8 @@ impl IndexIVFPQ { self.centroids = centroids; self.clusters = { + // Put record IDs into their respective clusters based on the + // assignments from the KMeans algorithm. let mut clusters = vec![vec![]; self.params.centroids]; let ids = records.keys().collect::>(); for (i, &cluster) in assignments.iter().enumerate() { @@ -57,6 +74,7 @@ impl IndexIVFPQ { self.metadata.count = records.len(); self.metadata.last_inserted = records.keys().max().copied(); + // Store the quantized vectors instead of the original vectors. self.data = records .into_iter() .map(|(id, record)| { @@ -69,6 +87,8 @@ impl IndexIVFPQ { Ok(()) } + /// Inserts new records into the index incrementally. + /// - `records`: New records to insert. fn insert( &mut self, records: HashMap, @@ -77,13 +97,33 @@ impl IndexIVFPQ { return Ok(()); } - let assignments = records + let vectors = records .values() - .map(|record| self.find_nearest_centroids(&record.vector, 1)[0]) + .map(|record| &record.vector) + .collect::>(); + + let assignments = vectors + .par_iter() + .map(|vector| self.find_nearest_centroids(vector, 1)[0]) .collect::>(); let ids: Vec<&RecordID> = records.keys().collect(); for (i, cluster_id) in assignments.iter().enumerate() { + // The number of records in the cluster. + let count = self.clusters[*cluster_id].len() as f32; + let new_count = count + 1.0; + + // This updates the centroid of the cluster by taking the + // weighted average of the existing centroid and the new + // vector that is being inserted. + let centroid: Vec = self.centroids[*cluster_id] + .to_vec() + .par_iter() + .zip(vectors[i].to_vec().par_iter()) + .map(|(c, v)| ((c * count) + v) / new_count) + .collect(); + + self.centroids[*cluster_id] = centroid.into(); self.clusters[*cluster_id].push(ids[i].to_owned()); } @@ -103,6 +143,12 @@ impl IndexIVFPQ { Ok(()) } + /// Creates the codebook for the Product Quantization. + /// - `vectors`: Dataset to create the codebook from. + /// + /// The size of the dataset should be large enough to cover the + /// entire vector space to ensure the codebook represents the + /// distribution of the vectors accurately. fn create_codebook(&mut self, vectors: Vectors) { for i in 0..self.params.sub_dimension { let mut subvectors = Vec::new(); @@ -125,15 +171,18 @@ impl IndexIVFPQ { self.codebook[i as usize] = centroids .par_iter() - .map(|centroid| centroid.clone().into()) + .map(|centroid| centroid.to_owned()) .collect(); } } + /// Finds the nearest centroids to a vector for cluster assignments. + /// - `vector`: Full-length vector. + /// - `k`: Number of centroids to find. fn find_nearest_centroids(&self, vector: &Vector, k: usize) -> Vec { let mut distances: Vec<(usize, f32)> = self .centroids - .iter() + .par_iter() .enumerate() .map(|(i, center)| (i, self.metric().distance(center, vector))) .collect(); @@ -142,13 +191,25 @@ impl IndexIVFPQ { distances.into_iter().take(k).map(|(i, _)| i).collect() } + /// Finds the nearest centroid in the codebook for a subvector. + /// - `part_index`: Quantization part index. + /// - `subvector`: Subvector to quantize. + /// + /// Part index is used to determine which part of the vector to + /// quantize. 
For example, if we have a vector with 4 dimensions and + /// we want to quantize it into two parts: + /// + /// ```text + /// [1, 2, 3, 4] => [[1, 2], [3, 4]] + /// part_index => 0 1 + /// ``` fn find_nearest_code( &self, part_index: usize, subvector: &Vector, ) -> usize { self.codebook[part_index] - .iter() + .par_iter() .map(|centroid| self.metric().distance(centroid, subvector)) .enumerate() .min_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap()) @@ -156,33 +217,39 @@ impl IndexIVFPQ { .0 } + /// Quantizes a full-length vector into a PQ vector. + /// - `vector`: Vector data. fn quantize_vector(&self, vector: &Vector) -> VectorPQ { - let sub_dimension = self.params.sub_dimension as usize; - let mut pq = Vec::with_capacity(sub_dimension); - - for i in 0..sub_dimension { - let subvector = self.get_subvector(i, vector); - let centroid_id = self.find_nearest_code(i, &subvector); - pq.push(centroid_id as u8); - } - - pq.into() + (0..self.params.sub_dimension as usize) + .into_par_iter() + .map(|i| { + let subvector = self.get_subvector(i, vector); + self.find_nearest_code(i, &subvector) as u8 + }) + .collect::>() + .into() } + /// Reconstructs a full-length vector from a PQ vector. + /// - `vector_pq`: PQ vector data. fn dequantize_vector(&self, vector_pq: &VectorPQ) -> Vector { - let mut vector = vec![]; - for (i, centroid_id) in vector_pq.0.iter().enumerate() { - let centroid = &self.codebook[i][*centroid_id as usize]; - vector.extend(centroid.to_vec()); - } - - vector.into() + vector_pq + .0 + .par_iter() + .enumerate() + .map(|(i, code_id)| self.codebook[i][*code_id as usize].to_vec()) + .flatten() + .collect::>() + .into() } + /// Extracts a subvector from a full-length vector. + /// - `part_index`: Quantization part index. + /// - `vector`: Full-length vector. fn get_subvector(&self, part_index: usize, vector: &Vector) -> Vector { let dim = vector.len() / self.params.sub_dimension as usize; - let start = part_index as usize * dim; - let end = (part_index + 1) as usize * dim; + let start = part_index * dim; + let end = (part_index + 1) * dim; let subvector = vector.0[start..end].to_vec(); Vector(subvector.into_boxed_slice()) } @@ -263,8 +330,15 @@ impl VectorIndex for IndexIVFPQ { for centroid_id in nearest_centroids { let cluster = &self.clusters[centroid_id]; for &record_id in cluster { + // Skip hidden records. + if self.metadata.hidden.contains(&record_id) { + continue; + } + let record = self.data.get(&record_id).unwrap(); let data = record.data.clone(); + + // Skip records that don't pass the filters. if !filters.apply(&data) { continue; } @@ -292,6 +366,7 @@ impl VectorIndex for IndexIVFPQ { } } +/// Parameters for IndexIVFPQ. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ParamsIVFPQ { /// Number of centroids in the IVF. diff --git a/src/indices/mod.rs b/src/indices/mod.rs index 626859bc..425e65c1 100644 --- a/src/indices/mod.rs +++ b/src/indices/mod.rs @@ -15,6 +15,7 @@ use std::path::Path; mod idx_flat; mod idx_ivfpq; +// Re-export indices and their parameter types. pub use idx_flat::{IndexFlat, ParamsFlat}; pub use idx_ivfpq::{IndexIVFPQ, ParamsIVFPQ}; @@ -23,7 +24,7 @@ pub type TableName = String; /// Type of SQL database used as a data source. #[allow(missing_docs)] -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq, Hash)] pub enum SourceType { SQLITE, POSTGRES, @@ -31,35 +32,42 @@ pub enum SourceType { } impl From<&str> for SourceType { - fn from(value: &str) -> Self { - match value { + /// Converts source URL scheme to a source type. 
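To make the quantization round trip concrete, here is a toy, self-contained version with a 4-dimensional vector, two parts, and a hypothetical two-entry codebook per part (an illustration only; the real index derives its codebooks with k-means):

```rust
// Toy PQ round trip: split the vector into subvectors, encode each as
// the index of its nearest codebook entry, then reconstruct.
fn squared_euclidean(a: &[f32], b: &[f32]) -> f32 {
    a.iter().zip(b).map(|(x, y)| (x - y).powi(2)).sum()
}

fn main() {
    // One codebook per part; the entries are made-up centroids.
    let codebook: Vec<Vec<Vec<f32>>> = vec![
        vec![vec![1.0, 2.0], vec![9.0, 9.0]], // part 0
        vec![vec![0.0, 0.0], vec![3.0, 4.0]], // part 1
    ];

    let vector = [1.0_f32, 2.0, 3.0, 4.0];
    let codes: Vec<u8> = vector
        .chunks(2)
        .enumerate()
        .map(|(part, sub)| {
            (0..codebook[part].len())
                .min_by(|&a, &b| {
                    squared_euclidean(&codebook[part][a], sub)
                        .partial_cmp(&squared_euclidean(&codebook[part][b], sub))
                        .unwrap()
                })
                .unwrap() as u8
        })
        .collect();
    assert_eq!(codes, vec![0u8, 1]); // [1,2] -> centroid 0, [3,4] -> centroid 1

    // Reconstruction concatenates the chosen centroids.
    let rebuilt: Vec<f32> = codes
        .iter()
        .enumerate()
        .flat_map(|(part, &c)| codebook[part][c as usize].clone())
        .collect();
    assert_eq!(rebuilt, vector);
}
```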
+ fn from(scheme: &str) -> Self { + match scheme { "sqlite" => SourceType::SQLITE, "postgres" | "postgresql" => SourceType::POSTGRES, "mysql" => SourceType::MYSQL, - _ => panic!("Unsupported database scheme: {value}."), + _ => panic!("Unsupported database scheme: {scheme}."), } } } /// Data source configuration for a vector index. +/// +/// The column data types used as the data source must be the following: +/// - Primary Key: Unique auto-incremented integer. +/// - Vector: Array of floats stored as JSON string or binary. +/// - Metadata: Primitive types like string, integer, float, or boolean. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct SourceConfig { /// Name of the SQL table to use as data source. pub table: TableName, - /// Column name of the primary key in the data source. + /// Column name of the primary key in the source table. pub primary_key: ColumnName, - /// Column name storing the vector data. + /// Name of the column storing the vector data. pub vector: ColumnName, - /// Optional list of column names storing additional metadata. + /// Optional list of column names of additional metadata. pub metadata: Option>, /// Filter to apply to the SQL query using WHERE clause. pub filter: Option, } +#[cfg(test)] impl Default for SourceConfig { fn default() -> Self { SourceConfig { - table: "table".into(), + table: "embeddings".into(), primary_key: "id".into(), vector: "vector".into(), metadata: None, @@ -93,7 +101,7 @@ impl SourceConfig { /// Adds a list of metadata columns to the source configuration. /// - `metadata`: List of metadata column names. /// - /// OasysDB only supports primitive data types for metadata columns such as: + /// OasysDB only supports primitive data types for metadata such as: /// - String /// - Integer /// - Float @@ -107,7 +115,7 @@ impl SourceConfig { } /// Adds a filter to the source configuration. - /// - `filter`: Filter string without the WHERE keyword. + /// - `filter`: SQL filter string without the WHERE keyword. /// /// Example: /// ```text @@ -119,7 +127,10 @@ impl SourceConfig { self } - /// Returns the list of columns in the source configuration. + /// Returns the list of columns in the following order: + /// - Primary Key + /// - Vector + /// - Metadata (if any) pub fn columns(&self) -> Vec { let mut columns = vec![&self.primary_key, &self.vector]; if let Some(metadata) = &self.metadata { @@ -129,7 +140,7 @@ impl SourceConfig { columns.into_iter().map(|s| s.to_owned()).collect() } - /// Generates a SQL query string based on the configuration. + /// Generates a SQL query based on the source configuration. /// /// Example: /// ```sql @@ -149,15 +160,18 @@ impl SourceConfig { query.trim().to_string() } - /// Generates a SQL query string based on the configuration and checkpoint. - /// Instead of returning a query to fetch all records, this method returns - /// a query to fetch records from a specific RecordID. + /// Generates a SQL query string based on the configuration and a primary + /// key checkpoint. Instead of returning a query to fetch all records, + /// this method returns a query to fetch records from a specific RecordID. /// - `checkpoint`: Record ID to start the query from. pub(crate) fn to_query_after(&self, checkpoint: &RecordID) -> String { let table = &self.table; + let pk = &self.primary_key; let columns = self.columns().join(", "); - let mut filter = format!("WHERE id > {}", checkpoint.0); + // Prioritize the primary key filtering before + // joining with the optional filter. 
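Concretely, for a hypothetical config with table `embeddings`, primary key `id`, one metadata column, a filter of `year > 2000`, and checkpoint 100, the assembly below mirrors what the surrounding code produces (standalone sketch, not the crate's code):

```rust
// Standalone mock-up of the checkpoint query assembly described here.
fn main() {
    let (table, pk, columns) = ("embeddings", "id", "id, vector, year");
    let checkpoint = 100;
    let extra = Some("year > 2000"); // optional user filter

    // Primary-key filtering comes first, then the optional filter.
    let mut filter = format!("WHERE {pk} > {checkpoint}");
    if let Some(f) = extra {
        filter.push_str(&format!(" AND ({f})"));
    }
    let query = format!("SELECT {columns} FROM {table} {filter}");
    assert_eq!(
        query,
        "SELECT id, vector, year FROM embeddings WHERE id > 100 AND (year > 2000)"
    );
}
```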
+ let mut filter = format!("WHERE {pk} > {}", checkpoint.0); if let Some(string) = &self.filter { filter.push_str(&format!(" AND ({string})")); } @@ -167,6 +181,7 @@ } /// Creates a tuple of record ID and record data from a row. + /// - `row`: SQL row containing the record data. pub(crate) fn to_record( &self, row: &AnyRow, @@ -174,6 +189,7 @@ let id = RecordID::from_row(&self.primary_key, row)?; let vector = Vector::from_row(&self.vector, row)?; + // Parse all metadata from the row if any. let mut metadata = HashMap::new(); if let Some(metadata_columns) = &self.metadata { for column in metadata_columns { @@ -188,6 +204,10 @@ } /// Algorithm options used to index and search vectors. +/// +/// You might want to use a different algorithm based on the size +/// of the data and the desired search performance. For example, +/// the Flat algorithm gives good performance and recall for small datasets. #[allow(missing_docs)] #[derive(Debug, Clone, Serialize, Deserialize)] pub enum IndexAlgorithm { @@ -211,6 +231,8 @@ } } +impl Eq for IndexAlgorithm {} + impl IndexAlgorithm { /// Initializes a new index based on the algorithm and configuration. /// - `config`: Source configuration for the index. @@ -218,31 +240,35 @@ &self, config: SourceConfig, ) -> Result, Error> { - match self.to_owned() { - Self::Flat(params) => { - let index = IndexFlat::new(config, params)?; - Ok(Box::new(index)) - } - Self::IVFPQ(params) => { - let index = IndexIVFPQ::new(config, params)?; + macro_rules! initialize { + ($index_type:ident, $params:expr) => {{ + let index = $index_type::new(config, $params)?; Ok(Box::new(index)) - } + }}; + } + + match self.to_owned() { + Self::Flat(params) => initialize!(IndexFlat, params), + Self::IVFPQ(params) => initialize!(IndexIVFPQ, params), } } + /// Loads an index from a file based on the algorithm. + /// - `path`: Path to the file where the index is stored. pub(crate) fn load_index( &self, path: impl AsRef, ) -> Result, Error> { - match self { - Self::Flat(_) => { - let index = Self::_load_index::(path)?; + macro_rules! load { + ($index_type:ident) => {{ + let index = Self::_load_index::<$index_type>(path)?; Ok(Box::new(index)) - } - Self::IVFPQ(_) => { - let index = Self::_load_index::(path)?; - Ok(Box::new(index)) - } + }}; + } + + match self { + Self::Flat(_) => load!(IndexFlat), + Self::IVFPQ(_) => load!(IndexIVFPQ), } } @@ -254,9 +280,15 @@ path: impl AsRef, index: &dyn VectorIndex, ) -> Result<(), Error> { + macro_rules! persist { + ($index_type:ident) => {{ + Self::_persist_index::<$index_type>(path, index) + }}; + } + match self { - Self::Flat(_) => Self::_persist_index::(path, index), - Self::IVFPQ(_) => Self::_persist_index::(path, index), + Self::Flat(_) => persist!(IndexFlat), + Self::IVFPQ(_) => persist!(IndexIVFPQ), } } @@ -282,7 +314,11 @@ } } -/// Metadata about the index for operations and optimizations. +/// Metadata about the index operations. +/// +/// This information should be available to all index implementations +/// to keep track of the overall state of the index. This data is useful +/// to optimize the index operations and to provide insights about the index. #[derive(Debug, Serialize, Deserialize, Default)] pub struct IndexMetadata { /// Hidden records that will not be included in search results. pub hidden: Vec, /// Last inserted data reference used for incremental insertion. pub last_inserted: Option, /// Number of records in the index. pub count: usize, } /// Nearest neighbor search result. #[derive(Debug)] pub struct SearchResult { /// ID of the record in the data source. @@ -324,7 +360,11 @@ } } -/// Trait for a new index implementation.
+/// Trait for an index implementation. +/// +/// This trait defines the basic operations that an index should support. +/// The trait comes with default implementations for loading and persisting +/// the index to a file that should work for most cases. pub trait IndexOps: Debug + Serialize + DeserializeOwned { /// Initializes an empty index with the given configuration. /// - `config`: Source configuration for the index. @@ -345,7 +385,16 @@ pub trait IndexOps: Debug + Serialize + DeserializeOwned { } } -/// Trait for operating vector index implementations. +/// Trait for operating an index implementation. +/// +/// This trait defines operational methods to interact with the index such as +/// fitting and searching the index. Every index implementation should have the +/// following fields: +/// +/// - `config`: Data source configuration. +/// - `params`: Index-specific parameters. +/// - `metadata`: Index metadata. +/// - `data`: Records stored in the index. pub trait VectorIndex: Debug + Send + Sync { /// Returns the configuration of the index. fn config(&self) -> &SourceConfig; @@ -357,6 +406,7 @@ pub trait VectorIndex: Debug + Send + Sync { fn metadata(&self) -> &IndexMetadata; /// Trains the index based on the new records. + /// - `records`: Records to train the index on. /// /// If the index has been trained and not empty, this method /// will incrementally train the index based on the current fitting. @@ -374,6 +424,11 @@ pub trait VectorIndex: Debug + Send + Sync { /// - `query`: Query vector. /// - `k`: Number of nearest neighbors to return. /// - `filters`: Filters to apply to the search results. + /// + /// Returns search results sorted by their distance to the query. + /// The degree of the distance might vary depending on the metric + /// used but the smallest distance always means the most similar + /// record to the query. fn search( &self, query: Vector, @@ -382,6 +437,7 @@ pub trait VectorIndex: Debug + Send + Sync { ) -> Result, Error>; /// Hides certain records from the search result permanently. + /// - `record_ids`: List of record IDs to hide. fn hide(&mut self, record_ids: Vec) -> Result<(), Error>; /// Returns the index as Any type for dynamic casting. @@ -392,6 +448,10 @@ pub trait VectorIndex: Debug + Send + Sync { } /// Trait for custom index parameters. +/// +/// Every index implementation should have a custom parameter struct that +/// implements this trait. The parameters struct should also derive the +/// Serialize and Deserialize traits as it will be stored inside of the index. pub trait IndexParams: Debug + Default + Clone { /// Returns the distance metric set in the parameters. fn metric(&self) -> &DistanceMetric; @@ -400,6 +460,8 @@ pub trait IndexParams: Debug + Default + Clone { fn as_any(&self) -> &dyn Any; } +/// Downcasts the index parameters trait object to a concrete type. +/// - `params`: Index parameters trait object. pub(crate) fn downcast_params( params: impl IndexParams, ) -> Result { diff --git a/src/types/distance.rs b/src/types/distance.rs index fc9e3dec..13c6ad17 100644 --- a/src/types/distance.rs +++ b/src/types/distance.rs @@ -2,14 +2,14 @@ use crate::types::record::Vector; use serde::{Deserialize, Serialize}; use simsimd::SpatialSimilarity; -/// Distance metric used to compare vectors in the index. +/// Metric used to compare the distance between vectors in the index. 
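As a plain-Rust sketch of what the two metrics below compute (the crate itself delegates to simsimd's SIMD kernels; this is an illustration only):

```rust
// Scalar reference versions of the two supported metrics.
fn squared_euclidean(a: &[f32], b: &[f32]) -> f32 {
    a.iter().zip(b).map(|(x, y)| (x - y).powi(2)).sum()
}

fn cosine_distance(a: &[f32], b: &[f32]) -> f32 {
    let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
    let na: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
    let nb: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
    1.0 - dot / (na * nb)
}

fn main() {
    let a = [1.0_f32, 0.0];
    let b = [0.0_f32, 1.0];
    assert_eq!(squared_euclidean(&a, &b), 2.0);
    // Orthogonal vectors have zero similarity, so the distance is 1.
    assert!((cosine_distance(&a, &b) - 1.0).abs() < 1e-6);
}
```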
#[derive(Debug, Default, PartialEq, Eq, PartialOrd, Ord)] #[derive(Serialize, Deserialize, Clone, Copy, Hash)] pub enum DistanceMetric { /// Squared [Euclidean distance](https://www.geeksforgeeks.org/euclidean-distance) /// /// The squared Euclidean distance is used to avoid the square - root operation thus making the computation faster. + root operation, thus making the computation slightly faster. #[default] Euclidean, /// Cosine distance (1 - cosine similarity): @@ -28,6 +28,8 @@ impl DistanceMetric { DistanceMetric::Cosine => f32::cosine(a, b), }; + // A distance of 0 is the best possible distance, so we return a large + // value for invalid results to make sure they are never selected. if dist.is_none() || dist.unwrap().is_nan() || dist.unwrap().is_infinite() diff --git a/src/types/err.rs index f27f70e5..8c2288de 100644 --- a/src/types/err.rs +++ b/src/types/err.rs @@ -43,6 +43,8 @@ pub struct Error { impl Error { /// Creates a new error instance. + /// - `code`: Error code. + /// - `message`: Details about why the error occurred. pub fn new(code: ErrorCode, message: impl Into) -> Self { Self { code, message: message.into() } } diff --git a/src/types/filter.rs index ac5ddcf0..e2a14a00 100644 --- a/src/types/filter.rs +++ b/src/types/filter.rs @@ -1,6 +1,7 @@ #![allow(missing_docs)] use crate::types::record::*; +use rayon::prelude::*; use std::collections::HashMap; /// Joined multiple filters operation with either AND or OR. @@ -15,11 +16,16 @@ pub enum Filters { } impl Filters { + /// Returns true if the record passes the filters. + /// - `data`: Record metadata to check against the filters. + /// + /// Filters of NONE type will always return true. This is useful when + /// no filters are provided and we want to include all records. pub fn apply(&self, data: &HashMap>) -> bool { match self { Filters::NONE => true, - Filters::AND(filters) => filters.iter().all(|f| f.apply(data)), - Filters::OR(filters) => filters.iter().any(|f| f.apply(data)), + Filters::AND(filters) => filters.par_iter().all(|f| f.apply(data)), + Filters::OR(filters) => filters.par_iter().any(|f| f.apply(data)), } } } @@ -62,24 +68,26 @@ pub struct Filter { } impl Filter { + /// Returns true if the data passes the filter. + /// - `data`: Data to apply the filter on. pub fn apply(&self, data: &HashMap>) -> bool { let value = match data.get(&self.column).unwrap_or(&None) { Some(value) => value, None => return false, }; - // This alias helps us cut down lines of code. - type Type = DataValue; + // This alias helps us simplify the match statement. + type T = DataValue; match (value, &self.value) { - (Type::Boolean(a), Type::Boolean(b)) => self.match_boolean(*a, *b), - (Type::Float(a), Type::Float(b)) => self.match_number(a, b), - (Type::Integer(a), Type::Integer(b)) => self.match_number(a, b), - (Type::String(a), Type::String(b)) => self.match_string(a, b), + (T::Boolean(a), T::Boolean(b)) => self.match_boolean(a, b), + (T::Float(a), T::Float(b)) => self.match_number(a, b), + (T::Integer(a), T::Integer(b)) => self.match_number(a, b), + (T::String(a), T::String(b)) => self.match_string(a, b), _ => false, } } - fn match_boolean(&self, a: bool, b: bool) -> bool { + fn match_boolean(&self, a: &bool, b: &bool) -> bool { match self.operator { FilterOperator::Equal => a == b, FilterOperator::NotEqual => a != b, @@ -116,12 +124,15 @@ impl From<&str> for Filter { } // Split the filter string into EXACTLY 3 parts.
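The `splitn(3, ' ')` used here keeps any spaces inside the value token intact, which matters for string values. A standalone check of that behavior (not the crate's code):

```rust
// Mirror of the three-part filter tokenization shown here.
fn main() {
    let raw = "year >= 2000";
    let parts: Vec<&str> = raw.splitn(3, ' ').map(str::trim).collect();
    assert_eq!(parts, vec!["year", ">=", "2000"]);

    // A value containing spaces survives intact in the third part.
    let raw = "name = John Doe";
    let parts: Vec<&str> = raw.splitn(3, ' ').collect();
    assert_eq!(parts[2], "John Doe");
}
```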
- let parts: Vec<&str> = value.splitn(3, ' ').collect(); - let parts: Vec<&str> = parts.into_iter().map(|p| p.trim()).collect(); + let parts = value + .splitn(3, ' ') + .map(|token| token.trim()) + .collect::>(); - let column = parts[0].into(); + let column = parts[0].to_string(); let operator = FilterOperator::from(parts[1]); let value = DataValue::from(parts[2]); + Filter { column, value, operator } } } diff --git a/src/types/record.rs b/src/types/record.rs index 7108af37..b7aa4204 100644 --- a/src/types/record.rs +++ b/src/types/record.rs @@ -1,21 +1,30 @@ use crate::types::err::{Error, ErrorCode}; use half::f16; +use rayon::prelude::*; use serde::{Deserialize, Serialize}; use sqlx::any::AnyRow; use sqlx::postgres::any::AnyTypeInfoKind as SQLType; use sqlx::{Row, ValueRef}; use std::collections::HashMap; -/// Column name of the SQL data source table. +/// Column name in the SQL data source table. pub type ColumnName = String; -/// ID type for records in the index from the data source. +/// Integer-based ID for each record in the index. +/// +/// For this to work properly with SQL as the data source, the column +/// containing the primary key must be a unique auto-incrementing integer. +/// Auto-incrementing integer type is important to allow the index to be +/// updated incrementally. #[derive(Debug, Clone, Copy, Serialize, Deserialize)] #[derive(Hash, PartialEq, Eq, PartialOrd, Ord, Default)] pub struct RecordID(pub u32); -/// Record type stored in the index based on the -/// configuration and data source. +/// Record data type stored in non-PQ indices. +/// +/// This data type contains the vector embedding and additional metadata +/// which depends on the source configuration. This data type is compatible +/// with the standard SQL row type. #[derive(Debug, Serialize, Deserialize)] pub struct Record { /// Vector embedding. @@ -37,7 +46,11 @@ pub struct RecordPQ { pub data: HashMap>, } -/// Vector data type stored in the index. +/// Vector data type for non-PQ indices. +/// +/// This data type uses the half-precision floating-point format +/// to store the vector data to reduce the memory footprint by half +/// compared to the standard f32 format. #[derive(Debug, Clone, Serialize, Deserialize)] #[derive(PartialEq, PartialOrd)] pub struct Vector(pub Box<[f16]>); @@ -45,6 +58,8 @@ pub struct Vector(pub Box<[f16]>); impl Vector { /// Returns the vector data as a vector of f32. pub fn to_vec(&self) -> Vec { + // Don't use parallel iterator here since it actually + // slows it down significantly. self.0.iter().map(|v| v.to_f32()).collect() } @@ -61,11 +76,16 @@ impl Vector { impl From> for Vector { fn from(value: Vec) -> Self { - Vector(value.into_iter().map(f16::from_f32).collect()) + Vector(value.into_par_iter().map(f16::from_f32).collect()) } } /// Product quantized vector data type stored in the index. +/// +/// PQ is a method used to reduce the memory footprint of the vector +/// data by dividing the vector into sub-vectors and quantizing them. +/// When performing similarity search, the quantized vectors are used +/// to reconstruct the original vector along with the codebook. #[derive(Debug, Serialize, Deserialize)] pub struct VectorPQ(pub Box<[u8]>); @@ -82,18 +102,13 @@ impl From> for VectorPQ { } } -impl From for VectorPQ { - fn from(value: Vector) -> Self { - value - .to_vec() - .iter() - .map(|v| (v * 255.0).round() as u8) - .collect::>() - .into() - } -} - /// Data types supported as metadata in the index. 
+/// +/// These are the corresponding SQL data types of the metadata: +/// - Boolean: BOOL +/// - Float: REAL | DOUBLE (Both converted to F32) +/// - Integer: SMALLINT | INT | BIGINT +/// - String: TEXT #[allow(missing_docs)] #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub enum DataValue { @@ -151,6 +166,8 @@ impl From for DataValue { pub(crate) trait RowOps { /// Retrieves data from the row based on the column name. + /// - `column_name`: Name of the column to retrieve data from. + /// - `row`: SQL row containing the data. fn from_row( column_name: impl Into, row: &AnyRow, @@ -164,7 +181,7 @@ impl RowOps for RecordID { column_name: impl Into, row: &AnyRow, ) -> Result { - let column_name = column_name.into(); + let column_name: String = column_name.into(); let id = row.try_get::(&column_name).map_err(|_| { let code = ErrorCode::InvalidID; let message = "Unable to get integer ID from the row."; @@ -180,7 +197,7 @@ impl RowOps for Vector { column_name: impl Into, row: &AnyRow, ) -> Result { - let column = column_name.into(); + let column: String = column_name.into(); let value = row.try_get_raw::<&str>(&column)?; let value_type = value.type_info().kind(); @@ -215,7 +232,7 @@ impl RowOps for Option { column_name: impl Into, row: &AnyRow, ) -> Result { - let column = column_name.into(); + let column: String = column_name.into(); let value = row.try_get_raw::<&str>(&column)?; let value_type = value.type_info().kind(); diff --git a/src/utils/kmeans.rs b/src/utils/kmeans.rs index 590901eb..5016c496 100644 --- a/src/utils/kmeans.rs +++ b/src/utils/kmeans.rs @@ -4,12 +4,23 @@ use rand::seq::SliceRandom; use rayon::prelude::*; use std::rc::Rc; +/// Reference of an array of vectors to be clustered. +/// +/// We use RC slice to avoid cloning the entire dataset when passing them +/// around in the KMeans model. This way, we only clone the references +/// to the dataset which is much faster and cheaper. pub type Vectors<'v> = Rc<[&'v Vector]>; #[derive(Debug, Clone, Copy, Default, Hash)] #[derive(PartialEq, Eq, PartialOrd, Ord)] pub struct ClusterID(pub u16); +/// KMeans clustering model. +/// +/// KMeans is a simple unsupervised learning algorithm that groups similar +/// data points into clusters. The algorithm works by iteratively assigning +/// each data point to the nearest centroid and then recalculating the +/// centroids of the clusters. #[derive(Debug)] pub struct KMeans { num_centroids: usize, @@ -21,6 +32,9 @@ pub struct KMeans { impl KMeans { /// Creates a new KMeans model. + /// - `num_centroids`: Number of clusters to create. + /// - `num_iterations`: Number of iterations to run the algorithm. + /// - `metric`: Distance metric to use for comparing vectors. pub fn new( num_centroids: usize, num_iterations: usize, @@ -36,14 +50,17 @@ impl KMeans { } /// Fits the KMeans model to the given vectors. + /// - `vectors`: Array of vectors to cluster. pub fn fit(&mut self, vectors: Vectors) { // Cloning the vectors is acceptable because with Rc, we are // only cloning the references, not the actual data. self.centroids = self.initialize_centroids(vectors.clone()); - let mut repeat_centroids = 0; + let mut repeat_count = 0; for _ in 0..self.num_iterations { - if repeat_centroids > 5 { + // If the centroids don't change for 5 iterations, we assume + // that the algorithm has converged and stop the iterations. 
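A toy, self-contained version of this loop with 1-D points and k = 2 follows (the crate's implementation works on `Vector` values and is parallelized; this sketch only mirrors the convergence rule):

```rust
// Minimal 1-D k-means with the repeat-count early stop described above.
fn main() {
    let points = [1.0_f32, 1.2, 0.8, 8.0, 8.2, 7.8];
    let mut centroids = [0.0_f32, 10.0];
    let mut repeat_count = 0;

    for _ in 0..100 {
        if repeat_count > 5 {
            break; // assume the algorithm has converged
        }
        // Assign each point to its nearest centroid.
        let assign: Vec<usize> = points
            .iter()
            .map(|p| {
                if (p - centroids[0]).abs() <= (p - centroids[1]).abs() { 0 } else { 1 }
            })
            .collect();
        // Recompute each centroid as the mean of its cluster.
        let mut new_centroids = [0.0_f32; 2];
        for c in 0..2 {
            let members: Vec<f32> = points
                .iter()
                .zip(assign.iter())
                .filter(|&(_, &a)| a == c)
                .map(|(p, _)| *p)
                .collect();
            if !members.is_empty() {
                new_centroids[c] = members.iter().sum::<f32>() / members.len() as f32;
            }
        }
        if new_centroids == centroids {
            repeat_count += 1;
        } else {
            centroids = new_centroids;
            repeat_count = 0;
        }
    }
    assert!((centroids[0] - 1.0).abs() < 0.1 && (centroids[1] - 8.0).abs() < 0.1);
}
```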
+ if repeat_count > 5 { break; } @@ -51,10 +68,10 @@ impl KMeans { let centroids = self.update_centroids(vectors.clone()); match self.centroids == centroids { - true => repeat_centroids += 1, + true => repeat_count += 1, false => { self.centroids = centroids; - repeat_centroids = 0; + repeat_count = 0; } } } @@ -126,6 +143,20 @@ impl KMeans { } /// Returns the cluster assignment for each vector. + /// + /// The assignment is a vector of cluster ID where each element + /// corresponds to the cluster ID of the vector at the same index. + /// For example, if we fit the vector below: + /// + /// ```text + /// [v1, v2, v3, ..., vn] + /// Assignments: [0, 0, 1, ..., m] + /// ``` + /// + /// This can be interpreted as: + /// - v1 and v2 are assigned to cluster 0. + /// - v3 is assigned to cluster 1. + /// - vn is assigned to cluster m. pub fn assignments(&self) -> &[ClusterID] { &self.assignment } From 9f1e80734093df71ec77079e2ec0cb93225112ae Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Tue, 23 Jul 2024 20:24:02 -0500 Subject: [PATCH 58/88] feat: move source config from index to indexref --- src/db/database.rs | 32 +++++++++++++++++++------------- src/indices/idx_flat.rs | 14 ++------------ src/indices/idx_ivfpq.rs | 20 +++----------------- src/indices/mod.rs | 22 +++++----------------- 4 files changed, 29 insertions(+), 59 deletions(-) diff --git a/src/db/database.rs b/src/db/database.rs index 6b0a2ad9..1e61ad80 100644 --- a/src/db/database.rs +++ b/src/db/database.rs @@ -116,7 +116,7 @@ impl Database { self.indices_dir().join(uuid) }; - let mut index = algorithm.initialize(config)?; + let mut index = algorithm.initialize()?; index.fit(records)?; // Persist the index to a file. @@ -133,7 +133,7 @@ impl Database { // before persisting the state to the file. { let mut state = self.state.lock()?; - let index_ref = IndexRef { algorithm, file: index_file }; + let index_ref = IndexRef { algorithm, config, file: index_file }; state.indices.insert(index_name, index_ref); } @@ -173,7 +173,7 @@ impl Database { /// in the pool for future access. pub fn get_index(&self, name: impl AsRef) -> Option { let name = name.as_ref(); - let IndexRef { algorithm, file } = self.get_index_ref(name)?; + let IndexRef { algorithm, file, .. } = self.get_index_ref(name)?; // If the index is already in the indices pool, return it. let mut pool = self.pool.lock().ok()?; @@ -218,21 +218,19 @@ impl Database { })?; // Cloning is necessary here to avoid borrowing issues. - let IndexRef { algorithm, file } = index_ref.to_owned(); + let IndexRef { algorithm, file, config } = index_ref.to_owned(); // It's safe to unwrap here because we validated that index exists by // calling get_index_ref method above. let index: Index = self.get_index(name).unwrap(); - let (config, query) = { + let (query, config) = { // We wrap the index lock in a closure to make sure it's dropped // before async functionalities are called. let index = index.lock()?; let meta = index.metadata(); - let config = index.config(); - let checkpoint = meta.last_inserted.unwrap_or_default(); - (config.to_owned(), config.to_query_after(&checkpoint)) + (config.to_query_after(&checkpoint), config) }; let mut conn = self.state()?.async_connect().await?; @@ -295,9 +293,10 @@ impl Database { index.refit()?; // Unwrap is safe here because we validated that the index exists above. - let IndexRef { algorithm, file } = self.get_index_ref(name).unwrap(); - algorithm.persist_index(file, index.as_ref())?; + let IndexRef { algorithm, file, .. 
} = + self.get_index_ref(name).unwrap(); + algorithm.persist_index(file, index.as_ref())?; Ok(()) } @@ -498,11 +497,17 @@ impl DatabaseState { /// Details about the index and where it is stored. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct IndexRef { + config: SourceConfig, algorithm: IndexAlgorithm, file: IndexFile, } impl IndexRef { + /// Returns the source configuration of the index. + pub fn config(&self) -> &SourceConfig { + &self.config + } + /// Returns the type of the indexing algorithm of the index. pub fn algorithm(&self) -> &IndexAlgorithm { &self.algorithm @@ -678,9 +683,10 @@ mod tests { let create_table = format!( "CREATE TABLE IF NOT EXISTS {TABLE} ( - id INTEGER PRIMARY KEY, - vector JSON NOT NULL, - data INTEGER NOT NULL)" + id INTEGER PRIMARY KEY, + vector JSON NOT NULL, + data INTEGER NOT NULL + )" ); let insert_records = generate_insert_query(0, 100); diff --git a/src/indices/idx_flat.rs b/src/indices/idx_flat.rs index 14b4c39c..3cce70bc 100644 --- a/src/indices/idx_flat.rs +++ b/src/indices/idx_flat.rs @@ -7,19 +7,14 @@ use super::*; /// 10,000 records due to perfect recall and precision. #[derive(Debug, Serialize, Deserialize)] pub struct IndexFlat { - config: SourceConfig, params: ParamsFlat, metadata: IndexMetadata, data: HashMap, } impl IndexOps for IndexFlat { - fn new( - config: SourceConfig, - params: impl IndexParams, - ) -> Result { + fn new(params: impl IndexParams) -> Result { let index = IndexFlat { - config, params: downcast_params(params)?, metadata: IndexMetadata::default(), data: HashMap::new(), @@ -30,10 +25,6 @@ impl IndexOps for IndexFlat { } impl VectorIndex for IndexFlat { - fn config(&self) -> &SourceConfig { - &self.config - } - fn metric(&self) -> &DistanceMetric { &self.params.metric } @@ -130,9 +121,8 @@ mod tests { #[test] fn test_flat_index() { - let config = SourceConfig::default(); let params = ParamsFlat::default(); - let mut index = IndexFlat::new(config, params).unwrap(); + let mut index = IndexFlat::new(params).unwrap(); index_tests::populate_index(&mut index); index_tests::test_basic_search(&index); diff --git a/src/indices/idx_ivfpq.rs b/src/indices/idx_ivfpq.rs index 9851f2e1..cc4dc8b0 100644 --- a/src/indices/idx_ivfpq.rs +++ b/src/indices/idx_ivfpq.rs @@ -10,7 +10,6 @@ use std::rc::Rc; /// datasets with millions of records. 
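To put that trade-off in numbers, a back-of-envelope estimate (the record count, dimensionality, and sub-vector count below are illustrative assumptions, not crate defaults):

```rust
// Rough memory comparison of full-precision vs PQ-coded storage.
fn main() {
    let records = 1_000_000_u64;
    let dims = 768_u64;       // assumed embedding size
    let sub_vectors = 96_u64; // assumed PQ parts, one u8 code each

    let full_f32 = records * dims * 4;    // plain f32 storage in bytes
    let pq_codes = records * sub_vectors; // quantized storage in bytes
    println!("f32: {} MB", full_f32 / 1_000_000); // ~3072 MB
    println!("pq : {} MB", pq_codes / 1_000_000); // ~96 MB
    // The codebook and IVF cluster lists add a small constant on top.
}
```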
#[derive(Debug, Serialize, Deserialize)] pub struct IndexIVFPQ { - config: SourceConfig, params: ParamsIVFPQ, metadata: IndexMetadata, data: HashMap, @@ -256,16 +255,12 @@ impl IndexIVFPQ { } impl IndexOps for IndexIVFPQ { - fn new( - config: SourceConfig, - params: impl IndexParams, - ) -> Result { + fn new(params: impl IndexParams) -> Result { let params = downcast_params::(params)?; let codebook = vec![vec![]; params.sub_dimension as usize]; let clusters = vec![vec![]; params.centroids]; let index = IndexIVFPQ { - config, params, metadata: IndexMetadata::default(), data: HashMap::new(), @@ -280,10 +275,6 @@ impl IndexOps for IndexIVFPQ { } impl VectorIndex for IndexIVFPQ { - fn config(&self) -> &SourceConfig { - &self.config - } - fn metric(&self) -> &DistanceMetric { &self.params.metric } @@ -431,7 +422,7 @@ mod tests { ..Default::default() }; - let mut index = create_test_index(params); + let mut index = IndexIVFPQ::new(params).unwrap(); index.create_codebook(vectors); let encoded = index.quantize_vector(&data[0]); @@ -447,14 +438,9 @@ mod tests { ..Default::default() }; - let mut index = create_test_index(params); + let mut index = IndexIVFPQ::new(params).unwrap(); index_tests::populate_index(&mut index); index_tests::test_basic_search(&index); index_tests::test_advanced_search(&index); } - - fn create_test_index(params: ParamsIVFPQ) -> IndexIVFPQ { - let config = SourceConfig::default(); - IndexIVFPQ::new(config, params).unwrap() - } } diff --git a/src/indices/mod.rs b/src/indices/mod.rs index 425e65c1..2364b001 100644 --- a/src/indices/mod.rs +++ b/src/indices/mod.rs @@ -234,15 +234,11 @@ impl PartialEq for IndexAlgorithm { impl Eq for IndexAlgorithm {} impl IndexAlgorithm { - /// Initializes a new index based on the algorithm and configuration. - /// - `config`: Source configuration for the index. - pub(crate) fn initialize( - &self, - config: SourceConfig, - ) -> Result, Error> { + /// Initializes a new index based on the algorithm and its parameters. + pub(crate) fn initialize(&self) -> Result, Error> { macro_rules! initialize { ($index_type:ident, $params:expr) => {{ - let index = $index_type::new(config, $params)?; + let index = $index_type::new($params)?; Ok(Box::new(index)) }}; } @@ -366,13 +362,9 @@ impl Ord for SearchResult { /// The trait comes with default implementations for loading and persisting /// the index to a file that should work for most cases. pub trait IndexOps: Debug + Serialize + DeserializeOwned { - /// Initializes an empty index with the given configuration. - /// - `config`: Source configuration for the index. + /// Initializes an empty index with the given parameters. /// - `params`: Index specific parameters. - fn new( - config: SourceConfig, - params: impl IndexParams, - ) -> Result; + fn new(params: impl IndexParams) -> Result; /// Reads and deserializes the index from a file. fn load(path: impl AsRef) -> Result { @@ -391,14 +383,10 @@ pub trait IndexOps: Debug + Serialize + DeserializeOwned { /// fitting and searching the index. Every index implementation should have the /// following fields: /// -/// - `config`: Data source configuration. /// - `params`: Index-specific parameters. /// - `metadata`: Index metadata. /// - `data`: Records stored in the index. pub trait VectorIndex: Debug + Send + Sync { - /// Returns the configuration of the index. - fn config(&self) -> &SourceConfig; - /// Returns the distance metric used by the index. 
fn metric(&self) -> &DistanceMetric; From 9117f53e60b331b900c55e98d0c8fe42f6e276fc Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Tue, 23 Jul 2024 20:36:32 -0500 Subject: [PATCH 59/88] feat: add insufficient data check for ivfpq --- src/indices/idx_ivfpq.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/indices/idx_ivfpq.rs b/src/indices/idx_ivfpq.rs index cc4dc8b0..1bd85405 100644 --- a/src/indices/idx_ivfpq.rs +++ b/src/indices/idx_ivfpq.rs @@ -35,6 +35,14 @@ impl IndexIVFPQ { return Ok(()); } + // Ensure that the number of records is good enough to build + // the index compared to the parameters. + if records.len() < self.params.centroids * 5 { + let code = ErrorCode::InvalidSource; + let message = "Dataset is too small to build the index properly."; + return Err(Error::new(code, message)); + } + let vectors = records .values() .map(|record| &record.vector) From 2e645baa3e3ec640b17ef50e455694f4063a6199 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Wed, 24 Jul 2024 15:26:44 -0500 Subject: [PATCH 60/88] feat: simplify indices methods --- src/db/database.rs | 39 +----- src/indices/idx_flat.rs | 41 +++--- src/indices/idx_ivfpq.rs | 267 +++++++++++++++++---------------------- src/indices/mod.rs | 50 ++++---- src/types/err.rs | 1 + 5 files changed, 165 insertions(+), 233 deletions(-) diff --git a/src/db/database.rs b/src/db/database.rs index 1e61ad80..a2248cc9 100644 --- a/src/db/database.rs +++ b/src/db/database.rs @@ -117,7 +117,7 @@ impl Database { }; let mut index = algorithm.initialize()?; - index.fit(records)?; + index.build(records)?; // Persist the index to a file. algorithm.persist_index(&index_file, index.as_ref())?; @@ -247,7 +247,7 @@ impl Database { // Update the index with new records and persist it. // We might want to persist the index after every fit operation. let mut index = index.lock()?; - index.fit(records)?; + index.insert(records)?; algorithm.persist_index(file, index.as_ref())?; Ok(()) } @@ -280,26 +280,6 @@ impl Database { index.search(query.into(), k, filters.into()) } - /// Rebuilds the index from the existing records in the index. - /// - `name`: Index name. - /// - /// Some indexing algorithms may not support perfect incremental updates. - /// This method can be useful to rebalance the index after a large number - /// of insertions or deletions. - pub fn rebuild_index(&self, name: impl AsRef) -> Result<(), Error> { - let name = name.as_ref(); - let index: Index = self.try_get_index(name)?; - let mut index = index.lock()?; - index.refit()?; - - // Unwrap is safe here because we validated that the index exists above. - let IndexRef { algorithm, file, .. } = - self.get_index_ref(name).unwrap(); - - algorithm.persist_index(file, index.as_ref())?; - Ok(()) - } - /// Deletes an index from the database. /// - `name`: Index name. 
/// @@ -542,7 +522,7 @@ mod tests { let index = index.lock()?; let metadata = index.metadata(); - assert_eq!(metadata.count, 100); + assert_eq!(index.len(), 100); assert_eq!(metadata.last_inserted, Some(RecordID(100))); Ok(()) } @@ -559,7 +539,7 @@ mod tests { let index = index.lock()?; let metadata = index.metadata(); - assert_eq!(metadata.count, 110); + assert_eq!(index.len(), 110); assert_eq!(metadata.last_inserted, Some(RecordID(110))); Ok(()) } @@ -587,17 +567,6 @@ mod tests { assert_eq!(results[0].id, RecordID(51)); } - #[test] - fn test_database_rebuild_index() -> Result<(), Error> { - let db = create_test_database()?; - db.rebuild_index(TEST_INDEX)?; - - let index: Index = db.try_get_index(TEST_INDEX)?; - let index = index.lock()?; - assert_eq!(index.metadata().count, 100); - Ok(()) - } - #[test] fn test_database_delete_index() { let db = create_test_database().unwrap(); diff --git a/src/indices/idx_flat.rs b/src/indices/idx_flat.rs index 3cce70bc..1c86aa3f 100644 --- a/src/indices/idx_flat.rs +++ b/src/indices/idx_flat.rs @@ -33,38 +33,29 @@ impl VectorIndex for IndexFlat { &self.metadata } - fn fit(&mut self, records: HashMap) -> Result<(), Error> { + fn build( + &mut self, + records: HashMap, + ) -> Result<(), Error> { + self.metadata.built = true; + self.insert(records) + } + + fn insert( + &mut self, + records: HashMap, + ) -> Result<(), Error> { if records.is_empty() { return Ok(()); } self.metadata.last_inserted = records.keys().max().copied(); - self.metadata.count += records.len(); self.data.par_extend(records); - Ok(()) } - /// Refitting doesn't do anything for the flat index as incremental - /// insertion or deletion will directly update the data store - /// accordingly and guarantee the optimal state of the index. - fn refit(&mut self) -> Result<(), Error> { - Ok(()) - } - - /// Removes records from the index data store. - /// - `record_ids`: List of record IDs to remove from the index. - /// - /// Instead of hiding the records to prevent them from showing up - /// in search results, this method removes them from the index - /// data store entirely. - fn hide(&mut self, record_ids: Vec) -> Result<(), Error> { - if self.data.len() < record_ids.len() { - return Ok(()); - } - - self.data.retain(|id, _| !record_ids.contains(id)); - self.metadata.count = self.data.len(); + fn delete(&mut self, ids: Vec) -> Result<(), Error> { + self.data.retain(|id, _| !ids.contains(id)); Ok(()) } @@ -93,6 +84,10 @@ impl VectorIndex for IndexFlat { Ok(results.into_sorted_vec()) } + fn len(&self) -> usize { + self.data.len() + } + fn as_any(&self) -> &dyn Any { self } diff --git a/src/indices/idx_ivfpq.rs b/src/indices/idx_ivfpq.rs index 1bd85405..f8f47ab6 100644 --- a/src/indices/idx_ivfpq.rs +++ b/src/indices/idx_ivfpq.rs @@ -21,135 +21,6 @@ pub struct IndexIVFPQ { } impl IndexIVFPQ { - /// Builds the index from scratch. - /// - `records`: Dataset to build the index from. - /// - /// This method should only be called when the index is first - /// initialized or when the index needs to be rebuilt from scratch - /// because this method will overwrite the existing index data. - fn build( - &mut self, - records: HashMap, - ) -> Result<(), Error> { - if records.is_empty() { - return Ok(()); - } - - // Ensure that the number of records is good enough to build - // the index compared to the parameters. 
- if records.len() < self.params.centroids * 5 { - let code = ErrorCode::InvalidSource; - let message = "Dataset is too small to build the index properly."; - return Err(Error::new(code, message)); - } - - let vectors = records - .values() - .map(|record| &record.vector) - .collect::>(); - - // We use RC to avoid cloning the entire vector data as it - // can be very large and expensive to clone. - let vectors: Vectors = Rc::from(vectors.as_slice()); - self.create_codebook(vectors.clone()); - - // Run KMeans to find the centroids for the IVF. - let (centroids, assignments) = { - let mut kmeans = KMeans::new( - self.params.centroids, - self.params.num_iterations, - self.metric().to_owned(), - ); - - kmeans.fit(vectors.clone()); - (kmeans.centroids().to_vec(), kmeans.assignments().to_vec()) - }; - - self.centroids = centroids; - self.clusters = { - // Put record IDs into their respective clusters based on the - // assignments from the KMeans algorithm. - let mut clusters = vec![vec![]; self.params.centroids]; - let ids = records.keys().collect::>(); - for (i, &cluster) in assignments.iter().enumerate() { - clusters[cluster.0 as usize].push(ids[i].to_owned()); - } - - clusters - }; - - self.metadata.count = records.len(); - self.metadata.last_inserted = records.keys().max().copied(); - - // Store the quantized vectors instead of the original vectors. - self.data = records - .into_iter() - .map(|(id, record)| { - let vector = self.quantize_vector(&record.vector); - let data = record.data; - (id, RecordPQ { vector, data }) - }) - .collect(); - - Ok(()) - } - - /// Inserts new records into the index incrementally. - /// - `records`: New records to insert. - fn insert( - &mut self, - records: HashMap, - ) -> Result<(), Error> { - if records.is_empty() { - return Ok(()); - } - - let vectors = records - .values() - .map(|record| &record.vector) - .collect::>(); - - let assignments = vectors - .par_iter() - .map(|vector| self.find_nearest_centroids(vector, 1)[0]) - .collect::>(); - - let ids: Vec<&RecordID> = records.keys().collect(); - for (i, cluster_id) in assignments.iter().enumerate() { - // The number of records in the cluster. - let count = self.clusters[*cluster_id].len() as f32; - let new_count = count + 1.0; - - // This updates the centroid of the cluster by taking the - // weighted average of the existing centroid and the new - // vector that is being inserted. - let centroid: Vec = self.centroids[*cluster_id] - .to_vec() - .par_iter() - .zip(vectors[i].to_vec().par_iter()) - .map(|(c, v)| ((c * count) + v) / new_count) - .collect(); - - self.centroids[*cluster_id] = centroid.into(); - self.clusters[*cluster_id].push(ids[i].to_owned()); - } - - self.metadata.count += records.len(); - self.metadata.last_inserted = records.keys().max().copied(); - - let records: HashMap = records - .into_par_iter() - .map(|(id, record)| { - let vector = self.quantize_vector(&record.vector); - let data = record.data; - (id, RecordPQ { vector, data }) - }) - .collect(); - - self.data.par_extend(records); - Ok(()) - } - /// Creates the codebook for the Product Quantization. /// - `vectors`: Dataset to create the codebook from. 
/// @@ -291,27 +162,127 @@ impl VectorIndex for IndexIVFPQ { &self.metadata } - fn fit(&mut self, records: HashMap) -> Result<(), Error> { - match self.metadata.count { - 0 => self.build(records), - _ => self.insert(records), - } + fn build( + &mut self, + records: HashMap, + ) -> Result<(), Error> { + let vectors = records + .values() + .map(|record| &record.vector) + .collect::>(); + + // We use RC to avoid cloning the entire vector data as it + // can be very large and expensive to clone. + let vectors: Vectors = Rc::from(vectors.as_slice()); + self.create_codebook(vectors.clone()); + + // Run KMeans to find the centroids for the IVF. + let (centroids, assignments) = { + let mut kmeans = KMeans::new( + self.params.centroids, + self.params.num_iterations, + self.metric().to_owned(), + ); + + kmeans.fit(vectors.clone()); + (kmeans.centroids().to_vec(), kmeans.assignments().to_vec()) + }; + + self.centroids = centroids; + self.clusters = { + // Put record IDs into their respective clusters based on the + // assignments from the KMeans algorithm. + let mut clusters = vec![vec![]; self.params.centroids]; + let ids = records.keys().collect::>(); + for (i, &cluster) in assignments.iter().enumerate() { + clusters[cluster.0 as usize].push(ids[i].to_owned()); + } + + clusters + }; + + self.metadata.last_inserted = records.keys().max().copied(); + self.metadata.built = true; + + // Store the quantized vectors instead of the original vectors. + self.data = records + .into_iter() + .map(|(id, record)| { + let vector = self.quantize_vector(&record.vector); + let data = record.data; + (id, RecordPQ { vector, data }) + }) + .collect(); + + Ok(()) } - fn refit(&mut self) -> Result<(), Error> { - self.data.retain(|id, _| !self.metadata.hidden.contains(id)); + fn insert( + &mut self, + records: HashMap, + ) -> Result<(), Error> { + if records.is_empty() { + return Ok(()); + } + + if !self.metadata().built { + let code = ErrorCode::RequestError; + let message = "Unable to insert records into an unbuilt index."; + return Err(Error::new(code, message)); + } - let records = self - .data + let vectors = records + .values() + .map(|record| &record.vector) + .collect::>(); + + let assignments = vectors .par_iter() + .map(|vector| self.find_nearest_centroids(vector, 1)[0]) + .collect::>(); + + let ids: Vec<&RecordID> = records.keys().collect(); + for (i, cluster_id) in assignments.iter().enumerate() { + // The number of records in the cluster. + let count = self.clusters[*cluster_id].len() as f32; + let new_count = count + 1.0; + + // This updates the centroid of the cluster by taking the + // weighted average of the existing centroid and the new + // vector that is being inserted. 
+ let centroid: Vec = self.centroids[*cluster_id] + .to_vec() + .par_iter() + .zip(vectors[i].to_vec().par_iter()) + .map(|(c, v)| ((c * count) + v) / new_count) + .collect(); + + self.centroids[*cluster_id] = centroid.into(); + self.clusters[*cluster_id].push(ids[i].to_owned()); + } + + self.metadata.last_inserted = records.keys().max().copied(); + + let records: HashMap = records + .into_par_iter() .map(|(id, record)| { - let vector = self.dequantize_vector(&record.vector); - let data = record.data.clone(); - (*id, Record { vector, data }) + let vector = self.quantize_vector(&record.vector); + let data = record.data; + (id, RecordPQ { vector, data }) }) .collect(); - self.build(records) + self.data.par_extend(records); + Ok(()) + } + + fn delete(&mut self, ids: Vec) -> Result<(), Error> { + self.data.retain(|id, _| !ids.contains(id)); + self.clusters.par_iter_mut().for_each(|cluster| { + cluster.retain(|id| !ids.contains(id)); + }); + + Ok(()) } fn search( @@ -329,15 +300,8 @@ impl VectorIndex for IndexIVFPQ { for centroid_id in nearest_centroids { let cluster = &self.clusters[centroid_id]; for &record_id in cluster { - // Skip hidden records. - if self.metadata.hidden.contains(&record_id) { - continue; - } - let record = self.data.get(&record_id).unwrap(); let data = record.data.clone(); - - // Skip records that don't pass the filters. if !filters.apply(&data) { continue; } @@ -355,9 +319,8 @@ impl VectorIndex for IndexIVFPQ { Ok(results.into_sorted_vec()) } - fn hide(&mut self, record_ids: Vec) -> Result<(), Error> { - self.metadata.hidden.extend(record_ids); - Ok(()) + fn len(&self) -> usize { + self.data.len() } fn as_any(&self) -> &dyn Any { diff --git a/src/indices/mod.rs b/src/indices/mod.rs index 2364b001..01e120f4 100644 --- a/src/indices/mod.rs +++ b/src/indices/mod.rs @@ -317,15 +317,17 @@ impl IndexAlgorithm { /// to optimize the index operations and to provide insights about the index. #[derive(Debug, Serialize, Deserialize, Default)] pub struct IndexMetadata { - /// Hidden records that will not be included in search results. - pub hidden: Vec, - /// Last inserted data reference used for incremental insertion. + /// Status whether the index has been built or not. + pub built: bool, + /// Last inserted record reference used for incremental insertion. pub last_inserted: Option, - /// Number of records in the index. - pub count: usize, } /// Nearest neighbor search result. +/// +/// This struct contains the additional metadata of the records +/// which is often used for post-search operations such as using +/// the metadata as a context for RAG (Retrieval Augmented Generation). #[derive(Debug)] pub struct SearchResult { /// ID of the record in the data source. @@ -393,20 +395,23 @@ pub trait VectorIndex: Debug + Send + Sync { /// Returns metadata about the index. fn metadata(&self) -> &IndexMetadata; - /// Trains the index based on the new records. - /// - `records`: Records to train the index on. - /// - /// If the index has been trained and not empty, this method - /// will incrementally train the index based on the current fitting. - /// Otherwise, this method will train the index from scratch like normal. - fn fit(&mut self, records: HashMap) -> Result<(), Error>; + /// Builds the index from scratch based on the records. + /// - `records`: Records to build the index on. + fn build( + &mut self, + records: HashMap, + ) -> Result<(), Error>; - /// Resets the index and re-trains it on the non-hidden records. 
-    ///
-    /// Incremental fitting is not as optimal as fitting from scratch for
-    /// some indexing algorithms. This method could be useful to re-balance
-    /// the index after a certain threshold of incremental fitting.
-    fn refit(&mut self) -> Result<(), Error>;
+    /// Inserts new records into the index incrementally.
+    /// - `records`: Records to insert into the index.
+    fn insert(
+        &mut self,
+        records: HashMap<RecordID, Record>,
+    ) -> Result<(), Error>;
+
+    /// Deletes records from the index data store.
+    /// - `ids`: List of record IDs to delete from the index.
+    fn delete(&mut self, ids: Vec<RecordID>) -> Result<(), Error>;

     /// Searches for the nearest neighbors of the query vector.
     /// - `query`: Query vector.
@@ -415,7 +420,7 @@ pub trait VectorIndex: Debug + Send + Sync {
     ///
     /// Returns search results sorted by their distance to the query.
     /// The degree of the distance might vary depending on the metric
-    /// used but the smallest distance always means the most similar
+    /// used but the smallest distance always means the most relevant
     /// record to the query.
     fn search(
         &self,
@@ -424,9 +429,8 @@ pub trait VectorIndex: Debug + Send + Sync {
         filters: Filters,
     ) -> Result<Vec<SearchResult>, Error>;

-    /// Hides certain records from the search result permanently.
-    /// - `record_ids`: List of record IDs to hide.
-    fn hide(&mut self, record_ids: Vec<RecordID>) -> Result<(), Error>;
+    /// Returns the number of records in the index.
+    fn len(&self) -> usize;

     /// Returns the index as Any type for dynamic casting.
     ///
@@ -502,7 +506,7 @@ mod index_tests {
         records.insert(id, record);
     }

-    index.fit(records).unwrap();
+    index.build(records).unwrap();
 }

 pub fn test_basic_search(index: &impl VectorIndex) {
diff --git a/src/types/err.rs b/src/types/err.rs
index 8c2288de..20cdb1c5 100644
--- a/src/types/err.rs
+++ b/src/types/err.rs
@@ -23,6 +23,7 @@ pub enum ErrorCode {
     // Other generic errors.
     InternalError,
+    RequestError,
     NotFound,

     // External error types.

From bbf8e8f800f0afa793be23c8484965671f7fa87a Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Wed, 24 Jul 2024 15:48:55 -0500
Subject: [PATCH 61/88] feat: add optional simd feature

---
 Cargo.lock            | 151 ++++++++++++++++++++++++++++++++++++++++++
 Cargo.toml            |  11 ++-
 src/types/distance.rs |  31 ++++++++-
 3 files changed, 190 insertions(+), 3 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index e983c17b..d975a0a2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,6 +2,12 @@
 # It is not intended for manual editing.
version = 3 +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + [[package]] name = "ahash" version = "0.8.11" @@ -135,6 +141,15 @@ version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" +[[package]] +name = "crc32fast" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +dependencies = [ + "cfg-if", +] + [[package]] name = "crossbeam-deque" version = "0.8.5" @@ -185,6 +200,36 @@ dependencies = [ "typenum", ] +[[package]] +name = "curl" +version = "0.4.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e2161dd6eba090ff1594084e95fd67aeccf04382ffea77999ea94ed42ec67b6" +dependencies = [ + "curl-sys", + "libc", + "openssl-probe", + "openssl-sys", + "schannel", + "socket2", + "windows-sys 0.52.0", +] + +[[package]] +name = "curl-sys" +version = "0.4.73+curl-8.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "450ab250ecf17227c39afb9a2dd9261dc0035cb80f2612472fc0c4aac2dcb84d" +dependencies = [ + "cc", + "libc", + "libz-sys", + "openssl-sys", + "pkg-config", + "vcpkg", + "windows-sys 0.52.0", +] + [[package]] name = "der" version = "0.7.9" @@ -262,6 +307,28 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" +[[package]] +name = "filetime" +version = "0.2.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ee447700ac8aa0b2f2bd7bc4462ad686ba06baa6727ac149a2d6277f0d240fd" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall 0.4.1", + "windows-sys 0.52.0", +] + +[[package]] +name = "flate2" +version = "1.0.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f54427cfd1c7829e2a139fcefea601bf088ebca651d2bf53ebc600eac295dae" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + [[package]] name = "flume" version = "0.11.0" @@ -533,6 +600,18 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "libz-sys" +version = "1.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c15da26e5af7e25c90b37a2d75cdbf940cf4a55316de9d84c679c9b8bfabf82e" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "linux-raw-sys" version = "0.4.14" @@ -577,6 +656,15 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" +[[package]] +name = "miniz_oxide" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08" +dependencies = [ + "adler", +] + [[package]] name = "nom" version = "7.1.3" @@ -639,6 +727,9 @@ name = "oasysdb" version = "0.7.0" dependencies = [ "bincode", + "byteorder", + "curl", + "flate2", "futures", "half", "rand", @@ -647,6 +738,7 @@ dependencies = [ "serde_json", "simsimd", "sqlx", + "tar", "url", "uuid", ] @@ -657,6 +749,24 @@ version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +[[package]] +name = "openssl-probe" +version = "0.1.5" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" + +[[package]] +name = "openssl-sys" +version = "0.9.103" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f9e8deee91df40a943c71b917e5874b951d32a802526c85721ce3b776c929d6" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "parking_lot" version = "0.12.3" @@ -871,6 +981,15 @@ version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" +[[package]] +name = "schannel" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" +dependencies = [ + "windows-sys 0.52.0", +] + [[package]] name = "scopeguard" version = "1.2.0" @@ -964,6 +1083,16 @@ version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" +[[package]] +name = "socket2" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + [[package]] name = "spin" version = "0.9.8" @@ -1223,6 +1352,17 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "tar" +version = "0.4.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb797dad5fb5b76fcf519e702f4a589483b5ef06567f160c392832c1f5e44909" +dependencies = [ + "filetime", + "libc", + "xattr", +] + [[package]] name = "tempfile" version = "3.10.1" @@ -1548,6 +1688,17 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "xattr" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8da84f1a25939b27f6820d92aed108f83ff920fdf11a7b19366c27c4cda81d4f" +dependencies = [ + "libc", + "linux-raw-sys", + "rustix", +] + [[package]] name = "zerocopy" version = "0.7.35" diff --git a/Cargo.toml b/Cargo.toml index 1d447df8..56c06d26 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,8 +23,8 @@ futures = "0.3.30" rand = "0.8.5" # Parallelism. +simsimd = { version = "4.4.0", optional = true } rayon = "1.10.0" -simsimd = "4.4.0" # Serialization. serde = { version = "1.0.203", features = ["derive"] } @@ -36,6 +36,15 @@ version = "0.7.4" default-features = false features = ["all-databases"] +[dev-dependencies] +byteorder = "1.5.0" +curl = "0.4.44" +flate2 = "1.0.30" +tar = "0.4.41" + +[features] +simd = ["dep:simsimd"] + [profile.release] lto = true opt-level = "z" diff --git a/src/types/distance.rs b/src/types/distance.rs index 13c6ad17..32fe8725 100644 --- a/src/types/distance.rs +++ b/src/types/distance.rs @@ -1,5 +1,7 @@ use crate::types::record::Vector; use serde::{Deserialize, Serialize}; + +#[cfg(feature = "simd")] use simsimd::SpatialSimilarity; /// Metric used to compare the distance between vectors in the index. 
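Editor's note: the distance.rs hunks below route `DistanceMetric` through private helpers that call simsimd only when the new `simd` feature is enabled, falling back to scalar loops otherwise. A minimal self-contained sketch of that cfg-dispatch pattern follows; it is not OasysDB's public API, and it assumes a crate with `simsimd` declared as an optional dependency behind a `simd` feature, as in the Cargo.toml hunk above.

```rust
#[cfg(feature = "simd")]
use simsimd::SpatialSimilarity;

// Squared Euclidean distance with an optional SIMD backend.
#[allow(unreachable_code)]
fn sqeuclidean(a: &[f32], b: &[f32]) -> f64 {
    // With `--features simd` this early return wins and the scalar
    // fallback below becomes dead code, hence the allow attribute.
    #[cfg(feature = "simd")]
    return f32::sqeuclidean(a, b).unwrap_or(f64::MAX);

    a.iter()
        .zip(b.iter())
        .map(|(x, y)| (x - y).powi(2) as f64)
        .sum()
}

fn main() {
    let (a, b) = ([1.0f32, 2.0], [4.0f32, 6.0]);
    assert_eq!(sqeuclidean(&a, &b), 25.0); // 3^2 + 4^2 = 25
}
```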
@@ -24,8 +26,8 @@ impl DistanceMetric {
         let b = &b.to_vec();

         let dist = match self {
-            DistanceMetric::Euclidean => f32::sqeuclidean(a, b),
-            DistanceMetric::Cosine => f32::cosine(a, b),
+            DistanceMetric::Euclidean => Self::sqeuclidean(a, b),
+            DistanceMetric::Cosine => Self::cosine(a, b),
         };

         // Distances of 0 is the best distance. So, we return a large
@@ -39,6 +41,31 @@ impl DistanceMetric {

         dist.unwrap() as f32
     }
+
+    fn sqeuclidean(a: &[f32], b: &[f32]) -> Option<f64> {
+        #[cfg(feature = "simd")]
+        return f32::sqeuclidean(a, b);
+
+        let dist = a
+            .iter()
+            .zip(b.iter())
+            .map(|(a, b)| (a - b).powi(2) as f64)
+            .sum::<f64>();
+
+        Some(dist)
+    }
+
+    fn cosine(a: &[f32], b: &[f32]) -> Option<f64> {
+        #[cfg(feature = "simd")]
+        return f32::cosine(a, b);
+
+        let dot = a.iter().zip(b.iter()).map(|(a, b)| a * b).sum::<f32>();
+        let norm_a = a.iter().map(|x| x.powi(2)).sum::<f32>().sqrt();
+        let norm_b = b.iter().map(|x| x.powi(2)).sum::<f32>().sqrt();
+
+        let dist = 1.0 - dot / (norm_a * norm_b);
+        Some(dist as f64)
+    }
 }

 #[cfg(test)]

From 493875d5d44016d3a741bf9da1b699ae9f05c0ff Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Thu, 25 Jul 2024 13:09:30 -0500
Subject: [PATCH 62/88] feat: add integration test

---
 src/db/database.rs       |  13 +--
 src/indices/idx_ivfpq.rs | 154 +++++++++++++++++------------
 src/indices/mod.rs       |  25 +++--
 src/types/distance.rs    |   2 +
 src/utils/kmeans.rs      |  11 ++-
 tests/common/mod.rs      | 203 +++++++++++++++++++++++++++++++++++++++
 tests/test_index.rs      |  55 +++++++++++
 7 files changed, 385 insertions(+), 78 deletions(-)
 create mode 100644 tests/common/mod.rs
 create mode 100644 tests/test_index.rs

diff --git a/src/db/database.rs b/src/db/database.rs
index a2248cc9..31b48977 100644
--- a/src/db/database.rs
+++ b/src/db/database.rs
@@ -596,7 +596,7 @@ mod tests {
     }

     fn create_test_database() -> Result<Database, Error> {
-        let path = PathBuf::from("odb_data");
+        let path = PathBuf::from("odb_utest");
         if path.try_exists()? {
             fs::remove_dir_all(&path)?;
         }
@@ -659,15 +659,16 @@ mod tests {
         );

         let insert_records = generate_insert_query(0, 100);
+        let drop_table = format!("DROP TABLE IF EXISTS {TABLE}");

-        conn.execute("DROP TABLE IF EXISTS embeddings").await?;
+        conn.execute(drop_table.as_str()).await?;
         conn.execute(create_table.as_str()).await?;
         conn.execute(insert_records.as_str()).await?;

-        let count = conn
-            .fetch_one("SELECT COUNT(*) FROM embeddings")
-            .await?
-            .get::<i64, usize>(0);
+        let count = {
+            let query = format!("SELECT COUNT(*) FROM {TABLE}");
+            conn.fetch_one(query.as_str()).await?.get::<i64, usize>(0)
+        };

         assert_eq!(count, 100);
         Ok(())
diff --git a/src/indices/idx_ivfpq.rs b/src/indices/idx_ivfpq.rs
index f8f47ab6..3bc993de 100644
--- a/src/indices/idx_ivfpq.rs
+++ b/src/indices/idx_ivfpq.rs
@@ -1,5 +1,7 @@
 use super::*;
-use crate::utils::kmeans::{KMeans, Vectors};
+use crate::utils::kmeans::{ClusterID, KMeans, Vectors};
+use rand::seq::IteratorRandom;
+use std::cmp::Ordering;
 use std::rc::Rc;

 /// Inverted File index with Product Quantization.
@@ -38,7 +40,7 @@ impl IndexIVFPQ {
         let centroids = {
             let mut kmeans = KMeans::new(
                 self.params.sub_centroids as usize,
-                self.params.num_iterations,
+                self.params.max_iterations,
                 self.params.metric,
             );
@@ -57,16 +59,29 @@ impl IndexIVFPQ {
     /// Finds the nearest centroids to a vector for cluster assignments.
     /// - `vector`: Full-length vector.
     /// - `k`: Number of centroids to find.
-    fn find_nearest_centroids(&self, vector: &Vector, k: usize) -> Vec<usize> {
-        let mut distances: Vec<(usize, f32)> = self
-            .centroids
-            .par_iter()
-            .enumerate()
-            .map(|(i, center)| (i, self.metric().distance(center, vector)))
-            .collect();
+    fn find_nearest_centroids(
+        &self,
+        vector: &Vector,
+        k: usize,
+    ) -> Vec<ClusterID> {
+        let mut centroids = BinaryHeap::new();
+        for (i, center) in self.centroids.iter().enumerate() {
+            let id = ClusterID(i as u16);
+            let distance = self.metric().distance(center, vector);
+
+            let centroid = NearestCentroid { id, distance };
+            centroids.push(centroid);

-        distances.sort_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap());
-        distances.into_iter().take(k).map(|(i, _)| i).collect()
+            if centroids.len() > k {
+                centroids.pop();
+            }
+        }
+
+        centroids
+            .into_sorted_vec()
+            .into_iter()
+            .map(|centroid| centroid.id)
+            .collect()
     }

     /// Finds the nearest centroid in the codebook for a subvector.
@@ -139,6 +154,13 @@ impl IndexOps for IndexIVFPQ {
         let codebook = vec![vec![]; params.sub_dimension as usize];
         let clusters = vec![vec![]; params.centroids];

+        // Validate the sampling parameter.
+        if params.sampling <= 0.0 || params.sampling > 1.0 {
+            let code = ErrorCode::RequestError;
+            let message = "Sampling must be between 0.0 and 1.0.";
+            return Err(Error::new(code, message));
+        }
+
         let index = IndexIVFPQ {
             params,
             metadata: IndexMetadata::default(),
@@ -166,9 +188,13 @@ impl VectorIndex for IndexIVFPQ {
         &mut self,
         records: HashMap<RecordID, Record>,
     ) -> Result<(), Error> {
+        let mut rng = rand::thread_rng();
+        let sample = (records.len() as f32 * self.params.sampling) as usize;
         let vectors = records
             .values()
-            .map(|record| &record.vector)
+            .choose_multiple(&mut rng, sample)
+            .par_iter()
+            .map(|&record| &record.vector)
             .collect::<Vec<_>>();

         // We use RC to avoid cloning the entire vector data as it
         // can be very large and expensive to clone.
         let vectors: Vectors = Rc::from(vectors.as_slice());
         self.create_codebook(vectors.clone());

         // Run KMeans to find the centroids for the IVF.
-        let (centroids, assignments) = {
+        let centroids = {
             let mut kmeans = KMeans::new(
                 self.params.centroids,
-                self.params.num_iterations,
+                self.params.max_iterations,
                 self.metric().to_owned(),
             );

             kmeans.fit(vectors.clone());
-            (kmeans.centroids().to_vec(), kmeans.assignments().to_vec())
+            kmeans.centroids().to_vec()
         };

         self.centroids = centroids;
-        self.clusters = {
-            // Put record IDs into their respective clusters based on the
-            // assignments from the KMeans algorithm.
-            let mut clusters = vec![vec![]; self.params.centroids];
-            let ids = records.keys().collect::<Vec<_>>();
-            for (i, &cluster) in assignments.iter().enumerate() {
-                clusters[cluster.0 as usize].push(ids[i].to_owned());
-            }
-
-            clusters
-        };
-
-        self.metadata.last_inserted = records.keys().max().copied();
         self.metadata.built = true;
-
-        // Store the quantized vectors instead of the original vectors.
-        self.data = records
-            .into_iter()
-            .map(|(id, record)| {
-                let vector = self.quantize_vector(&record.vector);
-                let data = record.data;
-                (id, RecordPQ { vector, data })
-            })
-            .collect();
-
+        self.insert(records)?;
         Ok(())
     }

@@ -231,34 +234,26 @@ impl VectorIndex for IndexIVFPQ {
             return Err(Error::new(code, message));
         }

-        let vectors = records
-            .values()
-            .map(|record| &record.vector)
-            .collect::<Vec<_>>();
+        for (id, record) in records.iter() {
+            let vector = &record.vector;
+            let cid = self.find_nearest_centroids(vector, 1)[0].to_usize();

-        let assignments = vectors
-            .par_iter()
-            .map(|vector| self.find_nearest_centroids(vector, 1)[0])
-            .collect::<Vec<_>>();
-
-        let ids: Vec<&RecordID> = records.keys().collect();
-        for (i, cluster_id) in assignments.iter().enumerate() {
             // The number of records in the cluster.
-            let count = self.clusters[*cluster_id].len() as f32;
+            let count = self.clusters[cid].len() as f32;
             let new_count = count + 1.0;

             // This updates the centroid of the cluster by taking the
             // weighted average of the existing centroid and the new
             // vector that is being inserted.
-            let centroid: Vec<f32> = self.centroids[*cluster_id]
+            let centroid: Vec<f32> = self.centroids[cid]
                 .to_vec()
                 .par_iter()
-                .zip(vectors[i].to_vec().par_iter())
+                .zip(vector.to_vec().par_iter())
                 .map(|(c, v)| ((c * count) + v) / new_count)
                 .collect();

-            self.centroids[*cluster_id] = centroid.into();
-            self.clusters[*cluster_id].push(ids[i].to_owned());
+            self.centroids[cid] = centroid.into();
+            self.clusters[cid].push(id.to_owned());
         }

         self.metadata.last_inserted = records.keys().max().copied();
@@ -298,7 +293,7 @@ impl VectorIndex for IndexIVFPQ {
         let mut results = BinaryHeap::new();

         for centroid_id in nearest_centroids {
-            let cluster = &self.clusters[centroid_id];
+            let cluster = &self.clusters[centroid_id.to_usize()];
             for &record_id in cluster {
                 let record = self.data.get(&record_id).unwrap();
                 let data = record.data.clone();
@@ -331,16 +326,18 @@ impl VectorIndex for IndexIVFPQ {
 /// Parameters for IndexIVFPQ.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct ParamsIVFPQ {
-    /// Number of centroids in the IVF.
+    /// Number of centroids or partitions in the IVF.
     pub centroids: usize,
-    /// Number of iterations to run the KMeans algorithm.
-    pub num_iterations: usize,
+    /// Maximum number of iterations to run the KMeans algorithm.
+    pub max_iterations: usize,
     /// Number of centroids in the PQ sub-space.
     pub sub_centroids: u8,
     /// Dimension of the vector after PQ encoding.
     pub sub_dimension: u8,
     /// Number of clusters to explore during search.
     pub num_probes: u8,
+    /// Fraction of the records for training the initial index.
+    pub sampling: f32,
     /// Metric used to compute the distance between vectors.
     pub metric: DistanceMetric,
 }

@@ -348,11 +345,12 @@ pub struct ParamsIVFPQ {
 impl Default for ParamsIVFPQ {
     fn default() -> Self {
         Self {
-            num_iterations: 100,
             centroids: 256,
-            sub_centroids: 32,
-            sub_dimension: 16,
+            max_iterations: 50,
+            sub_centroids: 16,
+            sub_dimension: 8,
             num_probes: 4,
+            sampling: 0.1,
             metric: DistanceMetric::Euclidean,
         }
     }
@@ -368,6 +366,32 @@ impl IndexParams for ParamsIVFPQ {
     }
 }

+#[derive(Debug)]
+struct NearestCentroid {
+    id: ClusterID,
+    distance: f32,
+}
+
+impl Eq for NearestCentroid {}
+
+impl PartialEq for NearestCentroid {
+    fn eq(&self, other: &Self) -> bool {
+        self.id == other.id
+    }
+}
+
+impl Ord for NearestCentroid {
+    fn cmp(&self, other: &Self) -> Ordering {
+        self.distance.partial_cmp(&other.distance).unwrap_or(Ordering::Equal)
+    }
+}
+
+impl PartialOrd for NearestCentroid {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -387,9 +411,10 @@ mod tests {
         };

         let params = ParamsIVFPQ {
-            num_iterations: 10,
+            max_iterations: 10,
             sub_centroids: 8,
             sub_dimension: 2,
+            sampling: 1.0,
             ..Default::default()
         };
@@ -405,7 +430,8 @@ mod tests {
     fn test_ivfpq_index() {
         let params = ParamsIVFPQ {
             centroids: 5,
-            num_iterations: 20,
+            max_iterations: 20,
+            sampling: 1.0,
             ..Default::default()
         };
diff --git a/src/indices/mod.rs b/src/indices/mod.rs
index 01e120f4..80ba6b10 100644
--- a/src/indices/mod.rs
+++ b/src/indices/mod.rs
@@ -8,6 +8,7 @@
 use serde::de::DeserializeOwned;
 use serde::{Deserialize, Serialize};
 use sqlx::any::AnyRow;
 use std::any::Any;
+use std::cmp::Ordering;
 use std::collections::{BinaryHeap, HashMap};
 use std::fmt::Debug;
 use std::path::Path;
@@ -347,14 +348,14 @@ impl PartialEq for SearchResult {
 impl Eq for SearchResult {}

 impl PartialOrd for SearchResult {
-    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         Some(self.cmp(other))
     }
 }

 impl Ord for SearchResult {
-    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
-        self.distance.partial_cmp(&other.distance).unwrap()
+    fn cmp(&self, other: &Self) -> Ordering {
+        self.distance.partial_cmp(&other.distance).unwrap_or(Ordering::Equal)
     }
 }

@@ -432,6 +433,11 @@ pub trait VectorIndex: Debug + Send + Sync {
     /// Returns the number of records in the index.
     fn len(&self) -> usize;

+    /// Checks if the index has no records.
+    fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
     /// Returns the index as Any type for dynamic casting.
     ///
     /// This method allows the index trait object to be downcast to a
@@ -507,16 +513,23 @@ mod index_tests {
         }

         index.build(records).unwrap();
+        assert_eq!(index.len(), 100);
     }

     pub fn test_basic_search(index: &impl VectorIndex) {
         let query = Vector::from(vec![0.0; 128]);
         let k = 10;
-        let results = index.search(query, k, Filters::NONE).unwrap();
+        let results: Vec<RecordID> = index
+            .search(query, k, Filters::NONE)
+            .unwrap()
+            .iter()
+            .map(|result| result.id)
+            .collect();

         assert_eq!(results.len(), k);
-        assert_eq!(results[0].id, RecordID(0));
-        assert_eq!(results[9].id, RecordID(9));
+        for i in 0..k {
+            assert!(results.contains(&RecordID(i as u32)));
+        }
     }

     pub fn test_advanced_search(index: &impl VectorIndex) {
diff --git a/src/types/distance.rs b/src/types/distance.rs
index 32fe8725..b59f2053 100644
--- a/src/types/distance.rs
+++ b/src/types/distance.rs
@@ -1,3 +1,5 @@
+#![allow(unreachable_code)]
+
 use crate::types::record::Vector;
 use serde::{Deserialize, Serialize};

diff --git a/src/utils/kmeans.rs b/src/utils/kmeans.rs
index 5016c496..9ae98469 100644
--- a/src/utils/kmeans.rs
+++ b/src/utils/kmeans.rs
@@ -15,6 +15,12 @@ pub type Vectors<'v> = Rc<[&'v Vector]>;
 #[derive(PartialEq, Eq, PartialOrd, Ord)]
 pub struct ClusterID(pub u16);

+impl ClusterID {
+    pub fn to_usize(self) -> usize {
+        self.0 as usize
+    }
+}
+
 /// KMeans clustering model.
 ///
 /// KMeans is a simple unsupervised learning algorithm that groups similar
@@ -58,9 +64,9 @@ impl KMeans {
         let mut repeat_count = 0;

         for _ in 0..self.num_iterations {
-            // If the centroids don't change for 5 iterations, we assume
+            // If the centroids don't change for n iterations, we assume
             // that the algorithm has converged and stop the iterations.
-            if repeat_count > 5 {
+            if repeat_count > 3 {
                 break;
             }
@@ -157,6 +163,7 @@ impl KMeans {
     /// - v1 and v2 are assigned to cluster 0.
     /// - v3 is assigned to cluster 1.
     /// - vn is assigned to cluster m.
+    #[allow(dead_code)]
     pub fn assignments(&self) -> &[ClusterID] {
         &self.assignment
     }
diff --git a/tests/common/mod.rs b/tests/common/mod.rs
new file mode 100644
index 00000000..30e3dc8f
--- /dev/null
+++ b/tests/common/mod.rs
@@ -0,0 +1,203 @@
+use byteorder::{LittleEndian, ReadBytesExt};
+use curl::easy::Easy;
+use flate2::read::GzDecoder;
+use sqlx::any::install_default_drivers;
+use sqlx::{AnyConnection, Connection, Executor, Row};
+use std::env;
+use std::error::Error;
+use std::fs::{self, OpenOptions};
+use std::io::{BufReader, BufWriter, Seek, SeekFrom, Write};
+use std::path::PathBuf;
+use tar::Archive;
+
+/// Type of benchmark dataset to use.
+/// - `SIFTSMALL`: SIFT small dataset (10k vectors of 128D).
+/// - `SIFT`: SIFT dataset (1000k vectors of 128D).
+/// - `GIST`: GIST dataset (1M vectors of 960D).
+#[allow(dead_code)]
+#[derive(Debug, Clone, Copy, Default)]
+pub enum Dataset {
+    #[default]
+    SIFTSMALL,
+    SIFT,
+    GIST,
+}
+
+impl Dataset {
+    /// Returns the name of the dataset in lowercase.
+    pub fn name(&self) -> &str {
+        match self {
+            Dataset::SIFTSMALL => "siftsmall",
+            Dataset::SIFT => "sift",
+            Dataset::GIST => "gist",
+        }
+    }
+
+    /// Returns the number of vectors in the dataset.
+    pub fn size(&self) -> usize {
+        match self {
+            Dataset::SIFTSMALL => 10_000,
+            Dataset::SIFT => 1_000_000,
+            Dataset::GIST => 1_000_000,
+        }
+    }
+
+    /// Returns OasysDB SQLite database URL for testing.
+    pub fn database_url(&self) -> String {
+        let path = self.tmp_dir().join("sqlite.db");
+        format!("sqlite://{}?mode=rwc", path.display())
+    }
+
+    /// Populates the test SQL database with the benchmark dataset.
+    pub async fn populate_database(&self) -> Result<(), Box<dyn Error>> {
+        install_default_drivers();
+        self.setup().await?;
+
+        let db_url = self.database_url();
+        let mut conn = AnyConnection::connect(&db_url).await?;
+
+        let table_name = self.name();
+        let tables = {
+            let query = "SELECT name FROM sqlite_master WHERE type = 'table'";
+            conn.fetch_all(query).await?
+        };
+
+        // If the dataset table already exists, return early since the next
+        // operation is computationally expensive and not needed.
+        if tables.iter().any(|row| row.get::<&str, usize>(0) == table_name) {
+            return Ok(());
+        }
+
+        let create_table = format!(
+            "CREATE TABLE IF NOT EXISTS {table_name} (
+            id INTEGER PRIMARY KEY,
+            vector JSON NOT NULL
+            )"
+        );
+
+        conn.execute(create_table.as_ref()).await?;
+
+        let vectors = self.read_vectors()?;
+        let mut insert_vector = format!(
+            "INSERT INTO {table_name} (vector)
+            VALUES"
+        );
+
+        for vector in vectors.iter() {
+            let value = serde_json::to_string(vector)?;
+            insert_vector.push_str(&format!("\n({value:?}),"));
+        }
+
+        insert_vector = insert_vector.trim_end_matches(',').to_string();
+        conn.execute(insert_vector.as_ref()).await?;
+
+        // Verify that the vectors were inserted correctly.
+        let count = {
+            let query = format!("SELECT COUNT(*) FROM {table_name}");
+            conn.fetch_one(query.as_ref()).await?.get::<i64, usize>(0)
+        };
+
+        assert_eq!(count, self.size() as i64);
+        Ok(())
+    }
+
+    /// Downloads and extracts the dataset to a directory.
+    async fn setup(&self) -> Result<(), Box<dyn Error>> {
+        if !self.compressed_file().try_exists()? {
+            self.download().await?;
+        }
+
+        if !self.dataset_file().try_exists()? {
+            self.extract()?;
+        }
+
+        Ok(())
+    }
+
+    /// Downloads the benchmark dataset from the server.
+    async fn download(&self) -> Result<(), Box<dyn Error>> {
+        let file = OpenOptions::new()
+            .write(true)
+            .create(true)
+            .truncate(true)
+            .open(self.compressed_file())?;
+
+        let mut easy = Easy::new();
+        easy.url(&self.download_url())?;
+
+        let mut writer = BufWriter::new(file);
+        easy.write_function(move |data| {
+            writer.write_all(data).unwrap();
+            Ok(data.len())
+        })?;
+
+        easy.perform()?;
+        Ok(())
+    }
+
+    /// Extracts the dataset from the compressed file.
+    fn extract(&self) -> Result<(), Box<dyn Error>> {
+        let path = self.compressed_file();
+        let file = OpenOptions::new().read(true).open(path)?;
+        let mut archive = Archive::new(GzDecoder::new(file));
+        archive.unpack(self.tmp_dir())?;
+        Ok(())
+    }
+
+    /// Reads the vectors from the dataset file.
+    fn read_vectors(&self) -> Result<Vec<Vec<f32>>, Box<dyn Error>> {
+        let file = OpenOptions::new().read(true).open(self.dataset_file())?;
+        let mut reader = BufReader::new(file);
+
+        let dimension = reader.read_i32::<LittleEndian>()? as usize;
+        let vector_size = 4 + dimension * 4;
+
+        let n = reader.seek(SeekFrom::End(0))? as usize / vector_size;
+        reader.seek(SeekFrom::Start(((0) * vector_size) as u64))?;
+
+        let mut vectors = vec![vec![0f32; n]; dimension];
+        for i in 0..n {
+            for j in 0..dimension {
+                vectors[j][i] = reader.read_f32::<LittleEndian>()?;
+            }
+        }
+
+        // Transpose the vector.
+        let rows = vectors.len();
+        let cols = vectors[0].len();
+        let vectors = (0..cols)
+            .map(|col| (0..rows).map(|row| vectors[row][col]).collect())
+            .collect();
+
+        Ok(vectors)
+    }
+
+    /// Returns the URL to download the dataset.
+    fn download_url(&self) -> String {
+        let base_url = "ftp://ftp.irisa.fr/local/texmex/corpus/";
+        let file = format!("{}.tar.gz", self.name());
+        format!("{base_url}/{file}")
+    }
+
+    /// Returns the path to the compressed file.
+    fn compressed_file(&self) -> PathBuf {
+        self.tmp_dir().join(format!("{}.tar.gz", self.name()))
+    }
+
+    /// Returns the path to the dataset file.
+    fn dataset_file(&self) -> PathBuf {
+        self.tmp_dir()
+            .join(self.name())
+            .join(format!("{}_base.fvecs", self.name()))
+    }
+
+    /// Returns the temporary directory path for testing OasysDB.
+    fn tmp_dir(&self) -> PathBuf {
+        let dir = env::temp_dir().join("oasysdb");
+        if !dir.exists() {
+            fs::create_dir_all(&dir).unwrap();
+        }
+
+        dir
+    }
+}
diff --git a/tests/test_index.rs b/tests/test_index.rs
new file mode 100644
index 00000000..8f202311
--- /dev/null
+++ b/tests/test_index.rs
@@ -0,0 +1,55 @@
+use common::Dataset;
+use futures::executor;
+use oasysdb::prelude::*;
+use std::error::Error;
+
+mod common;
+
+#[test]
+fn test_recall_ivfpq() -> Result<(), Box<dyn Error>> {
+    let dataset = Dataset::SIFTSMALL;
+    let db_url = dataset.database_url();
+    let config = SourceConfig::new(dataset.name(), "id", "vector");
+
+    executor::block_on(dataset.populate_database())?;
+
+    let db = Database::open("odb_itest", Some(db_url))?;
+
+    // Create the IVFPQ index.
+    if db.get_index("ivfpq").is_none() {
+        let params = ParamsIVFPQ::default();
+        let algorithm = IndexAlgorithm::IVFPQ(params);
+        db.create_index("ivfpq", algorithm, config.clone())?;
+    }
+
+    // Create the Flat index.
+    if db.get_index("flat").is_none() {
+        let params = ParamsFlat::default();
+        let algorithm = IndexAlgorithm::Flat(params);
+        db.create_index("flat", algorithm, config)?;
+    }
+
+    // Perform a search query.
+    let k = 10;
+    let iteration = 10;
+    let query = vec![0.0; 128];
+
+    let correct_ids: Vec<RecordID> = db
+        .search_index("flat", query.clone(), k, "")?
+        .iter()
+        .map(|result| result.id)
+        .collect();
+
+    let mut correct_count = 0;
+    for _ in 0..iteration {
+        db.search_index("ivfpq", query.clone(), k, "")?.iter().for_each(|r| {
+            if correct_ids.contains(&r.id) {
+                correct_count += 1;
+            }
+        });
+    }
+
+    let recall = correct_count as f32 / (k * iteration) as f32;
+    assert!(recall > 0.9);
+    Ok(())
+}

From 58acac507a96f49f4b4b0f9ca5cdf3554699b5fc Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Thu, 25 Jul 2024 13:59:57 -0500
Subject: [PATCH 63/88] feat: improve ivfpq testing

---
 src/indices/idx_ivfpq.rs |  2 +-
 src/indices/mod.rs       | 14 +++++++++-----
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/indices/idx_ivfpq.rs b/src/indices/idx_ivfpq.rs
index 3bc993de..474bb9cc 100644
--- a/src/indices/idx_ivfpq.rs
+++ b/src/indices/idx_ivfpq.rs
@@ -431,7 +431,7 @@ mod tests {
         let params = ParamsIVFPQ {
             centroids: 5,
             max_iterations: 20,
-            sampling: 1.0,
+            sampling: 0.5,
             ..Default::default()
         };
diff --git a/src/indices/mod.rs b/src/indices/mod.rs
index 80ba6b10..ec668a2d 100644
--- a/src/indices/mod.rs
+++ b/src/indices/mod.rs
@@ -527,18 +527,22 @@ mod index_tests {
             .collect();

         assert_eq!(results.len(), k);
-        for i in 0..k {
-            assert!(results.contains(&RecordID(i as u32)));
-        }
+        assert!(results.contains(&RecordID(0)));
     }

     pub fn test_advanced_search(index: &impl VectorIndex) {
         let query = Vector::from(vec![0.0; 128]);
         let k = 10;
         let filters = Filters::from("number > 1010");
-        let results = index.search(query, k, filters).unwrap();
+        let results: Vec<RecordID> = index
+            .search(query, k, filters)
+            .unwrap()
+            .iter()
+            .map(|result| result.id)
+            .collect();

         assert_eq!(results.len(), k);
-        assert_eq!(results[0].id, RecordID(11));
+        assert!(results.contains(&RecordID(11)));
+        assert!(!results.contains(&RecordID(0)));
     }
 }

From 32a79970402e0d9789dea21d94372125c54c51c7 Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Thu, 25 Jul 2024 17:22:53 -0500
Subject: [PATCH 64/88] fix: improve accuracy

---
 src/indices/idx_ivfpq.rs | 100 +++++++++++++++------------------
 src/types/distance.rs    |   9 ----
 src/utils/kmeans.rs      |   8 +++-
 3 files changed, 46 insertions(+), 71 deletions(-)

diff --git a/src/indices/idx_ivfpq.rs b/src/indices/idx_ivfpq.rs
index 474bb9cc..5d603ac6 100644
--- a/src/indices/idx_ivfpq.rs
+++ b/src/indices/idx_ivfpq.rs
@@ -1,7 +1,6 @@
 use super::*;
 use crate::utils::kmeans::{ClusterID, KMeans, Vectors};
 use rand::seq::IteratorRandom;
-use std::cmp::Ordering;
 use std::rc::Rc;

 /// Inverted File index with Product Quantization.
@@ -56,32 +55,14 @@ impl IndexIVFPQ {
         }
     }

-    /// Finds the nearest centroids to a vector for cluster assignments.
-    /// - `vector`: Full-length vector.
-    /// - `k`: Number of centroids to find.
-    fn find_nearest_centroids(
-        &self,
-        vector: &Vector,
-        k: usize,
-    ) -> Vec<ClusterID> {
-        let mut centroids = BinaryHeap::new();
-        for (i, center) in self.centroids.iter().enumerate() {
-            let id = ClusterID(i as u16);
-            let distance = self.metric().distance(center, vector);
-
-            let centroid = NearestCentroid { id, distance };
-            centroids.push(centroid);
-
-            if centroids.len() > k {
-                centroids.pop();
-            }
-        }
-
-        centroids
-            .into_sorted_vec()
-            .into_iter()
-            .map(|centroid| centroid.id)
-            .collect()
+    fn find_nearest_centroid(&self, vector: &Vector) -> ClusterID {
+        self.centroids
+            .par_iter()
+            .enumerate()
+            .map(|(i, centroid)| (i, self.metric().distance(vector, centroid)))
+            .min_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
+            .map(|(i, _)| ClusterID(i as u16))
+            .unwrap_or_default()
     }

     /// Finds the nearest centroid in the codebook for a subvector.
@@ -103,11 +84,11 @@ impl IndexIVFPQ {
     ) -> usize {
         self.codebook[part_index]
             .par_iter()
-            .map(|centroid| self.metric().distance(centroid, subvector))
             .enumerate()
+            .map(|(i, code)| (i, self.metric().distance(subvector, code)))
             .min_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
+            .map(|(i, _)| i)
             .unwrap_or_default()
-            .0
     }

     /// Quantizes a full-length vector into a PQ vector.
@@ -236,10 +217,10 @@ impl VectorIndex for IndexIVFPQ {
         for (id, record) in records.iter() {
             let vector = &record.vector;
-            let cid = self.find_nearest_centroids(vector, 1)[0].to_usize();
+            let cid = self.find_nearest_centroid(vector).to_usize();

             // The number of records in the cluster.
-            let count = self.clusters[cid].len() as f32;
+            let count = self.clusters[cid].len().max(1) as f32;
             let new_count = count + 1.0;
@@ -267,14 +248,37 @@ impl VectorIndex for IndexIVFPQ {
         k: usize,
         filters: Filters,
     ) -> Result<Vec<SearchResult>, Error> {
-        let nearest_centroids = {
-            let nprobes = self.params.num_probes as usize;
-            self.find_nearest_centroids(&query, nprobes)
-        };
+        let mut centroid_distances: Vec<(usize, f32)> = self
+            .centroids
+            .par_iter()
+            .enumerate()
+            .map(|(i, centroid)| (i, self.metric().distance(centroid, &query)))
+            .collect();
+        centroid_distances
+            .par_sort_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap());
+
+        let nearest_centroids: Vec<ClusterID> = centroid_distances
+            .iter()
+            .take(self.params.centroids)
+            .map(|(i, _)| (*i).into())
+            .collect();
+
+        let mut probes = 0;
         let mut results = BinaryHeap::new();
+
         for centroid_id in nearest_centroids {
+            if probes >= self.params.num_probes {
+                break;
+            }
+
             let cluster = &self.clusters[centroid_id.to_usize()];
+            if cluster.is_empty() {
+                continue;
+            }
+
+            // Empty clusters won't count towards probes.
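+            // (Editor's illustrative note: with num_probes = 4 and the two
+            // nearest partitions empty, the loop keeps walking the sorted
+            // centroid list until four non-empty clusters have been read.)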
+            probes += 1;
             for &record_id in cluster {
                 let record = self.data.get(&record_id).unwrap();
                 let data = record.data.clone();
@@ -366,32 +370,6 @@ impl IndexParams for ParamsIVFPQ {
     }
 }

-#[derive(Debug)]
-struct NearestCentroid {
-    id: ClusterID,
-    distance: f32,
-}
-
-impl Eq for NearestCentroid {}
-
-impl PartialEq for NearestCentroid {
-    fn eq(&self, other: &Self) -> bool {
-        self.id == other.id
-    }
-}
-
-impl Ord for NearestCentroid {
-    fn cmp(&self, other: &Self) -> Ordering {
-        self.distance.partial_cmp(&other.distance).unwrap_or(Ordering::Equal)
-    }
-}
-
-impl PartialOrd for NearestCentroid {
-    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-        Some(self.cmp(other))
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/src/types/distance.rs b/src/types/distance.rs
index b59f2053..4ed6fce3 100644
--- a/src/types/distance.rs
+++ b/src/types/distance.rs
@@ -32,15 +32,6 @@ impl DistanceMetric {
             DistanceMetric::Cosine => Self::cosine(a, b),
         };

-        // Distances of 0 is the best distance. So, we return a large
-        // value for invalid values to make sure it is not selected.
-        if dist.is_none()
-            || dist.unwrap().is_nan()
-            || dist.unwrap().is_infinite()
-        {
-            return f32::MAX;
-        }
-
         dist.unwrap() as f32
     }
diff --git a/src/utils/kmeans.rs b/src/utils/kmeans.rs
index 9ae98469..19238bab 100644
--- a/src/utils/kmeans.rs
+++ b/src/utils/kmeans.rs
@@ -21,6 +21,12 @@ impl ClusterID {
     }
 }

+impl From<usize> for ClusterID {
+    fn from(value: usize) -> Self {
+        Self(value as u16)
+    }
+}
+
 /// KMeans clustering model.
 ///
 /// KMeans is a simple unsupervised learning algorithm that groups similar
@@ -141,8 +147,8 @@ impl KMeans {
     pub fn find_nearest_centroid(&self, vector: &Vector) -> ClusterID {
         self.centroids
             .par_iter()
-            .map(|centroid| self.metric.distance(vector, centroid))
             .enumerate()
+            .map(|(i, centroid)| (i, self.metric.distance(vector, centroid)))
             .min_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
             .map(|(i, _)| ClusterID(i as u16))
             .unwrap_or_default()
     }

From c1c0041810a915976a07f0eed1f576b4e2993746 Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Thu, 25 Jul 2024 18:27:47 -0500
Subject: [PATCH 65/88] fix: change ivfpq testing params

---
 tests/test_index.rs | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tests/test_index.rs b/tests/test_index.rs
index 8f202311..c33be350 100644
--- a/tests/test_index.rs
+++ b/tests/test_index.rs
@@ -17,7 +17,13 @@ fn test_recall_ivfpq() -> Result<(), Box<dyn Error>> {
     // Create the IVFPQ index.
     if db.get_index("ivfpq").is_none() {
-        let params = ParamsIVFPQ::default();
+        let params = ParamsIVFPQ {
+            sub_centroids: 8,
+            sub_dimension: 16,
+            sampling: 0.2,
+            ..Default::default()
+        };
+
         let algorithm = IndexAlgorithm::IVFPQ(params);
         db.create_index("ivfpq", algorithm, config.clone())?;
     }

From c0de27d006ab6726271fb14a5ba6a95d3afddf47 Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Fri, 26 Jul 2024 00:23:09 -0500
Subject: [PATCH 66/88] feat: improve integration testing

---
 tests/common/mod.rs | 24 +++++++++++++-----
 tests/test_index.rs | 59 ++++++++++++++++++++++++---------------------
 2 files changed, 50 insertions(+), 33 deletions(-)

diff --git a/tests/common/mod.rs b/tests/common/mod.rs
index 30e3dc8f..f5014e16 100644
--- a/tests/common/mod.rs
+++ b/tests/common/mod.rs
@@ -7,7 +7,7 @@
 use std::env;
 use std::error::Error;
 use std::fs::{self, OpenOptions};
 use std::io::{BufReader, BufWriter, Seek, SeekFrom, Write};
-use std::path::PathBuf;
+use std::path::{Path, PathBuf};
 use tar::Archive;

 /// Type of benchmark dataset to use.
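Editor's note: the integration test changed around these hunks measures recall by treating the exhaustive Flat index as ground truth for the IVFPQ results. A self-contained sketch of that recall@k computation is below; the names are illustrative and not part of the OasysDB API.

```rust
/// Recall@k: the fraction of the exact top-k (from a brute-force Flat
/// search) that the approximate IVFPQ top-k also contains.
fn recall_at_k(exact: &[u32], approx: &[u32]) -> f32 {
    let hits = approx.iter().filter(|&id| exact.contains(id)).count();
    hits as f32 / exact.len() as f32
}

fn main() {
    let exact = [1u32, 2, 3, 4, 5, 6, 7, 8, 9, 10];
    let approx = [1u32, 2, 3, 4, 5, 6, 7, 8, 42, 99];
    assert_eq!(recall_at_k(&exact, &approx), 0.8); // 8 of 10 overlap
}
```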
@@ -77,7 +77,8 @@ impl Dataset {

         conn.execute(create_table.as_ref()).await?;

-        let vectors = self.read_vectors()?;
+        let dataset = self.base_dataset_file();
+        let vectors = self.read_vectors(dataset)?;
         let mut insert_vector = format!(
             "INSERT INTO {table_name} (vector)
             VALUES"
         );
@@ -107,7 +108,7 @@ impl Dataset {
             self.download().await?;
         }

-        if !self.dataset_file().try_exists()? {
+        if !self.base_dataset_file().try_exists()? {
             self.extract()?;
         }

@@ -145,8 +146,12 @@ impl Dataset {
     }

     /// Reads the vectors from the dataset file.
-    fn read_vectors(&self) -> Result<Vec<Vec<f32>>, Box<dyn Error>> {
-        let file = OpenOptions::new().read(true).open(self.dataset_file())?;
+    /// - `path`: Path to the fvecs file.
+    pub fn read_vectors(
+        &self,
+        path: impl AsRef<Path>,
+    ) -> Result<Vec<Vec<f32>>, Box<dyn Error>> {
+        let file = OpenOptions::new().read(true).open(path)?;
         let mut reader = BufReader::new(file);

         let dimension = reader.read_i32::<LittleEndian>()? as usize;
         let vector_size = 4 + dimension * 4;
@@ -185,12 +190,19 @@ impl Dataset {
     }

     /// Returns the path to the dataset file.
-    fn dataset_file(&self) -> PathBuf {
+    pub fn base_dataset_file(&self) -> PathBuf {
         self.tmp_dir()
             .join(self.name())
             .join(format!("{}_base.fvecs", self.name()))
     }

+    /// Returns the path to the query file.
+    pub fn query_dataset_file(&self) -> PathBuf {
+        self.tmp_dir()
+            .join(self.name())
+            .join(format!("{}_query.fvecs", self.name()))
+    }
+
     /// Returns the temporary directory path for testing OasysDB.
     fn tmp_dir(&self) -> PathBuf {
         let dir = env::temp_dir().join("oasysdb");
         if !dir.exists() {
             fs::create_dir_all(&dir).unwrap();
         }

         dir
     }
 }
diff --git a/tests/test_index.rs b/tests/test_index.rs
index c33be350..5a21aa79 100644
--- a/tests/test_index.rs
+++ b/tests/test_index.rs
@@ -16,39 +16,41 @@ fn test_recall_ivfpq() -> Result<(), Box<dyn Error>> {
     let db = Database::open("odb_itest", Some(db_url))?;

     // Create the IVFPQ index.
-    if db.get_index("ivfpq").is_none() {
-        let params = ParamsIVFPQ {
-            sub_centroids: 8,
-            sub_dimension: 16,
-            sampling: 0.2,
-            ..Default::default()
-        };
-
-        let algorithm = IndexAlgorithm::IVFPQ(params);
-        db.create_index("ivfpq", algorithm, config.clone())?;
-    }
+    let params = ParamsIVFPQ {
+        sub_centroids: 8,
+        sub_dimension: 16,
+        sampling: 0.1,
+        ..Default::default()
+    };
+
+    let algorithm = IndexAlgorithm::IVFPQ(params);
+    db.create_index("ivfpq", algorithm, config.clone())?;

     // Create the Flat index.
-    if db.get_index("flat").is_none() {
-        let params = ParamsFlat::default();
-        let algorithm = IndexAlgorithm::Flat(params);
-        db.create_index("flat", algorithm, config)?;
-    }
+    let params = ParamsFlat::default();
+    let algorithm = IndexAlgorithm::Flat(params);
+    db.create_index("flat", algorithm, config)?;
+
+    // Perform search queries
+    let queries = {
+        let path = dataset.query_dataset_file();
+        dataset.read_vectors(path)?
+    };

     let k = 10;
     let iteration = 10;
-    let query = vec![0.0; 128];
+    let mut correct_count = 0;

-    let correct_ids: Vec<RecordID> = db
-        .search_index("flat", query.clone(), k, "")?
+    for query in queries.into_iter().take(iteration) {
+        let vector = Vector::from(query);
+
+        let correct_ids: Vec<RecordID> = db
+            .search_index("flat", vector.clone(), k, "")?
+            .iter()
+            .map(|result| result.id)
+            .collect();
+
+        db.search_index("ivfpq", vector, k, "")?.iter().for_each(|r| {
             if correct_ids.contains(&r.id) {
                 correct_count += 1;
             }
         });
     }

     let recall = correct_count as f32 / (k * iteration) as f32;
-    assert!(recall > 0.9);
+    assert!(recall > 0.0);
+
+    // println!("Recall@{k}: {recall}");
+    // assert!(false);
     Ok(())
 }

From 55f0f14ed83f5d885d7dbb93c65939e1f1269240 Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Sat, 27 Jul 2024 17:15:55 -0500
Subject: [PATCH 67/88] docs: add oasysdb banner

---
 readme.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/readme.md b/readme.md
index e69de29b..aaaebbca 100644
--- a/readme.md
+++ b/readme.md
@@ -0,0 +1,5 @@
+![OasysDB Use Case](https://i.postimg.cc/NjZ52kxN/banner.png)
+
+[![GitHub Stars](https://img.shields.io/github/stars/oasysai/oasysdb?style=for-the-badge&logo=github&logoColor=%23000000&labelColor=%23fcd34d&color=%236b7280)](https://github.com/oasysai/oasysdb)
+[![Crates.io](https://img.shields.io/crates/d/oasysdb?style=for-the-badge&logo=rust&logoColor=%23000&label=crates.io&labelColor=%23fdba74&color=%236b7280)](https://crates.io/crates/oasysdb)
+[![Discord](https://img.shields.io/badge/chat-%236b7280?style=for-the-badge&logo=discord&logoColor=%23ffffff&label=discord&labelColor=%237289da)](https://discord.gg/bDhQrkqNP4)

From 1aaa40ba52ae759c564544df5c12013eabd08058 Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Sat, 27 Jul 2024 17:52:07 -0500
Subject: [PATCH 68/88] fix: improve banner height

---
 readme.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/readme.md b/readme.md
index aaaebbca..124c6016 100644
--- a/readme.md
+++ b/readme.md
@@ -1,5 +1,5 @@
-![OasysDB Use Case](https://i.postimg.cc/NjZ52kxN/banner.png)
+![OasysDB Use Case](https://i.postimg.cc/bYQFv1bp/banner.png)

 [![GitHub Stars](https://img.shields.io/github/stars/oasysai/oasysdb?style=for-the-badge&logo=github&logoColor=%23000000&labelColor=%23fcd34d&color=%236b7280)](https://github.com/oasysai/oasysdb)
-[![Crates.io](https://img.shields.io/crates/d/oasysdb?style=for-the-badge&logo=rust&logoColor=%23000&label=crates.io&labelColor=%23fdba74&color=%236b7280)](https://crates.io/crates/oasysdb)
 [![Discord](https://img.shields.io/badge/chat-%236b7280?style=for-the-badge&logo=discord&logoColor=%23ffffff&label=discord&labelColor=%237289da)](https://discord.gg/bDhQrkqNP4)
+[![Crates.io](https://img.shields.io/crates/d/oasysdb?style=for-the-badge&logo=rust&logoColor=%23000&label=crates.io&labelColor=%23fdba74&color=%236b7280)](https://crates.io/crates/oasysdb)

From e3f3720df1d0f4526e3c839bba6b1f790b7c2da2 Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Sat, 27 Jul 2024 18:17:27 -0500
Subject: [PATCH 69/88] refactor: move recall test to examples from
 integration tests

---
 {tests => examples}/common/mod.rs |  11 ++--
 .../measure_recall.rs             |  54 +++++++++++--------
 src/db/database.rs                |   2 +-
 3 files changed, 39 insertions(+), 28 deletions(-)
 rename {tests => examples}/common/mod.rs (95%)
 rename tests/test_index.rs => examples/measure_recall.rs (73%)

diff --git a/tests/common/mod.rs b/examples/common/mod.rs
similarity index 95%
rename from tests/common/mod.rs
rename to examples/common/mod.rs
index f5014e16..407249ab 100644
--- a/tests/common/mod.rs
+++ b/examples/common/mod.rs
@@ -10,10 +10,10 @@
 use std::io::{BufReader, BufWriter, Seek, SeekFrom, Write};
 use std::path::{Path, PathBuf};
 use tar::Archive;

-/// Type of benchmark dataset to use.
-/// - `SIFTSMALL`: SIFT small dataset (10k vectors of 128D).
-/// - `SIFT`: SIFT dataset (1000k vectors of 128D).
-/// - `GIST`: GIST dataset (1M vectors of 960D).
+/// Type of ANNS benchmark dataset to download and setup.
+/// - SIFTSMALL: 10k vectors of 128D.
+/// - SIFT: 1M vectors of 128D.
+/// - GIST: 1M vectors of 960D.
 #[allow(dead_code)]
 #[derive(Debug, Clone, Copy, Default)]
 pub enum Dataset {
@@ -33,7 +33,7 @@ impl Dataset {
         }
     }

-    /// Returns the number of vectors in the dataset.
+    /// Returns the number of vectors in the base dataset.
     pub fn size(&self) -> usize {
         match self {
             Dataset::SIFTSMALL => 10_000,
@@ -68,6 +68,7 @@ impl Dataset {
             return Ok(());
         }

+        // Use the dataset name as the table name.
         let create_table = format!(
             "CREATE TABLE IF NOT EXISTS {table_name} (
             id INTEGER PRIMARY KEY,
diff --git a/tests/test_index.rs b/examples/measure_recall.rs
similarity index 73%
rename from tests/test_index.rs
rename to examples/measure_recall.rs
index 5a21aa79..f0b013dd 100644
--- a/tests/test_index.rs
+++ b/examples/measure_recall.rs
@@ -5,31 +5,16 @@
 use std::error::Error;

 mod common;

-#[test]
-fn test_recall_ivfpq() -> Result<(), Box<dyn Error>> {
+fn main() -> Result<(), Box<dyn Error>> {
     let dataset = Dataset::SIFTSMALL;
     let db_url = dataset.database_url();
     let config = SourceConfig::new(dataset.name(), "id", "vector");

     executor::block_on(dataset.populate_database())?;

-    let db = Database::open("odb_itest", Some(db_url))?;
-
-    // Create the IVFPQ index.
-    let params = ParamsIVFPQ {
-        sub_centroids: 8,
-        sub_dimension: 16,
-        sampling: 0.1,
-        ..Default::default()
-    };
-
-    let algorithm = IndexAlgorithm::IVFPQ(params);
-    db.create_index("ivfpq", algorithm, config.clone())?;
-
-    // Create the Flat index.
-    let params = ParamsFlat::default();
-    let algorithm = IndexAlgorithm::Flat(params);
-    db.create_index("flat", algorithm, config)?;
+    let db = Database::open("odb_example", Some(db_url))?;
+    create_index_flat(&db, &config)?;
+    create_index_ivfpq(&db, &config)?;

     // Perform search queries
     let queries = {
@@ -58,9 +43,34 @@ fn main() -> Result<(), Box<dyn Error>> {
     }

     let recall = correct_count as f32 / (k * iteration) as f32;
-    assert!(recall > 0.0);
+    println!("Recall@{k}: {recall}");
+
+    Ok(())
+}
+
+fn create_index_ivfpq(
+    db: &Database,
+    config: &SourceConfig,
+) -> Result<(), Box<dyn Error>> {
+    let params = ParamsIVFPQ {
+        sub_centroids: 8,
+        sub_dimension: 16,
+        sampling: 0.1,
+        ..Default::default()
+    };
+
+    let algorithm = IndexAlgorithm::IVFPQ(params);
+    db.create_index("ivfpq", algorithm, config.clone())?;
+    Ok(())
+}
+
+fn create_index_flat(
+    db: &Database,
+    config: &SourceConfig,
+) -> Result<(), Box<dyn Error>> {
+    let params = ParamsFlat::default();
+    let algorithm = IndexAlgorithm::Flat(params);
+    db.create_index("flat", algorithm, config.clone())?;
-
-    // println!("Recall@{k}: {recall}");
-    // assert!(false);
     Ok(())
 }
diff --git a/src/db/database.rs b/src/db/database.rs
index 31b48977..15fb5718 100644
--- a/src/db/database.rs
+++ b/src/db/database.rs
@@ -596,7 +596,7 @@ mod tests {
     }

     fn create_test_database() -> Result<Database, Error> {
-        let path = PathBuf::from("odb_utest");
+        let path = PathBuf::from("odb_test");
         if path.try_exists()? {
            fs::remove_dir_all(&path)?;
        }

From 348867021867893796c9ccbf4804ef7e1af7dfc0 Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Sat, 27 Jul 2024 18:21:02 -0500
Subject: [PATCH 70/88] feat: improve ci/cd steps

---
 .github/workflows/database-ci.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/database-ci.yml b/.github/workflows/database-ci.yml
index 70125487..b992a1ca 100644
--- a/.github/workflows/database-ci.yml
+++ b/.github/workflows/database-ci.yml
@@ -33,6 +33,7 @@ jobs:
   clippy-lint:
     name: Lint code with Clippy
+    needs: rustfmt-format
     runs-on: ubuntu-latest
     steps:
       - name: Checkout the code

From 276f3d59eeb563a2daa11f9c968e3ae8e0ee5f9d Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Sat, 27 Jul 2024 20:37:28 -0500
Subject: [PATCH 71/88] feat: use s3 link for banner

---
 readme.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/readme.md b/readme.md
index 124c6016..1bb24d97 100644
--- a/readme.md
+++ b/readme.md
@@ -1,4 +1,4 @@
-![OasysDB Use Case](https://i.postimg.cc/bYQFv1bp/banner.png)
+![OasysDB Use Case](https://odb-assets.s3.amazonaws.com/banners/0.7.0.png)

 [![GitHub Stars](https://img.shields.io/github/stars/oasysai/oasysdb?style=for-the-badge&logo=github&logoColor=%23000000&labelColor=%23fcd34d&color=%236b7280)](https://github.com/oasysai/oasysdb)
 [![Discord](https://img.shields.io/badge/chat-%236b7280?style=for-the-badge&logo=discord&logoColor=%23ffffff&label=discord&labelColor=%237289da)](https://discord.gg/bDhQrkqNP4)

From 07dd210dc9b0e5b3b32c0932fa39acf1a81f5cc3 Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Sat, 27 Jul 2024 21:10:11 -0500
Subject: [PATCH 72/88] feat: add create index validate existing index

---
 examples/measure_recall.rs | 15 ++++++++++++---
 src/db/database.rs         | 10 +++++++++-
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/examples/measure_recall.rs b/examples/measure_recall.rs
index f0b013dd..9d65a1f2 100644
--- a/examples/measure_recall.rs
+++ b/examples/measure_recall.rs
@@ -52,6 +52,11 @@ fn create_index_ivfpq(
     db: &Database,
     config: &SourceConfig,
 ) -> Result<(), Box<dyn Error>> {
+    let index_name = "ivfpq";
+    if db.get_index_ref(index_name).is_some() {
+        return Ok(());
+    }
+
     let params = ParamsIVFPQ {
         sub_centroids: 8,
         sub_dimension: 16,
         sampling: 0.1,
         ..Default::default()
     };

     let algorithm = IndexAlgorithm::IVFPQ(params);
-    db.create_index("ivfpq", algorithm, config.clone())?;
+    db.create_index(index_name, algorithm, config.clone())?;
     Ok(())
 }

@@ -68,9 +73,13 @@ fn create_index_flat(
     db: &Database,
     config: &SourceConfig,
 ) -> Result<(), Box<dyn Error>> {
+    let index_name = "flat";
+    if db.get_index_ref(index_name).is_some() {
+        return Ok(());
+    }
+
     let params = ParamsFlat::default();
     let algorithm = IndexAlgorithm::Flat(params);
-    db.create_index("flat", algorithm, config.clone())?;
-
+    db.create_index(index_name, algorithm, config.clone())?;
     Ok(())
 }
diff --git a/src/db/database.rs b/src/db/database.rs
index 15fb5718..d8f53edc 100644
--- a/src/db/database.rs
+++ b/src/db/database.rs
@@ -97,6 +97,15 @@
         algorithm: IndexAlgorithm,
         config: SourceConfig,
     ) -> Result<(), Error> {
+        let index_name: IndexName = name.into();
+
+        // Check if the index already exists in the database.
+        if self.get_index_ref(&index_name).is_some() {
+            let code = ErrorCode::RequestError;
+            let message = format!("Index already exists: {index_name}.");
+            return Err(Error::new(code, message));
+        }
+
         // Query the source database for records.
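        // (Editor's illustrative note: for a source configured with
        // SourceConfig::new("items", "id", "vector"), the generated query
        // is roughly `SELECT id, vector FROM items`; the exact statement
        // depends on any metadata columns configured for the source.)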
        let query = config.to_query();
        let mut conn = self.state()?.async_connect().await?;
@@ -110,7 +119,6 @@
             records.insert(id, record);
         }

-        let index_name: IndexName = name.into();
         let index_file = {
             let uuid = Uuid::new_v4().to_string();
             self.indices_dir().join(uuid)
         };

From e1f2d817a5d1baf93e2112d94edaf58925224108 Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Sun, 28 Jul 2024 14:00:55 -0500
Subject: [PATCH 73/88] feat: add update record method and add more testing

---
 src/indices/idx_flat.rs  |  19 +++++--
 src/indices/idx_ivfpq.rs |  13 +++--
 src/indices/mod.rs       | 111 +++++++++++++++++++++++++++++++--------
 3 files changed, 114 insertions(+), 29 deletions(-)

diff --git a/src/indices/idx_flat.rs b/src/indices/idx_flat.rs
index 1c86aa3f..d2aeb24a 100644
--- a/src/indices/idx_flat.rs
+++ b/src/indices/idx_flat.rs
@@ -54,6 +54,20 @@ impl VectorIndex for IndexFlat {
         Ok(())
     }

+    fn update(
+        &mut self,
+        records: HashMap<RecordID, Record>,
+    ) -> Result<(), Error> {
+        records.into_iter().for_each(|(id, record)| {
+            if let Some(existing) = self.data.get_mut(&id) {
+                existing.vector = record.vector;
+                existing.data = record.data;
+            }
+        });
+
+        Ok(())
+    }
+
     fn delete(&mut self, ids: Vec<RecordID>) -> Result<(), Error> {
         self.data.retain(|id, _| !ids.contains(id));
         Ok(())
     }
@@ -118,9 +132,6 @@ mod tests {
     fn test_flat_index() {
         let params = ParamsFlat::default();
         let mut index = IndexFlat::new(params).unwrap();
-
-        index_tests::populate_index(&mut index);
-        index_tests::test_basic_search(&index);
-        index_tests::test_advanced_search(&index);
+        index_tests::test_index(&mut index);
     }
 }
diff --git a/src/indices/idx_ivfpq.rs b/src/indices/idx_ivfpq.rs
index 5d603ac6..05592ae5 100644
--- a/src/indices/idx_ivfpq.rs
+++ b/src/indices/idx_ivfpq.rs
@@ -252,6 +252,15 @@ impl VectorIndex for IndexIVFPQ {
         Ok(())
     }

+    fn update(
+        &mut self,
+        records: HashMap<RecordID, Record>,
+    ) -> Result<(), Error> {
+        let ids: Vec<RecordID> = records.keys().cloned().collect();
+        self.delete(ids)?;
+        self.insert(records)
+    }
+
     fn delete(&mut self, ids: Vec<RecordID>) -> Result<(), Error> {
         self.data.retain(|id, _| !ids.contains(id));
         self.clusters.par_iter_mut().for_each(|cluster| {
             cluster.retain(|id| !ids.contains(id));
         });

         Ok(())
     }
@@ -414,8 +423,6 @@ mod tests {
         };

         let mut index = IndexIVFPQ::new(params).unwrap();
-        index_tests::populate_index(&mut index);
-        index_tests::test_basic_search(&index);
-        index_tests::test_advanced_search(&index);
+        index_tests::test_index(&mut index);
     }
 }
diff --git a/src/indices/mod.rs b/src/indices/mod.rs
index ec668a2d..593892ca 100644
--- a/src/indices/mod.rs
+++ b/src/indices/mod.rs
@@ -410,6 +410,13 @@ pub trait VectorIndex: Debug + Send + Sync {
         records: HashMap<RecordID, Record>,
     ) -> Result<(), Error>;

+    /// Updates records in the index with new values.
+    /// - `records`: Records to update along with their new values.
+    fn update(
+        &mut self,
+        records: HashMap<RecordID, Record>,
+    ) -> Result<(), Error>;
+
     /// Deletes records from the index data store.
     /// - `ids`: List of record IDs to delete from the index.
     fn delete(&mut self, ids: Vec<RecordID>) -> Result<(), Error>;
@@ -498,11 +505,26 @@ mod tests {

 mod index_tests {
     use super::*;

+    const DIMENSION: usize = 128;
+    const K: usize = 10;
+
+    pub fn test_index(index: &mut impl VectorIndex) {
+        populate_index(index);
+        test_search(index);
+        test_search_with_filters(index);
+
+        // The tests below mutate the index and change the underlying data.
+        // Be careful when modifying them!
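+        // (Editor's note: the order below matters because each step depends
+        // on the previous mutation: insert adds RecordID(100), update moves
+        // RecordID(0) away from the query, and delete removes IDs 1 and 2.)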
+        test_search_after_insert(index);
+        test_search_after_update(index);
+        test_search_after_delete(index);
+    }
+
     pub fn populate_index(index: &mut impl VectorIndex) {
         let mut records = HashMap::new();
         for i in 0..100 {
             let id = RecordID(i as u32);
-            let vector = Vector::from(vec![i as f32; 128]);
+            let vector = Vector::from(vec![i as f32; DIMENSION]);
             let data = HashMap::from([(
                 "number".into(),
                 Some(DataValue::Integer(1000 + i)),
             )]);
@@ -516,33 +538,78 @@ mod index_tests {
         assert_eq!(index.len(), 100);
     }

-    pub fn test_basic_search(index: &impl VectorIndex) {
-        let query = Vector::from(vec![0.0; 128]);
-        let k = 10;
-        let results: Vec<RecordID> = index
-            .search(query, k, Filters::NONE)
-            .unwrap()
-            .iter()
-            .map(|result| result.id)
-            .collect();
-
-        assert_eq!(results.len(), k);
+    pub fn test_search(index: &impl VectorIndex) {
+        let results = search_index(index, Filters::NONE);
+        assert_eq!(results.len(), K);
         assert!(results.contains(&RecordID(0)));
     }

-    pub fn test_advanced_search(index: &impl VectorIndex) {
-        let query = Vector::from(vec![0.0; 128]);
-        let k = 10;
+    pub fn test_search_with_filters(index: &impl VectorIndex) {
         let filters = Filters::from("number > 1010");
-        let results: Vec<RecordID> = index
-            .search(query, k, filters)
-            .unwrap()
-            .iter()
-            .map(|result| result.id)
-            .collect();
+        let results = search_index(index, filters);

-        assert_eq!(results.len(), k);
+        assert_eq!(results.len(), K);
         assert!(results.contains(&RecordID(11)));
         assert!(!results.contains(&RecordID(0)));
     }
+
+    pub fn test_search_after_insert(index: &mut impl VectorIndex) {
+        let id = RecordID(100);
+        let vector = Vector::from(vec![0.1; DIMENSION]);
+        let data = HashMap::from([(
+            "number".to_string(),
+            Some(DataValue::Integer(2000)),
+        )]);
+
+        let record = Record { vector, data };
+        let records = HashMap::from([(id, record)]);
+        index.insert(records).unwrap();
+
+        let results = search_index(index, Filters::NONE);
+        assert_eq!(results.len(), K);
+        assert!(results.contains(&RecordID(100)));
+        assert!(results.contains(&RecordID(0)));
+    }
+
+    pub fn test_search_after_update(index: &mut impl VectorIndex) {
+        let id = RecordID(0);
+        let vector = Vector::from(vec![100.0; DIMENSION]);
+        let data = HashMap::from([(
+            "number".to_string(),
+            Some(DataValue::Integer(2000)),
+        )]);
+
+        let record = Record { vector, data };
+        let records = HashMap::from([(id, record)]);
+        index.update(records).unwrap();
+
+        let results = search_index(index, Filters::NONE);
+        assert_eq!(results.len(), K);
+        assert!(!results.contains(&RecordID(0)));
+        assert!(results.contains(&RecordID(1)));
+    }
+
+    pub fn test_search_after_delete(index: &mut impl VectorIndex) {
+        let ids = vec![RecordID(1), RecordID(2)];
+        index.delete(ids).unwrap();
+
+        let results = search_index(index, Filters::NONE);
+        assert_eq!(results.len(), K);
+        assert!(!results.contains(&RecordID(1)));
+        assert!(!results.contains(&RecordID(2)));
+        assert!(results.contains(&RecordID(3)));
+    }
+
+    fn search_index(
+        index: &impl VectorIndex,
+        filters: Filters,
+    ) -> Vec<RecordID> {
+        let query = Vector::from(vec![0.0; DIMENSION]);
+        index
+            .search(query, K, filters)
+            .unwrap()
+            .iter()
+            .map(|result| result.id)
+            .collect()
+    }
 }

From 672b4a25803f2f7995d2f1fe7611c21cbbc6885a Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Sun, 28 Jul 2024 21:01:34 -0500
Subject: [PATCH 74/88] docs: add intro and initial quickstart section to
 readme

---
 readme.md | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/readme.md b/readme.md
index 1bb24d97..fffc762c 100644
--- a/readme.md
+++ b/readme.md
b/readme.md @@ -3,3 +3,32 @@ [![GitHub Stars](https://img.shields.io/github/stars/oasysai/oasysdb?style=for-the-badge&logo=github&logoColor=%23000000&labelColor=%23fcd34d&color=%236b7280)](https://github.com/oasysai/oasysdb) [![Discord](https://img.shields.io/badge/chat-%236b7280?style=for-the-badge&logo=discord&logoColor=%23ffffff&label=discord&labelColor=%237289da)](https://discord.gg/bDhQrkqNP4) [![Crates.io](https://img.shields.io/crates/d/oasysdb?style=for-the-badge&logo=rust&logoColor=%23000&label=crates.io&labelColor=%23fdba74&color=%236b7280)](https://crates.io/crates/oasysdb) + +# Introducing OasysDB 👋 + +OasysDB is a hybrid vector database that allows you utilize relational databases +like SQLite and Postgres as a storage engine for your vector data without using +them to compute expensive vector operations. + +This allows you to consolidate your data into a single database and ensure high +data integrity with the ACID properties of traditional databases while also +having a fast and isolated vector indexing layer. + +For more details about OasysDB, please visit the +[Documentation](https://docs.oasysdb.com/). + +# Quickstart 🚀 + +Currently, OasysDB is only available for Rust projects as an embedded database. +We are still working on implementing RPC APIs to allow you to use OasysDB in any +language as a standalone service. + +OasysDB has 2 primary components: **Database** and **Index**. + +- The Database is responsible for managing the vector indices and connecting the + storage engine, the SQL database, to the indices as the data source. OasysDB + uses SQLx to handle the SQL database operations. + +- The Index implements a vector indexing algorithm and is responsible for + storing and querying vectors. The functionality and algorithm of the index + depends on the algorithm you choose when creating the index. From 330087129c27ba93d785b05c02170f2ccddbf725 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Mon, 29 Jul 2024 01:59:40 -0500 Subject: [PATCH 75/88] docs: add quickstart guide --- readme.md | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/readme.md b/readme.md index fffc762c..b883e370 100644 --- a/readme.md +++ b/readme.md @@ -1,7 +1,8 @@ ![OasysDB Use Case](https://odb-assets.s3.amazonaws.com/banners/0.7.0.png) [![GitHub Stars](https://img.shields.io/github/stars/oasysai/oasysdb?style=for-the-badge&logo=github&logoColor=%23000000&labelColor=%23fcd34d&color=%236b7280)](https://github.com/oasysai/oasysdb) -[![Discord](https://img.shields.io/badge/chat-%236b7280?style=for-the-badge&logo=discord&logoColor=%23ffffff&label=discord&labelColor=%237289da)](https://discord.gg/bDhQrkqNP4) +[![Discord](https://img.shields.io/badge/chat-%236b7280?style=for-the-badge&logo=discord&logoColor=%23ffffff&label=discord&labelColor=%237289da)][discord] +[![Documentation](https://img.shields.io/badge/read-6b7280?style=for-the-badge&label=oasysdb%20docs&labelColor=14b8a6)][docs] [![Crates.io](https://img.shields.io/crates/d/oasysdb?style=for-the-badge&logo=rust&logoColor=%23000&label=crates.io&labelColor=%23fdba74&color=%236b7280)](https://crates.io/crates/oasysdb) # Introducing OasysDB 👋 @@ -32,3 +33,58 @@ OasysDB has 2 primary components: **Database** and **Index**. - The Index implements a vector indexing algorithm and is responsible for storing and querying vectors. The functionality and algorithm of the index depends on the algorithm you choose when creating the index. 
+
+## Embedded in Rust
+
+To use OasysDB as an embedded vector database in your Rust project, simply add
+it to your Cargo.toml file or run the command below on your terminal:
+
+```bash
+cargo add oasysdb
+```
+
+When running OasysDB as an embedded database, you have access to both the
+Database and Index interfaces. On the rare occasion that you're building a
+project that doesn't utilize SQL, you can use the Index interface directly.
+Otherwise, the quickstart guide below will show you how to use the Database
+interface.
+
+```rust no_run
+// Use the prelude module to import all necessary functionalities.
+use oasysdb::prelude::*;
+use std::env;
+
+// Open OasysDB database with connection to SQLite.
+// Connection is required for new database but optional for existing ones.
+// If the connection is provided, it will overwrite the previous connection.
+// The SQL connection will only be used to sync vectors with OasysDB.
+let sqlite = "sqlite://sqlite.db";
+let db = Database::open("odb_test", Some(sqlite)).unwrap();
+
+// Create a new index with IVFPQ algorithm with default parameters.
+let params = ParamsIVFPQ::default();
+let algorithm = IndexAlgorithm::IVFPQ(params);
+// Setup where the data of the index will come from.
+let config = SourceConfig::new("table", "id", "vector");
+db.create_index("index", algorithm, config).unwrap();
+
+// Search the index for nearest neighbors of a query vector.
+let query = vec![0.0; 128];
+let filters = ""; // Optional SQL-like filter for the search.
+let results = db.search_index("index", query, 10, filters).unwrap();
+```
+
+## More Resources
+
+[![Discord](https://img.shields.io/badge/chat-%236b7280?style=for-the-badge&logo=discord&logoColor=%23ffffff&label=discord&labelColor=%237289da)][discord]
+[![Documentation](https://img.shields.io/badge/read-6b7280?style=for-the-badge&label=oasysdb%20docs&labelColor=14b8a6)][docs]
+
+There is more to OasysDB than what is shown in this Quickstart guide. Please
+visit OasysDB's [Documentation][docs] for more information.
+
+In addition, if you have any question or need help that needs immediate
+response, please join our [Discord Server][discord] and I will try my best to
+help you as soon as possible.
+
+[docs]: https://docs.oasysdb.com
+[discord]: https://discord.gg/bDhQrkqNP4

From 806b5ad258444322f860c438dddc2d36cdfdd9a2 Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Mon, 29 Jul 2024 02:05:00 -0500
Subject: [PATCH 76/88] docs: improve quickstart commenting and more resources section

---
 readme.md | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/readme.md b/readme.md
index b883e370..ed905471 100644
--- a/readme.md
+++ b/readme.md
@@ -56,8 +56,6 @@ use std::env;
 
 // Open OasysDB database with connection to SQLite.
 // Connection is required for new database but optional for existing ones.
-// If the connection is provided, it will overwrite the previous connection.
-// The SQL connection will only be used to sync vectors with OasysDB.
 let sqlite = "sqlite://sqlite.db";
 let db = Database::open("odb_test", Some(sqlite)).unwrap();
 
@@ -80,11 +78,10 @@
 [![Documentation](https://img.shields.io/badge/read-6b7280?style=for-the-badge&label=oasysdb%20docs&labelColor=14b8a6)][docs]
 
 There is more to OasysDB than what is shown in this Quickstart guide. Please
-visit OasysDB's [Documentation][docs] for more information.
-
-In addition, if you have any question or need help that needs immediate
-response, please join our [Discord Server][discord] and I will try my best to
-help you as soon as possible.
+visit OasysDB's [Documentation][docs] for more information. In addition, if you
+have any questions or need help that needs an immediate response, please join
+our [Discord Server][discord] and I will try my best to help you as soon as
+possible.
 
 [docs]: https://docs.oasysdb.com
 [discord]: https://discord.gg/bDhQrkqNP4

From 6243fd9440f2892c7ac82b2a7bb1d0dbd2f8413d Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Mon, 29 Jul 2024 02:06:54 -0500
Subject: [PATCH 77/88] docs: remove unnecessary import from quickstart

---
 readme.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/readme.md b/readme.md
index ed905471..3ca7f637 100644
--- a/readme.md
+++ b/readme.md
@@ -52,7 +52,6 @@ interface.
 
 ```rust no_run
 // Use the prelude module to import all necessary functionalities.
 use oasysdb::prelude::*;
-use std::env;
 
 // Open OasysDB database with connection to SQLite.
 // Connection is required for new database but optional for existing ones.

From 9421267c792e7887a88d8111c1275a3d283476ba Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Mon, 29 Jul 2024 17:14:29 -0500
Subject: [PATCH 78/88] feat: improve ivfpq default params

---
 examples/measure_recall.rs |  8 +-------
 src/indices/idx_ivfpq.rs   | 12 ++++++------
 2 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/examples/measure_recall.rs b/examples/measure_recall.rs
index 9d65a1f2..ae5cc042 100644
--- a/examples/measure_recall.rs
+++ b/examples/measure_recall.rs
@@ -57,13 +57,7 @@ fn create_index_ivfpq(
         return Ok(());
     }
 
-    let params = ParamsIVFPQ {
-        sub_centroids: 8,
-        sub_dimension: 16,
-        sampling: 0.1,
-        ..Default::default()
-    };
-
+    let params = ParamsIVFPQ::default();
     let algorithm = IndexAlgorithm::IVFPQ(params);
     db.create_index(index_name, algorithm, config.clone())?;
     Ok(())
diff --git a/src/indices/idx_ivfpq.rs b/src/indices/idx_ivfpq.rs
index 05592ae5..14d1c23a 100644
--- a/src/indices/idx_ivfpq.rs
+++ b/src/indices/idx_ivfpq.rs
@@ -344,7 +344,7 @@ pub struct ParamsIVFPQ {
     /// Maximum number of iterations to run the KMeans algorithm.
     pub max_iterations: usize,
     /// Number of centroids in the PQ sub-space.
-    pub sub_centroids: u8,
+    pub sub_centroids: usize,
     /// Dimension of the vector after PQ encoding.
     pub sub_dimension: u8,
     /// Number of clusters to explore during search.
@@ -358,12 +358,12 @@ impl Default for ParamsIVFPQ {
     fn default() -> Self {
         Self {
-            centroids: 256,
-            max_iterations: 50,
-            sub_centroids: 16,
+            centroids: 512,
+            max_iterations: 100,
+            sub_centroids: 256,
             sub_dimension: 8,
-            num_probes: 4,
-            sampling: 0.1,
+            num_probes: 16,
+            sampling: 0.25,
             metric: DistanceMetric::Euclidean,
         }
     }

From adac95c4781f7839477c98fa8d4600a939e5ad21 Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Mon, 29 Jul 2024 17:23:57 -0500
Subject: [PATCH 79/88] docs: add contributing docs in readme

---
 readme.md | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/readme.md b/readme.md
index 3ca7f637..5ab5634b 100644
--- a/readme.md
+++ b/readme.md
@@ -84,3 +84,20 @@ possible.
 
 [docs]: https://docs.oasysdb.com
 [discord]: https://discord.gg/bDhQrkqNP4
+
+# Contributing 🤝
+
+The easiest way to contribute to this project is to star it and share it
+with your friends. This will help us grow the community and make the project
+more visible to others who might need it.
+
+If you want to go further and contribute your expertise, we will gladly welcome
+your code contributions. For more information and guidance about this, please
+see [Contributing to OasysDB](docs/contributing.md).
+
+If you have deep experience in the space but don't have the free time to
+contribute code, we also welcome advice, suggestions, or feature requests. We
+are also looking for advisors to help guide the project direction and roadmap.
+
+If you are interested in the project in any way, please join us on [Discord
+Server][discord]. Help us grow the community and make OasysDB better 😁

From 2bdc20848ca0cacdd718d6572353fe97c375d3ba Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Mon, 29 Jul 2024 18:10:50 -0500
Subject: [PATCH 80/88] docs: add disclaimer in readme

---
 readme.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/readme.md b/readme.md
index 5ab5634b..5df40bc0 100644
--- a/readme.md
+++ b/readme.md
@@ -101,3 +101,11 @@ are also looking for advisors to help guide the project direction and roadmap.
 
 If you are interested in the project in any way, please join us on [Discord
 Server][discord]. Help us grow the community and make OasysDB better 😁
+
+## Disclaimer
+
+This project is still in the early stages of development. We are actively
+working on improving it and we expect the API and functionality to change. We do
+not recommend using this in production yet. If you do, however, please let us
+know so we can help you with any issues you might encounter as promptly as
+possible.

From 5ce45094e28a26a5ce96e6a09b3726c6d76c055e Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Mon, 29 Jul 2024 18:12:45 -0500
Subject: [PATCH 81/88] fix (ivfpq): type casting for ivfpq params

---
 src/indices/idx_ivfpq.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/indices/idx_ivfpq.rs b/src/indices/idx_ivfpq.rs
index 14d1c23a..36251e2f 100644
--- a/src/indices/idx_ivfpq.rs
+++ b/src/indices/idx_ivfpq.rs
@@ -38,7 +38,7 @@ impl IndexIVFPQ {
 
         let centroids = {
             let mut kmeans = KMeans::new(
-                self.params.sub_centroids as usize,
+                self.params.sub_centroids,
                 self.params.max_iterations,
                 self.params.metric,
             );

From 552d41021c4007b6ebf016cecfff95bfd1972151 Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Mon, 29 Jul 2024 18:31:02 -0500
Subject: [PATCH 82/88] feat: improve contributing docs

---
 docs/contributing.md | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/docs/contributing.md b/docs/contributing.md
index 1c1164a5..8096f526 100644
--- a/docs/contributing.md
+++ b/docs/contributing.md
@@ -58,17 +58,30 @@ First, you will need to have Rust installed on your machine. We recommend using
 [rustup][rustup] to install Rust. We also recommend having rust-analyzer
 installed for your code editor for a better development experience.
 
-TODO: Complete the getting started guide.
+All of the functionalities of OasysDB are available in the **src** directory.
+The 2 most important modules are **db** and **indices**, which respectively
+contain the database functionalities and the index implementations.
+Additionally, some custom types used throughout the project are defined in the
+**types** module.
+
+Before you start working on the code, I recommend running the tests to make
+sure everything is working as expected. You can run the tests with the following
+command:
+
+```sh
+cargo test
+```
 
 ## Style guide
 
 We mostly use the default linting and style guide for Rust except for some
-linting changes listed in rustfmt.toml file. For more information about the code
-style, see the [Rust Style Guide][style_guide].
+linting changes listed in the rustfmt.toml file. For more information about the
+code style, see the [Rust Style Guide][style_guide].
 
 For commit messages, we use the [Conventional Commits][conventional_commits]
 format. This allows us to maintain consistency and readability in our Git commit
-history.
+history, making it easier to understand the changes made to the codebase at a
+high level.
 
 When commenting your code, please try your best to write comments that are
 clear and concise with proper English sentence capitalization and punctuation. This

From e9983ed891add0b12bfbd484d51c08554670a972 Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Mon, 29 Jul 2024 20:00:59 -0500
Subject: [PATCH 83/88] docs (website): add introduction file

---
 docs/index.md | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++
 mkdocs.yml    |  2 +-
 2 files changed, 61 insertions(+), 1 deletion(-)

diff --git a/docs/index.md b/docs/index.md
index e69de29b..73873427 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -0,0 +1,60 @@
+# Welcome to OasysDB 🎉
+
+First of all, thank you for considering using OasysDB! We hope that OasysDB
+will help you build your AI projects faster and more efficiently.
+
+Before you dive deep into OasysDB, here are a few things you should know about
+OasysDB and why you should or shouldn't use it.
+
+## What is OasysDB?
+
+OasysDB is a hybrid vector database that allows you to have a vector index layer
+for similarity search with a SQL database as your primary storage. For real-time
+and constantly changing data, this means you can use SQL databases like
+PostgreSQL, MySQL, or SQLite, which offer ACID compliance and strong
+transactional support, as your primary storage layer and only use OasysDB for
+similarity search.
+
+![OasysDB Use Case](https://odb-assets.s3.amazonaws.com/banners/0.7.0.png)
+
+## Features
+
+
+ + +- **SQL Storage Layer** + + OasysDB allows you to consolidate your vector data with other operational + data in a single SQL database without impacting the performance of your + SQL database. + +- **Flexible Indexing** + + You can pick your own poison by choosing indexing algorithms that fit your + use case like Flat (Brute Force) or IVFPQ. You can also configure the index + to fit your performance requirements. + +- **Multi-index Support** + + Depending on your use case and setup, you can create multiple vector + indices for different vector columns from the same table to improve your + search performance. + +- **Pre-filtering** + + In addition to post-filtering, OasysDB supports pre-filtering allowing you + to create an index for a subset of your data to narrow down the search + space before performing the ANN search. + +
+ +## Why not OasysDB? + +- **Fully In-memory**: OasysDB stores the entire index in memory which means + that the size of your index is limited by the memory available on your + machine. If you have a large dataset over 10M vectors, you may want to + consider using a disk-based indexing algorithm. +- **Hybrid Solution**: OasysDB is a hybrid of SQL database and vector indexing + layer. This means that you need to use a SQL database as your primary storage + layer for OasysDB to be optimal. OasysDB, or any other vector databases for + that matter, won't be able to replace a transactional database. diff --git a/mkdocs.yml b/mkdocs.yml index a1e620c9..acfa77be 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -57,7 +57,7 @@ extra_css: - assets/style.css nav: - - Home: + - Documentation: - Introduction: index.md - Other: From 41c7d51111821226c991a7396cc06d2555113c3a Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Tue, 30 Jul 2024 22:54:23 -0500 Subject: [PATCH 84/88] docs: add initial database and index pages --- docs/refs/database.md | 287 ++++++++++++++++++++++++++++++++++++++++++ docs/refs/index.md | 1 + mkdocs.yml | 4 + readme.md | 10 +- 4 files changed, 297 insertions(+), 5 deletions(-) create mode 100644 docs/refs/database.md create mode 100644 docs/refs/index.md diff --git a/docs/refs/database.md b/docs/refs/database.md new file mode 100644 index 00000000..d65208a8 --- /dev/null +++ b/docs/refs/database.md @@ -0,0 +1,287 @@ +# Database + +The Database is the primary interface for interacting with OasysDB. It is +responsible for managing the connection to the SQL database and vector indices. + +These are the most notable operations that can be performed with the Database: + +- Creating a new index. +- Refreshing an existing index. +- Searching for vectors in an index. +- Deleting an index. + +## Create Index + +This method creates a new index in the database. The initial data for the index +will be loaded from the SQL table defined in the source configuration parameter. + +### Parameters + +- **name**: Name of the new index. +- **algorithm**: Vector indexing algorithm to use in the index. +- **source**: Source configuration for the index. + +### Custom Index Parameters + +When specifying the indexing algorithm, we can also pass custom parameters +specific to the algorithm. For example, if we are using the IVFPQ algorithm, we +can configure the number of centroids and the number of sub-quantizers like: + +```json +{ + "centroids": 512, + "max_iterations": 100, + "sub_centroids": 256, + ... +} +``` + +For more information about the available parameters for each algorithm, please +refer to each algorithm's documentation. + +### Source Configuration + +The source configuration defines how the data will be loaded from the SQL +database to create and refresh the index. For example, if we store our vectors +in a table called _embeddings_ and in a column called _vector_, we can define +the source configuration like: + +```json +{ + "table": "embeddings", + "primary_key": "id", + "vector": "vector" + ... +} +``` + +!!! danger "Primary Key Requirement" + + The primary key must be unique and not null with auto-incrementing integer + as its type. This allows OasysDB to incrementally load the data from the + table when refreshing the index. + +!!! danger "Vector Column Requirement" + + The vector must be stored in either JSON (Recommended) or blob column data + type. Without this, OasysDB won't be able to load the vectors from the + source table. 
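To illustrate the two requirements above, here is a minimal sketch of preparing a conforming source table: an auto-incrementing integer primary key plus a JSON vector column. This example is illustrative only and not part of the patch; it assumes sqlx (the crate the readme names for SQL operations) with its SQLite driver, a tokio runtime, and serde_json, and the `embeddings` table and file names are made up:

```rust
// Sketch only: prepares a source table that satisfies the primary key and
// vector column requirements. Assumes sqlx with the "sqlite" and
// "runtime-tokio" features, tokio, and serde_json; names are illustrative.
use sqlx::sqlite::SqlitePool;

#[tokio::main]
async fn main() -> Result<(), sqlx::Error> {
    // "mode=rwc" lets SQLite create the database file if it is missing.
    let pool = SqlitePool::connect("sqlite://sqlite.db?mode=rwc").await?;

    // INTEGER PRIMARY KEY auto-increments in SQLite, which is what the
    // incremental refresh described below relies on.
    sqlx::query(
        "CREATE TABLE IF NOT EXISTS embeddings (
            id INTEGER PRIMARY KEY,
            vector JSON NOT NULL
        )",
    )
    .execute(&pool)
    .await?;

    // Store the vector as a JSON array of floats.
    let vector = serde_json::to_string(&vec![0.1_f32; 128]).unwrap();
    sqlx::query("INSERT INTO embeddings (vector) VALUES (?)")
        .bind(vector)
        .execute(&pool)
        .await?;

    Ok(())
}
```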
+
+### Source Metadata (Optional)
+
+In OasysDB, we can also store metadata along with the vectors directly in the
+index, which is very useful to eliminate post-search queries to the SQL database.
+For example, if we have the following table in SQLite:
+
+```sql
+CREATE TABLE articles (
+    id INTEGER PRIMARY KEY,
+    vector JSON NOT NULL,
+    content TEXT NOT NULL
+);
+```
+
+We can define the source configuration to store the content in the index:
+
+```json
+{
+  "table": "articles",
+  "primary_key": "id",
+  "vector": "vector",
+  "metadata": ["content"]
+  ...
+}
+```
+
+When we search the index later on, the metadata will be included in the search
+results, allowing us to use the data right away without querying our SQL database
+for the metadata.
+
+!!! info "Metadata Limitation"
+
+    The metadata is limited to primitive data types like integer, float, string,
+    and boolean. It's also worth noting that the number and size of the metadata
+    will affect the overall memory usage of the index.
+
+    Don't overuse it 😁
+
+### Source Filter (Optional)
+
+In the source configuration, we can also define an optional SQL filter to load
+only a subset of our data for the index. This filtering will also apply when
+refreshing the index incrementally.
+
+Let's say that we have a SQLite table with the schema below:
+
+```sql
+CREATE TABLE articles (
+    id INTEGER PRIMARY KEY,
+    vector JSON NOT NULL,
+    content TEXT,
+    year INTEGER
+);
+```
+
+We can add a SQL filter to only load the articles from the year 2021:
+
+```json
+{
+  "table": "articles",
+  "primary_key": "id",
+  "vector": "vector",
+  "filter": "year = 2021" // Exclude WHERE keyword
+  ...
+}
+```
+
+!!! warning "SQL Injection Risk"
+
+    Be careful not to use user input directly in the SQL filter as this can
+    lead to SQL injection attacks. Always sanitize the input before using it
+    in the filter.
+
+## Refresh Index
+
+This method updates an existing index with the latest data from the SQL table.
+Under the hood, OasysDB will query the source table from the last primary key
+inserted and insert the new data to the index incrementally.
+
+Incremental insertion is very crucial here because it allows us to insert
+individual record to the index without rebuilding the entire index which can be
+very slow.
+
+### Parameters
+
+- **name**: Name of the index to refresh.
+
+!!! tip "Asynchronous Refresh"
+
+    The refresh operation can be performed asynchronously. This allows us to
+    refresh the index in the background and/or periodically without blocking
+    the main thread.
+
+## Search Index
+
+This method performs a nearest neighbor search in the index and returns _K_
+search results based on the query vector. The search results will include the
+primary key, distance between the query vector and the result vector, and
+optional metadata if defined in the source configuration.
+
+In JSON format, the search results will look like:
+
+```json
+[
+  {
+    "id": 1,
+    "distance": 0.123,
+    "data": {
+      "content": "OasysDB is awesome!"
+    }
+  },
+  ...
+]
+```
+
+### Parameters
+
+- **name**: Name of the index to search.
+- **query**: Query vector for the nearest neighbor search.
+- **k**: Number of results to return.
+- **filters**: Optional SQL-like filter to apply to the search results.
+
+### Post-filtering (Optional)
+
+When searching the index, we can additionally apply post-filtering to the search
+operation against the metadata stored in the index.
+
+Let's say that we have the following setup for our index:
+
+=== "SQLite Table"
+
+    ```sql
+    CREATE TABLE articles (
+        id INTEGER PRIMARY KEY,
+        vector JSON NOT NULL,
+        year INTEGER
+    );
+    ```
+
+=== "Source Configuration"
+
+    ```json
+    {
+      "table": "articles",
+      "primary_key": "id",
+      "vector": "vector",
+      "metadata": ["year"]
+    }
+    ```
+
+Since we have the year metadata stored in the index, we can apply post-filtering
+to the search operation by adding a filter string to the filters parameter:
+
+```json
+{
+  "name": "index",
+  "query": [0.1, 0.2, 0.3, ...],
+  "k": 10,
+  "filters": "year = 2021" // SQL-like filtering
+}
+```
+
+This operation will only return the search results where the year metadata is
+equal to 2021. There are also other operators we can use for the filtering, and
+these are the supported operators with their compatible metadata types:
+
+| Operator | Description           | Metadata Type  |
+| -------- | --------------------- | -------------- |
+| =        | Equal                 | All            |
+| !=       | Not Equal             | All            |
+| <        | Less Than             | Integer, Float |
+| <=       | Less Than or Equal    | Integer, Float |
+| >        | Greater Than          | Integer, Float |
+| >=       | Greater Than or Equal | Integer, Float |
+| CONTAINS | Contains              | String         |
+
+These operators can also be combined with the **AND** or **OR** logical
+operators to create more complex filtering conditions. However, we can only use
+one type of join operator at a time. For example:
+
+```json
+{
+  ...
+  "filters": "year >= 2020 AND year <= 2022"
+}
+```
+
+!!! note "Filtering Limitation"
+
+    The filtering is limited to the metadata stored in the index. If we add a
+    filter with a column that is not included in the metadata, the search
+    operation will return an empty result since none of the metadata matches
+    the filter.
+
+## Delete Index
+
+This method deletes an existing index from the database and automatically
+releases the index from the indices pool if it's loaded.
+
+Since, by default, the index is persisted on disk, deleting the index will also
+remove the index file from the disk. This operation is useful when we want to
+free up the disk space by removing indices that are no longer needed.
+
+**This operation is irreversible!**
+
+### Parameters
+
+- **name**: Name of the index to delete.
+
+## Indices Pool
+
+The Database also contains an indices pool to manage multiple indices in-memory.
+This is useful when we have multiple indices we frequently use, allowing us to
+avoid the overhead of loading the index from disk, which can be slow.
+
+By default, performing any operation related to an index like search or refresh
+will load the index to the pool. If we want to release the index from the pool,
+we can use the `release_indices` method.
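Putting this page together in code, a rough end-to-end sketch follows. The `Database::open`, `create_index`, and `search_index` calls mirror the quickstart in the readme, while the exact `refresh_index` and `release_indices` signatures are assumptions inferred from this page rather than confirmed API:

```rust
use oasysdb::prelude::*;

// Rough sketch: the refresh_index and release_indices signatures are
// guesses based on the documentation above, not confirmed API.
fn main() {
    // Open the database with a SQLite connection as the storage engine.
    let sqlite = "sqlite://sqlite.db";
    let db = Database::open("odb_test", Some(sqlite)).unwrap();

    // Incrementally pull rows inserted after the last indexed primary key.
    db.refresh_index("index").unwrap();

    // Nearest neighbor search, post-filtered on metadata in the index.
    let query = vec![0.0; 128];
    let results = db.search_index("index", query, 10, "year = 2021").unwrap();
    println!("found {} neighbors", results.len());

    // Drop the index from the in-memory pool once it is no longer needed.
    db.release_indices(vec!["index"]).unwrap();
}
```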
diff --git a/docs/refs/index.md b/docs/refs/index.md new file mode 100644 index 00000000..8b013d6a --- /dev/null +++ b/docs/refs/index.md @@ -0,0 +1 @@ +# Index diff --git a/mkdocs.yml b/mkdocs.yml index acfa77be..6cb69898 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -59,6 +59,8 @@ extra_css: nav: - Documentation: - Introduction: index.md + - Database: refs/database.md + - Index: refs/index.md - Other: - Changelog: changelog.md @@ -71,8 +73,10 @@ nav: - blog/index.md markdown_extensions: + - admonition - attr_list - md_in_html + - pymdownx.details - pymdownx.inlinehilite - pymdownx.snippets - pymdownx.superfences diff --git a/readme.md b/readme.md index 5df40bc0..a77d29ad 100644 --- a/readme.md +++ b/readme.md @@ -7,9 +7,9 @@ # Introducing OasysDB 👋 -OasysDB is a hybrid vector database that allows you utilize relational databases -like SQLite and Postgres as a storage engine for your vector data without using -them to compute expensive vector operations. +OasysDB is a hybrid vector database that allows you to utilize relational +databases like SQLite and Postgres as a storage engine for your vector data +without using them to compute expensive vector operations. This allows you to consolidate your data into a single database and ensure high data integrity with the ACID properties of traditional databases while also @@ -32,12 +32,12 @@ OasysDB has 2 primary components: **Database** and **Index**. - The Index implements a vector indexing algorithm and is responsible for storing and querying vectors. The functionality and algorithm of the index - depends on the algorithm you choose when creating the index. + depends on the algorithm we choose when creating the index. ## Embedded in Rust To use OasysDB as an embedded vector database in your Rust project, simply add -it to your Cargo.toml file or run the command below on your terminal: +it to the project Cargo.toml file or run the command below on the terminal: ```bash cargo add oasysdb From 6a03bb3dd717942773b8e08b8c9760e04dea61b5 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Tue, 30 Jul 2024 22:55:05 -0500 Subject: [PATCH 85/88] feat: improve filtering and default features --- Cargo.toml | 1 + src/indices/idx_ivfpq.rs | 1 + src/types/record.rs | 8 +++++++- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 56c06d26..c12e5b60 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,6 +43,7 @@ flate2 = "1.0.30" tar = "0.4.41" [features] +default = ["simd"] simd = ["dep:simsimd"] [profile.release] diff --git a/src/indices/idx_ivfpq.rs b/src/indices/idx_ivfpq.rs index 36251e2f..28a54b5e 100644 --- a/src/indices/idx_ivfpq.rs +++ b/src/indices/idx_ivfpq.rs @@ -417,6 +417,7 @@ mod tests { fn test_ivfpq_index() { let params = ParamsIVFPQ { centroids: 5, + sub_centroids: 16, max_iterations: 20, sampling: 0.5, ..Default::default() diff --git a/src/types/record.rs b/src/types/record.rs index b7aa4204..a397dcc9 100644 --- a/src/types/record.rs +++ b/src/types/record.rs @@ -142,7 +142,13 @@ impl From<&str> for DataValue { return boolean.into(); } - DataValue::String(value.to_string()) + let match_quotes = |c: char| c == '\"' || c == '\''; + let value = value + .trim_start_matches(match_quotes) + .trim_end_matches(match_quotes) + .to_string(); + + DataValue::String(value) } } From 255c87b68d68b78bccb7768cbb56c686f2161ab6 Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Wed, 31 Jul 2024 18:39:48 -0500 Subject: [PATCH 86/88] docs: add index documentation --- docs/assets/style.css | 10 +++++ docs/refs/database.md | 2 +- 
docs/refs/index.md        |  1 -
 docs/refs/vector_index.md | 89 +++++++++++++++++++++++++++++++++++++++
 mkdocs.yml                |  2 +-
 5 files changed, 101 insertions(+), 3 deletions(-)
 delete mode 100644 docs/refs/index.md
 create mode 100644 docs/refs/vector_index.md

diff --git a/docs/assets/style.css b/docs/assets/style.css
index d69c4758..f9814f3f 100644
--- a/docs/assets/style.css
+++ b/docs/assets/style.css
@@ -4,6 +4,16 @@ h3 {
   font-weight: bold !important;
 }
 
+.odb-button {
+  text-align: center;
+  width: 100%;
+}
+
+.odb-button.disabled {
+  opacity: 0.5;
+  cursor: not-allowed;
+}
+
 /* Tables will be displayed at full width. */
 
 .md-typeset__table {
diff --git a/docs/refs/database.md b/docs/refs/database.md
index d65208a8..97922534 100644
--- a/docs/refs/database.md
+++ b/docs/refs/database.md
@@ -146,7 +146,7 @@ This method updates an existing index with the latest data from the SQL table.
 Under the hood, OasysDB will query the source table from the last primary key
 inserted and insert the new data to the index incrementally.
 
-Incremental insertion is very crucial here because it allows us to insert
+Incremental insertion is very crucial here because it allows us to insert an
 individual record to the index without rebuilding the entire index which can be
 very slow.
 
diff --git a/docs/refs/index.md b/docs/refs/index.md
deleted file mode 100644
index 8b013d6a..00000000
--- a/docs/refs/index.md
+++ /dev/null
@@ -1 +0,0 @@
-# Index
diff --git a/docs/refs/vector_index.md b/docs/refs/vector_index.md
new file mode 100644
index 00000000..5a8a37a4
--- /dev/null
+++ b/docs/refs/vector_index.md
@@ -0,0 +1,89 @@
+# Vector Index
+
+OasysDB provides a set of vector indexing algorithms that allow us to index
+vectors for nearest neighbor search. Each index implementation has its own
+characteristics and is suitable for different use cases.
+
+Some common traits that all index implementations share are:
+
+- **Incremental Operation**: The ability to modify records in the index
+  individually without rebuilding the entire index. This includes common
+  operations such as inserting, updating, and deleting records.
+- **Isolated Data Storage**: Each index stores its own data separately from one
+  another and the source table. This creates an isolated environment for the
+  index to perform retrieval operations separately from the source table.
+- **On-disk Persistence**: This trait allows us to persist the index to a file
+  and restore it later. This is especially useful when we have an index built
+  from a large dataset and we want to reuse it later without having to rebuild
+  the index from scratch.
+
+??? info "How Persistence Works"
+
+    When we persist an index, we serialize the index data to a
+    little-endian byte format via the `bincode` crate and write it to a file.
+    When we restore the index later on, we read the byte data from the file
+    and deserialize it back as an index object.
+
+## Index Implementations
+
+OasysDB provides the following index implementations:
+
+### Flat Index
+
+The Flat Index is a simple index that stores vectors in a flat list. When we
+search for nearest neighbors in the index, the Flat Index will scan through all
+vectors in the index and return the nearest neighbors based on the query vector.
+
+This index is also known as the brute-force index.
+
+??? note "Search Complexity: O(DN)"
+
+    - **D**: Dimensionality of the vectors.
+    - **N**: Number of vectors in the index.
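To make the O(DN) figure concrete, here is a toy brute-force scan in plain Rust. This is an illustration of the complexity only, not OasysDB's actual `IndexFlat` implementation:

```rust
// Toy flat (brute-force) search: one O(D) distance per stored vector,
// N distances in total. Illustrative only; not OasysDB's IndexFlat code.
fn flat_search(
    data: &[(u32, Vec<f32>)],
    query: &[f32],
    k: usize,
) -> Vec<(u32, f32)> {
    let mut scored: Vec<(u32, f32)> = data
        .iter()
        .map(|(id, vector)| {
            // Squared Euclidean distance over all D dimensions.
            let dist: f32 = vector
                .iter()
                .zip(query)
                .map(|(a, b)| (a - b) * (a - b))
                .sum();
            (*id, dist)
        })
        .collect();

    // Keep only the K nearest records.
    scored.sort_by(|a, b| a.1.total_cmp(&b.1));
    scored.truncate(k);
    scored
}
```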
+
+### IVFPQ Index
+
+The IVFPQ (Inverted File with Product Quantization) Index is a more advanced
+index that uses a combination of inverted files and product quantization to
+speed up the nearest neighbor search while maintaining exceptional memory
+efficiency.
+
+Depending on the configuration, we can customize the index to meet the
+performance requirements for our use case. We can adjust the number of clusters,
+sub-quantizers, and other parameters to balance recall, memory usage, and search
+speed.
+
+??? note "Search Complexity: O(DK)"
+
+    - **D**: Dimensionality of the vectors.
+    - **K**: Number of vectors in the cluster to explore.
+
+    This calculation will vary depending on the number of clusters in the index
+    and clusters to explore during the search. This complexity also depends on
+    the time it takes to decode the quantized vectors.
+
+    Note: This is a rough estimation of the search complexity.
+
+## Fine-grained Operations
+
+When we run OasysDB as an embedded database directly in our application, we gain
+access to the low-level index implementations. These implementations allow us to
+have more fine-grained control over the data in the index.
+
+These are the most notable operations that we can perform with the index:
+
+- Creating a new index with custom parameters.
+- Building the index from a set of records.
+- Inserting records into the index incrementally.
+- Updating records in the index.
+- Deleting records from the index.
+- Searching for nearest neighbors in the index.
+- Persisting the index to a file.
+- Restoring the index from a file.
+
+These operations allow us to use an index implementation on its own, without
+having to rely on the Database interface. For more detailed documentation on the
+index implementations, please refer to OasysDB's Rust API documentation.
+
+
+[:fontawesome-brands-rust: Docs.rs](https://docs.rs/oasysdb/latest/oasysdb/){ .md-button .md-button--primary .odb-button }
diff --git a/mkdocs.yml b/mkdocs.yml
index 6cb69898..7aba3613 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -60,7 +60,7 @@ nav:
   - Documentation:
       - Introduction: index.md
       - Database: refs/database.md
-      - Index: refs/index.md
+      - Vector Index: refs/vector_index.md
 
   - Other:
       - Changelog: changelog.md

From df49647cebfc6231967830ea677c290ddbf6c09c Mon Sep 17 00:00:00 2001
From: Edwin Kys
Date: Wed, 31 Jul 2024 20:08:50 -0500
Subject: [PATCH 87/88] chore: improve cargo.toml metadata

---
 Cargo.toml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index c12e5b60..91ba5820 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -7,13 +7,13 @@ readme = "readme.md"
 
 # Information.
 authors = ["Edwin Kys", "Oasys"]
-description = "Fast & scalable vector store with ground-up hybrid ANN search support."
+description = "Hybrid vector store with SQL integration & multi-index support."
 homepage = "https://docs.oasysdb.com"
 repository = "https://github.com/oasysai/oasysdb"
 
 # Metadata.
-keywords = ["vector", "database", "anns", "search", "simd"] -categories = ["database", "algorithms", "data-structures"] +keywords = ["embedded", "vector", "database", "search", "anns"] +categories = ["database", "algorithms", "embedded"] [dependencies] uuid = { version = "1.9.1", features = ["v4", "fast-rng", "serde"] } From 84c2b191ab1bf0db78cdc42be76182607c81421e Mon Sep 17 00:00:00 2001 From: Edwin Kys Date: Wed, 31 Jul 2024 20:15:42 -0500 Subject: [PATCH 88/88] docs: improve community management docs --- docs/code_of_conduct.md | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/docs/code_of_conduct.md b/docs/code_of_conduct.md index 5037ebf8..7fa15020 100644 --- a/docs/code_of_conduct.md +++ b/docs/code_of_conduct.md @@ -115,18 +115,8 @@ community. ## Attribution This Code of Conduct is adapted from the [Contributor Covenant][homepage], -version 2.1, available at -[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][source]. - -Community Impact Guidelines were inspired by [Mozilla's code of conduct -enforcement ladder][mozilla_coc]. - -For answers to common questions about this code of conduct, see the FAQ at -[https://www.contributor-covenant.org/faq][faq]. Translations are available at -[https://www.contributor-covenant.org/translations][translations]. +version 2.1. The Community Impact Guidelines were inspired by [Mozilla's Code of +Conduct Enforcement Ladder][mozilla_coc]. [homepage]: https://www.contributor-covenant.org -[source]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html [mozilla_coc]: https://github.com/mozilla/diversity -[faq]: https://www.contributor-covenant.org/faq -[translations]: https://www.contributor-covenant.org/translations