From ea2db1bd26ba7142e2a527379ea4338379f57812 Mon Sep 17 00:00:00 2001
From: Dax Pryce
Date: Wed, 16 Oct 2024 23:55:41 +0000
Subject: [PATCH] This commit adds some basic memmap support to the
 `diskannpy.vectors_from_file` utility function. You can now return memory
 mapped np.array conformant view vs. requiring it be loaded fully into memory.

---
 pyproject.toml             |  2 +-
 python/src/_files.py       | 21 +++++++++++++++++----
 python/tests/test_files.py | 39 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 57 insertions(+), 5 deletions(-)
 create mode 100644 python/tests/test_files.py

diff --git a/pyproject.toml b/pyproject.toml
index f6a39cfe7..d65226288 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "diskannpy"
-version = "0.7.0"
+version = "0.7.1"
 
 description = "DiskANN Python extension module"
 readme = "python/README.md"
diff --git a/python/src/_files.py b/python/src/_files.py
index 1c9fa2103..d5d65c697 100644
--- a/python/src/_files.py
+++ b/python/src/_files.py
@@ -2,7 +2,7 @@
 # Licensed under the MIT license.
 
 import warnings
-from typing import BinaryIO, NamedTuple
+from typing import BinaryIO, Literal, NamedTuple
 
 import numpy as np
 import numpy.typing as npt
@@ -56,19 +56,32 @@ def vectors_to_file(vector_file: str, vectors: VectorLikeBatch) -> None:
         _write_bin(vectors, fh)
 
 
-def vectors_from_file(vector_file: str, dtype: VectorDType) -> npt.NDArray[VectorDType]:
+def vectors_from_file(
+    vector_file: str,
+    dtype: VectorDType,
+    use_memmap: bool = False,
+    mode: Literal["r", "r+"] = "r"
+) -> npt.NDArray[VectorDType]:
     """
     Read vectors from a DiskANN binary vector file.
 
     ### Parameters
     - **vector_file**: The path to the vector file to read the vectors from.
     - **dtype**: The data type of the vectors in the file. Ensure you match the data types exactly
+    - **use_memmap**: If True, return a np.memmap, else a standard np.ndarray will be returned
+    - **mode**: Read-only (r) or read-write (r+) (memmap only). Unlike np.memmap, default is read-only (r). Any other value raises ValueError
 
     ### Returns
-    `numpy.typing.NDArray[dtype]`
+    `numpy.typing.NDArray[dtype] | numpy.memmap`
     """
+    if mode not in ("r", "r+"):
+        raise ValueError(f"mode must be 'r' or 'r+', got {mode!r}")
     points, dims = vectors_metadata_from_file(vector_file)
-    return np.fromfile(file=vector_file, dtype=dtype, offset=8).reshape(points, dims)
+    if not use_memmap:
+        return np.fromfile(file=vector_file, dtype=dtype, offset=8).reshape(points, dims)
+    else:
+        # Same 8-byte offset as the np.fromfile path so both views skip the file header.
+        return np.memmap(vector_file, dtype=dtype, mode=mode, offset=8, shape=(points, dims), order='C')
 
 
 def tags_to_file(tags_file: str, tags: VectorIdentifierBatch) -> None:
diff --git a/python/tests/test_files.py b/python/tests/test_files.py
new file mode 100644
index 000000000..97bb0baed
--- /dev/null
+++ b/python/tests/test_files.py
@@ -0,0 +1,39 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT license.
+
+import unittest
+
+import numpy as np
+
+from fixtures import random_vectors, vectors_as_temp_file
+
+import diskannpy as dap
+
+
+class TestVectorsFromFile(unittest.TestCase):
+    def test_in_mem(self):
+        expected = random_vectors(10_000, 100, dtype=np.float32)
+        with vectors_as_temp_file(expected) as vecs_file:
+            actual = dap.vectors_from_file(vecs_file, dtype=np.float32)
+            self.assertTrue(np.array_equal(expected, actual))
+
+    def test_memmap(self):
+        expected = random_vectors(10_000, 100, dtype=np.float32)
+        with vectors_as_temp_file(expected) as vecs_file:
+            actual = dap.vectors_from_file(
+                vecs_file,
+                dtype=np.float32,
+                use_memmap=True
+            )
+            self.assertTrue(np.array_equal(expected, actual))
+            actual = dap.vectors_from_file(
+                vecs_file,
+                dtype=np.float32,
+                use_memmap=True,
+                mode="r+"
+            )
+            self.assertTrue(np.array_equal(expected, actual))
+
+
+if __name__ == '__main__':
+    unittest.main()