Skip to content

Commit

Permalink
This commit adds some basic memmap support to the `diskannpy.vectors_…
Browse files Browse the repository at this point in the history
…from_file` utility function. You can now return a memory-mapped, np.array-conformant view instead of requiring the data to be loaded fully into memory.
  • Loading branch information
daxpryce committed Oct 16, 2024
1 parent 6f2691c commit ea2db1b
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 5 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "diskannpy"
version = "0.7.0"
version = "0.7.1"

description = "DiskANN Python extension module"
readme = "python/README.md"
Expand Down
19 changes: 15 additions & 4 deletions python/src/_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Licensed under the MIT license.

import warnings
from typing import BinaryIO, NamedTuple
from typing import BinaryIO, Literal, NamedTuple

import numpy as np
import numpy.typing as npt
Expand Down Expand Up @@ -56,19 +56,30 @@ def vectors_to_file(vector_file: str, vectors: VectorLikeBatch) -> None:
_write_bin(vectors, fh)


def vectors_from_file(
    vector_file: str,
    dtype: VectorDType,
    use_memmap: bool = False,
    mode: Literal["r", "r+"] = "r",
) -> npt.NDArray[VectorDType]:
    """
    Read vectors from a DiskANN binary vector file.

    ### Parameters
    - **vector_file**: The path to the vector file to read the vectors from.
    - **dtype**: The data type of the vectors in the file. Ensure you match the data types exactly
    - **use_memmap**: If True, return a np.memmap, else a standard np.ndarray will be returned
    - **mode**: Read-only (r) or read-write (r+) (memmap only). Unlike np.memmap, default is read-only (r).
      The value is validated even when `use_memmap` is False.

    ### Returns
    `numpy.typing.NDArray[dtype] | numpy.memmap`, shaped `(points, dims)` as reported by
    `vectors_metadata_from_file`.

    ### Raises
    `ValueError` if **mode** is anything other than `"r"` or `"r+"`.
    """
    # Validate explicitly rather than with `assert`: asserts are stripped under `python -O`,
    # and np.memmap also accepts modes such as "w+" that would overwrite the file's contents.
    if mode not in ("r", "r+"):
        raise ValueError(f'mode must be "r" or "r+", got {mode!r}')

    points, dims = vectors_metadata_from_file(vector_file)

    # The vector payload starts 8 bytes into the file. Both read paths must skip the same
    # number of bytes, otherwise the memmap view is misaligned with the in-memory result.
    # NOTE(review): the 8 bytes are presumably the points/dims header parsed by
    # vectors_metadata_from_file -- confirm against that helper.
    data_offset = 8

    if use_memmap:
        return np.memmap(
            vector_file,
            dtype=dtype,
            mode=mode,
            offset=data_offset,
            shape=(points, dims),
            order="C",
        )
    return np.fromfile(file=vector_file, dtype=dtype, offset=data_offset).reshape(points, dims)


def tags_to_file(tags_file: str, tags: VectorIdentifierBatch) -> None:
Expand Down
39 changes: 39 additions & 0 deletions python/tests/test_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.

import unittest

import numpy as np

from fixtures import random_vectors, vectors_as_temp_file

import diskannpy as dap


class TestVectorsFromFile(unittest.TestCase):
    """Round-trip tests for `diskannpy.vectors_from_file` in both loading modes."""

    def test_in_mem(self):
        """Vectors loaded fully into memory must equal the vectors written to the file."""
        expected = random_vectors(10_000, 100, dtype=np.float32)
        with vectors_as_temp_file(expected) as vecs_file:
            actual = dap.vectors_from_file(vecs_file, dtype=np.float32)
            # np.array_equal compares shape and every element. The builtin all() over a
            # multi-dimensional boolean array raises "truth value ... is ambiguous" instead.
            self.assertTrue(np.array_equal(expected, actual))

    def test_memmap(self):
        """Memory-mapped views (read-only and read-write) must equal the written vectors."""
        expected = random_vectors(10_000, 100, dtype=np.float32)
        with vectors_as_temp_file(expected) as vecs_file:
            # Compare while the temp file still exists: a memmap reads from disk lazily.
            actual = dap.vectors_from_file(
                vecs_file,
                dtype=np.float32,
                use_memmap=True,
            )
            self.assertTrue(np.array_equal(expected, actual))
            actual = dap.vectors_from_file(
                vecs_file,
                dtype=np.float32,
                use_memmap=True,
                mode="r+",
            )
            self.assertTrue(np.array_equal(expected, actual))


# Allow this test module to be executed directly as a script.
if __name__ == "__main__":
    unittest.main()

0 comments on commit ea2db1b

Please sign in to comment.