diff --git a/README.md b/README.md index f1d354f..49af607 100644 --- a/README.md +++ b/README.md @@ -1,106 +1,112 @@ # reach -[![Documentation Status](https://readthedocs.org/projects/reach/badge/?version=latest)](https://reach.readthedocs.io/en/latest/?badge=latest) [![PyPI version](https://badge.fury.io/py/reach.svg)](https://badge.fury.io/py/reach) [![Downloads](https://pepy.tech/badge/reach)](https://pepy.tech/project/reach) -A light-weight package for working with pre-trained word embeddings. -Useful for input into neural networks, or for doing compositional semantics. +# Table of contents -`reach` can read in word vectors in `word2vec` or `glove` format without -any preprocessing. +1. [Quickstart](#quickstart) +2. [What do I use it for?](#what-do-i-use-it-for) +3. [Examples](#examples) -The assumption behind `reach` is a no-hassle approach to featurization. The -vectorization and bow approaches know how to deal with OOV words, removing -these problems from your code. +Reach is the lightest-weight vector store. Just put in some vectors, calculate query vectors, and off you go. -`reach` also includes nearest neighbor calculation for arbitrary vectors. +## Quickstart -## Installation - -If you just want `reach`: - -``` +```bash pip install reach ``` -## Example +Assume you've got some vectors and a model. We'll assume you have a nice [model2vec](https://github.com/MinishLab/model2vec) model. ```python -import numpy as np - +from model2vec import StaticModel from reach import Reach -# Load from a .vec or .txt file -# unk_word specifies which token is the "unknown" token. -# If this is token is not in your vector space, it is added as an extra word -# and a corresponding zero vector. -# If it is in your embedding space, it is used. 
-r = Reach.load("path/to/embeddings", unk_word="UNK") +model = StaticModel.from_pretrained("minishlab/m2v_output_base") +texts = ["dog walked home", "cat walked home", "robot was in his lab"] +vectors = model.encode(texts) + +r = Reach(vectors, texts) +r.most_similar(texts[0]) -# Alternatively, if you have a matrix, you can directly -# input it. +new_text = "robot went to his house" +similarities = r.nearest_neighbor(model.encode(new_text)) -# Stand-in for word embeddings -mtr = np.random.randn(8, 300) -words = ["UNK", "cat", "dog", "best", "creature", "alive", "span", "prose"] -r = Reach(mtr, words, unk_index=0) +print(similarities) -# Get vectors through indexing. -# Throws a KeyError if a word is not present. -vector = r['cat'] +# Store the vector space +r.save("tempo.json") +# Load it again +new_reach = Reach.load("tempo.json") -# Compare two words. -similarity = r.similarity('cat', 'dog') +``` -# Find most similar. -similarities = r.most_similar('cat', 2) +And that's it! -sentence = 'a dog is the best creature alive'.split() -corpus = [sentence, sentence, sentence] +## What do I use it for? -# bow representation consistent with word vectors, -# for input into neural network. -bow = r.bow(sentence) +Reach is an extremely simple but extremely fast vector store. No magic here, it just uses numpy really effectively to obtain impressive speeds. Reach will be fast enough for your RAG projects up to roughly 1M vectors, after which you may have to switch to something heavier. -# vectorized representation. -vectorized = r.vectorize(sentence) +Reach is designed to load really quickly from disk, see below, making it ideal for just-in-time projects, such as querying texts on the fly. No need to keep a heavy vector database running, just load your reach, do the computation, and then throw it away. -# can remove OOV words automatically. -vectorized = r.vectorize(sentence, remove_oov=True) +# Examples -# Can mean pool out of the box. 
-mean = r.mean_pool(sentence) -# Automatically take care of incorrect sentences -# these are set to the vector of the UNK word, or a vector of zeros. -corpus_mean = r.mean_pool_corpus([sentence, sentence, ["not_a_word"]], remove_oov=True, safeguard=False) +Here are some examples and benchmarks. -# vectorize corpus. -transformed = r.transform(corpus) +## Retrieval -# Get nearest words to arbitrary vector -nearest = r.nearest_neighbor(np.random.randn(1, 300)) +For your RAG system, you need fast retrieval. We got it! + +```python +import numpy as np +from reach import Reach -# Get every word within a certain threshold -thresholded = r.threshold("cat", threshold=.0) +dummy_words = list(map(str, range(100_000))) +dummy_vector = np.random.randn(100_000, 768) +r = Reach(dummy_vector, dummy_words) + +# Query with a single vector +x = np.random.randn(768) +%timeit r.nearest_neighbor(x) +# 6.8 ms ± 286 μs per loop (mean ± std. dev. of 7 runs, 100 loops each) + +# Query reach with 10 vectors +x = np.random.randn(10, 768) +%timeit r.nearest_neighbor(x) +# 27.5 ms ± 187 μs per loop (mean ± std. dev. of 7 runs, 10 loops each) +# 2.7 ms per vector + +# 100 vectors. +x = np.random.randn(100, 768) +%timeit r.nearest_neighbor(x) +# 143 ms ± 943 μs per loop (mean ± std. dev. of 7 runs, 10 loops each) +# 1.4 ms per vector ``` -## Loading and saving +# Saving and loading -`reach` has many options for saving and loading files, including custom separators, custom number of dimensions, loading a custom wordlist, custom number of words, and error recovery. One difference between `gensim` and `reach` is that `reach` loads both GloVe-style .vec files and regular word2vec files. Unlike `gensim`, `reach` does not support loading binary files. +No need to keep a vector database in memory, or on some server. Just load and save your thing whenever you need it. 
-### benchmark +```python +import numpy as np +from reach import Reach -On my machine (a 2022 M1 macbook pro), we get the following times for [`COW BIG`](https://github.com/clips/dutchembeddings), a file containing about 3 million rows and 320 dimensions. +dummy_words = list(map(str, range(100_000))) +dummy_vector = np.random.randn(100_000, 768) +r = Reach(dummy_vector, dummy_words) -| System | Time (7 loops) | -|--------|-------------------| -| Gensim | 3min 57s ± 344 ms | -| reach | 2min 14s ± 4.09 s | +# Loading from disk +r.save("temp.json") +%timeit Reach.load("temp.json") +# 79.9 ms ± 1.22 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) +``` -## Fast format +## Installation -`reach` has a special fast format, which is useful if you want to reload your word vectors often. The fast format can be created using the `save_fast_format` function, and loaded using the `load_fast_format` function. This is about equivalent to saving word vectors in `gensim`'s own format in terms of loading speed. +``` +pip install reach +``` # License diff --git a/reach/reach.py b/reach/reach.py index 0bd354f..281e996 100644 --- a/reach/reach.py +++ b/reach/reach.py @@ -182,7 +182,7 @@ def insert(self, tokens: Sequence[str], vectors: npt.NDArray | None = None) -> N self.vectors = np.concatenate([self.vectors, vectors], 0) @classmethod - def load( + def load_word2vec_format( cls, vector_file: File | str, wordlist: Sequence[str] | None = None, @@ -717,7 +717,7 @@ def union(self, other: Reach, check: bool = True) -> Reach: return Reach(np.stack(vectors), union, name=self.name) - def save(self, path: str, write_header: bool = True) -> None: + def save_word2vec_format(self, path: str, write_header: bool = True) -> None: """ Save the current vector space in word2vec format. 
@@ -736,7 +736,7 @@ def save(self, path: str, write_header: bool = True) -> None: vec_string = " ".join([str(x) for x in vec]) f.write(f"{w} {vec_string}\n") - def save_fast_format( + def save( self, path: PathLike, overwrite: bool = False, @@ -780,7 +780,7 @@ def save_fast_format( np.save(file_handle, self.vectors) @classmethod - def load_fast_format(cls, filename: PathLike, desired_dtype: Dtype = "float32") -> Reach: + def load(cls, filename: PathLike, desired_dtype: Dtype | None = None) -> Reach: """ Load a reach instance in fast format. @@ -810,7 +810,8 @@ def load_fast_format(cls, filename: PathLike, desired_dtype: Dtype = "float32") with open(numpy_path, "rb") as file_handle: vectors: npt.NDArray = np.load(file_handle) - vectors = vectors.astype(desired_dtype) + if desired_dtype is not None and vectors.dtype != np.dtype(desired_dtype): + vectors = vectors.astype(desired_dtype) instance = cls(vectors, items, name=name, metadata=metadata) instance.unk_token = unk_token diff --git a/tests/test_io.py b/tests/test_io.py index bdd4f18..7dbf401 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -25,11 +25,11 @@ def test_truncation(self) -> None: lines = self.lines() tempfile.write(lines) tempfile.seek(0) - instance = Reach.load(tempfile.name, truncate_embeddings=2) + instance = Reach.load_word2vec_format(tempfile.name, truncate_embeddings=2) self.assertEqual(instance.size, 2) self.assertEqual(len(instance), 6) - instance = Reach.load(tempfile.name, truncate_embeddings=100) + instance = Reach.load_word2vec_format(tempfile.name, truncate_embeddings=100) self.assertEqual(instance.size, 5) self.assertEqual(len(instance), 6) @@ -39,11 +39,11 @@ def test_wordlist(self) -> None: lines = self.lines() tempfile.write(lines) tempfile.seek(0) - instance = Reach.load(tempfile.name, wordlist=("shredder", "krang")) + instance = Reach.load_word2vec_format(tempfile.name, wordlist=("shredder", "krang")) self.assertEqual(len(instance), 2) with self.assertRaises(ValueError): - 
instance = Reach.load(tempfile.name, wordlist=("doggo",)) + instance = Reach.load_word2vec_format(tempfile.name, wordlist=("doggo",)) def test_duplicate(self) -> None: """Test duplicates in a wordlist.""" @@ -55,8 +55,8 @@ def test_duplicate(self) -> None: tempfile.seek(0) with self.assertRaises(ValueError): - Reach.load(tempfile.name, recover_from_errors=False) - instance = Reach.load(tempfile.name, recover_from_errors=True) + Reach.load_word2vec_format(tempfile.name, recover_from_errors=False) + instance = Reach.load_word2vec_format(tempfile.name, recover_from_errors=True) self.assertEqual(len(instance), 5) def test_unk(self) -> None: @@ -65,16 +65,16 @@ def test_unk(self) -> None: lines = self.lines() tempfile.write(lines) tempfile.seek(0) - instance = Reach.load(tempfile.name, unk_token=None) + instance = Reach.load_word2vec_format(tempfile.name, unk_token=None) self.assertEqual(instance._unk_index, None) desired_dtype = "float32" - instance = Reach.load(tempfile.name, unk_token="[UNK]", desired_dtype=desired_dtype) + instance = Reach.load_word2vec_format(tempfile.name, unk_token="[UNK]", desired_dtype=desired_dtype) self.assertEqual(instance._unk_index, 6) self.assertEqual(instance.items["[UNK]"], instance._unk_index) self.assertEqual(instance.vectors.dtype, desired_dtype) - instance = Reach.load(tempfile.name, unk_token="splinter") + instance = Reach.load_word2vec_format(tempfile.name, unk_token="splinter") self.assertEqual(instance._unk_index, 2) self.assertEqual(instance.items["splinter"], instance._unk_index) @@ -84,13 +84,13 @@ def test_limit(self) -> None: lines = self.lines() tempfile.write(lines) tempfile.seek(0) - instance = Reach.load(tempfile.name, num_to_load=2) + instance = Reach.load_word2vec_format(tempfile.name, num_to_load=2) self.assertEqual(len(instance), 2) with self.assertRaises(ValueError): - instance = Reach.load(tempfile.name, num_to_load=-1) + instance = Reach.load_word2vec_format(tempfile.name, num_to_load=-1) - instance = 
Reach.load(tempfile.name, num_to_load=10000) + instance = Reach.load_word2vec_format(tempfile.name, num_to_load=10000) self.assertEqual(len(instance), 6) def test_sep(self) -> None: @@ -99,13 +99,13 @@ def test_sep(self) -> None: lines = self.lines(sep=",") tempfile.write(lines) tempfile.seek(0) - Reach.load(tempfile.name, sep=",") + Reach.load_word2vec_format(tempfile.name, sep=",") with NamedTemporaryFile(mode="w+") as tempfile: lines = self.lines(False, sep=",") tempfile.write(lines) tempfile.seek(0) - Reach.load(tempfile.name, sep=",") + Reach.load_word2vec_format(tempfile.name, sep=",") def test_corrupted_file(self) -> None: """Test whether a corrupted file loads.""" @@ -117,9 +117,9 @@ def test_corrupted_file(self) -> None: tempfile.seek(0) with self.assertRaises(ValueError): - instance = Reach.load(tempfile.name) + instance = Reach.load_word2vec_format(tempfile.name) - instance = Reach.load(tempfile.name, recover_from_errors=True) + instance = Reach.load_word2vec_format(tempfile.name, recover_from_errors=True) self.assertEqual(instance.size, 4) self.assertEqual(len(instance.items), 1) self.assertEqual(instance.vectors.shape, (1, 4)) @@ -132,9 +132,9 @@ def test_corrupted_file(self) -> None: tempfile.seek(0) with self.assertRaises(ValueError): - instance = Reach.load(tempfile.name) + instance = Reach.load_word2vec_format(tempfile.name) - instance = Reach.load(tempfile.name, recover_from_errors=True) + instance = Reach.load_word2vec_format(tempfile.name, recover_from_errors=True) self.assertEqual(instance.size, 5) self.assertEqual(len(instance.items), 5) self.assertEqual(instance.vectors.shape, (5, 5)) @@ -147,9 +147,9 @@ def test_corrupted_file(self) -> None: tempfile.seek(0) with self.assertRaises(ValueError): - instance = Reach.load(tempfile.name) + instance = Reach.load_word2vec_format(tempfile.name) - instance = Reach.load(tempfile.name, recover_from_errors=True) + instance = Reach.load_word2vec_format(tempfile.name, recover_from_errors=True) 
self.assertEqual(instance.size, 5) self.assertEqual(len(instance.items), 5) self.assertEqual(instance.vectors.shape, (5, 5)) @@ -161,7 +161,7 @@ def test_load_from_file_without_header(self) -> None: tempfile.write(lines) tempfile.seek(0) - instance = Reach.load(tempfile.name) + instance = Reach.load_word2vec_format(tempfile.name) self.assertEqual(instance.size, 5) self.assertEqual(len(instance.items), 6) self.assertEqual(instance.vectors.shape, (6, 5)) @@ -171,28 +171,28 @@ def test_load_from_file_without_header(self) -> None: for item, index in instance.items.items(): self.assertEqual(instance.indices[index], item) - instance = Reach.load(tempfile.name, num_to_load=3) + instance = Reach.load_word2vec_format(tempfile.name, num_to_load=3) self.assertEqual(instance.size, 5) self.assertEqual(len(instance.items), 3) self.assertEqual(instance.vectors.shape, (3, 5)) - instance = Reach.load(tempfile.name) + instance = Reach.load_word2vec_format(tempfile.name) with open(tempfile.name) as f: - instance_from_file = Reach.load(f) + instance_from_file = Reach.load_word2vec_format(f) self.assertEqual(instance.size, instance_from_file.size) self.assertTrue(np.all(instance.vectors == instance_from_file.vectors)) self.assertEqual(instance.name, instance_from_file.name) - instance_from_path = Reach.load(Path(tempfile.name)) + instance_from_path = Reach.load_word2vec_format(Path(tempfile.name)) self.assertEqual(instance.size, instance_from_path.size) self.assertTrue(np.all(instance.vectors == instance_from_path.vectors)) self.assertEqual(instance.name, instance_from_path.name) with self.assertRaises(ValueError): - instance = Reach.load(tempfile.name, num_to_load=0) + instance = Reach.load_word2vec_format(tempfile.name, num_to_load=0) with self.assertRaises(ValueError): - instance = Reach.load(tempfile.name, num_to_load=-1) + instance = Reach.load_word2vec_format(tempfile.name, num_to_load=-1) def test_load_from_file_with_header(self) -> None: """Test whether we can load without 
headers.""" @@ -201,7 +201,7 @@ def test_load_from_file_with_header(self) -> None: tempfile.write(lines) tempfile.seek(0) - instance = Reach.load(tempfile.name) + instance = Reach.load_word2vec_format(tempfile.name) self.assertEqual(instance.size, 5) self.assertEqual(len(instance.items), 6) self.assertEqual(instance.vectors.shape, (6, 5)) @@ -211,28 +211,28 @@ def test_load_from_file_with_header(self) -> None: for item, index in instance.items.items(): self.assertEqual(instance.indices[index], item) - instance = Reach.load(tempfile.name, num_to_load=3) + instance = Reach.load_word2vec_format(tempfile.name, num_to_load=3) self.assertEqual(instance.size, 5) self.assertEqual(len(instance.items), 3) self.assertEqual(instance.vectors.shape, (3, 5)) - instance = Reach.load(tempfile.name) + instance = Reach.load_word2vec_format(tempfile.name) with open(tempfile.name) as f: - instance_from_file = Reach.load(f) + instance_from_file = Reach.load_word2vec_format(f) self.assertEqual(instance.size, instance_from_file.size) self.assertTrue(np.all(instance.vectors == instance_from_file.vectors)) self.assertEqual(instance.name, instance_from_file.name) - instance_from_path = Reach.load(Path(tempfile.name)) + instance_from_path = Reach.load_word2vec_format(Path(tempfile.name)) self.assertEqual(instance.size, instance_from_path.size) self.assertTrue(np.all(instance.vectors == instance_from_path.vectors)) self.assertEqual(instance.name, instance_from_path.name) with self.assertRaises(ValueError): - instance = Reach.load(tempfile.name, num_to_load=0) + instance = Reach.load_word2vec_format(tempfile.name, num_to_load=0) with self.assertRaises(ValueError): - instance = Reach.load(tempfile.name, num_to_load=-1) + instance = Reach.load_word2vec_format(tempfile.name, num_to_load=-1) def test_save_load_fast_format(self) -> None: """Test the saving and loading of the fast format.""" @@ -246,10 +246,10 @@ def test_save_load_fast_format(self) -> None: tempfile.write(lines) tempfile.seek(0) - 
instance = Reach.load(temp_file_name) + instance = Reach.load_word2vec_format(temp_file_name) fast_format_file = temp_folder_path / "temp.reach" - instance.save_fast_format(fast_format_file) - instance_2 = Reach.load_fast_format(fast_format_file) + instance.save(fast_format_file) + instance_2 = Reach.load(fast_format_file) self.assertEqual(instance.size, instance_2.size) self.assertEqual(len(instance), len(instance_2)) @@ -264,10 +264,10 @@ def test_save_load(self) -> None: tempfile.write(lines) tempfile.seek(0) - instance = Reach.load(tempfile.name) + instance = Reach.load_word2vec_format(tempfile.name) # We know for sure that this writeable. - instance.save(tempfile.name) - instance_2 = Reach.load(tempfile.name) + instance.save_word2vec_format(tempfile.name) + instance_2 = Reach.load_word2vec_format(tempfile.name) self.assertEqual(instance.size, instance_2.size) self.assertEqual(len(instance), len(instance_2))