diff --git a/README.md b/README.md
index 48d4277..29e4e32 100644
--- a/README.md
+++ b/README.md
@@ -113,62 +113,6 @@ On my machine (a 2022 M1 macbook pro), we get the following times for [`COW BIG`
 
 `reach` has a special fast format, which is useful if you want to reload your word vectors often. The fast format can be created using the `save_fast_format` function and loaded using the `load_fast_format` function (a minimal example follows below). In terms of loading speed, this is roughly equivalent to saving word vectors in `gensim`'s own format.
 
-# autoreach
-
-Reach also has a way of automatically inferring words from strings without using a pre-defined tokenizer, i.e., without splitting the string into words. This is useful because there might be mismatches between the tokenizer you happen to have on hand and the word vectors you use. For example, if your vector space contains an embedding for the word `"it's"`, and your tokenizer splits this string into two tokens, `["it", "'s"]`, the embedding for `"it's"` will never be found.
-
-autoreach solves this problem by only finding words from your pre-defined vocabulary in a string, thus removing the need for any tokenization. We use the [aho-corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm), which allows us to find substrings in linear time. The downside of using aho-corasick is that it also finds substrings of regular words. For example, the word `the` will be found as a substring of `these`. To circumvent this, we perform a regex-based clean-up step.
-
-**Warning! The clean-up step involves checking for surrounding spaces and punctuation marks. Hence, if the language for which you use Reach does not actually use spaces and/or punctuation marks to designate word boundaries, the entire process might not work.**
-
-### Example
-
-```python
-import numpy as np
-
-from reach import AutoReach
-
-words = ["dog", "walked", "home"]
-vectors = np.random.randn(3, 32)
-
-r = AutoReach(vectors, words)
-
-sentence = "The dog, walked, home"
-bow = r.bow(sentence)
-
-found_words = [r.indices[index] for index in bow]
-```
-
-### Benchmark
-
-Because we no longer need to tokenize, `AutoReach` can be many times faster than a conventional tokenizer. In this benchmark, we compare it to plain `str.split` and to `nltk`'s `word_tokenize` function.
-
-We will use the entirety of Mary Shelley's Frankenstein, which you can find [here](https://www.gutenberg.org/cache/epub/42324/pg42324.txt), and the glove.6B.100d vectors, which you can find [here](https://nlp.stanford.edu/data/glove.6B.zip).
-
-```python
-from pathlib import Path
-
-from nltk import word_tokenize
-
-from reach import AutoReach, Reach
-
-txt = Path("pg42324.txt").read_text().lower()
-normal_reach = Reach.load("glove.6B.100d.txt")
-auto_reach = AutoReach.load("glove.6B.100d.txt")
-
-# IPython magic commands
-%timeit normal_reach.vectorize(word_tokenize(txt), remove_oov=True)
-# 345 ms ± 3.42 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
-%timeit normal_reach.vectorize(txt.split(), remove_oov=True)
-# 25.4 ms ± 132 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
-%timeit auto_reach.vectorize(txt)
-# 69.9 ms ± 237 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
-```
-
-As you can see, the tokenizer introduces significant overhead compared to just splitting, while the aho-corasick-based approach is still reasonably fast.
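 
 For the fast format described above, here is a minimal sketch of the round trip (the `glove_fast` path is illustrative, and we assume both functions take a plain path; check the docstrings for the exact signatures):
 
 ```python
 from reach import Reach
 
 r = Reach.load("glove.6B.100d.txt")
 
 # Write the vectors and items in the fast format.
 r.save_fast_format("glove_fast")
 
 # Reloading from the fast format avoids re-parsing the original text file.
 r = Reach.load_fast_format("glove_fast")
 ```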
-
 # License
 
 MIT
diff --git a/pyproject.toml b/pyproject.toml
index 9cc9d30..091f47c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,7 +13,6 @@ target-version = "py311"
 [[tool.mypy.overrides]]
 module = [
     "tqdm.*",
-    "ahocorasick.*",
     "setuptools.*",
 ]
 ignore_missing_imports = true
diff --git a/reach/autoreach.py b/reach/autoreach.py
deleted file mode 100644
index a7c24da..0000000
--- a/reach/autoreach.py
+++ /dev/null
@@ -1,124 +0,0 @@
-import re
-from string import punctuation
-
-try:
-    from ahocorasick import Automaton
-except ImportError as exc:
-    raise ImportError(
-        "pyahocorasick is not installed. Please reinstall reach with `pip install"
-        " reach[auto]`"
-    ) from exc
-
-from reach.reach import Matrix, Reach, Tokens
-
-PUNCT = set(punctuation)
-SPACE = set("\n \t")
-ALLOWED = PUNCT | SPACE
-PUNCT_REGEX = re.compile(r"\W+")
-
-
-class AutoReach(Reach):
-    """
-    A Reach variant that does not require tokenization.
-
-    It uses the aho-corasick algorithm to build an automaton, which is then
-    used to find candidates in strings. These candidates are then selected
-    using a "word rule" (see is_valid_token). This rule only works for
-    languages that delimit words using spaces. If this is not the case,
-    please subclass this and write rules that fit your language of choice.
-
-    Parameters
-    ----------
-    vectors : numpy array
-        The vector space.
-    items : list
-        A list of items. Length must be equal to the number of vectors, and
-        aligned with the vectors.
-    lowercase : bool or str
-        This determines whether the string should be lowercased or not before
-        searching it. If this is set to 'auto', the items in the vector space
-        are used to determine whether this attribute should be true or false.
-    name : string, optional, default ''
-        A string giving the name of the current reach. Only useful if you
-        have multiple spaces and want to keep track of them.
-    unk_index : int or None, optional, default None
-        The index of the UNK item. If this is None, any attempts at vectorizing
-        OOV items will throw an error.
-
-    Attributes
-    ----------
-    unk_index : int
-        The integer index of your unknown glyph. This glyph will be inserted
-        into your BoW space whenever an unknown item is encountered.
-    name : string
-        The name of the Reach instance.
-
-    """
-
-    def __init__(
-        self,
-        vectors: Matrix,
-        items: list[str],
-        lowercase: str | bool = "auto",
-        name: str = "",
-    ) -> None:
-        """Initialize a Reach instance with an array and list of strings."""
-        super().__init__(vectors, items, name)
-        self.automaton = Automaton()
-        if not all(isinstance(item, str) for item in self.items):
-            raise ValueError("All your items should be strings.")
-        for item, index in self.items.items():
-            self.automaton.add_word(item, (item, index))
-        self.automaton.make_automaton()
-        if lowercase == "auto":
-            # NOTE: the type ignore is safe because we just checked that all
-            # items are strings.
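-            # If every item in the vocabulary is already lowercase, assume
-            # that incoming strings should be lowercased as well.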
-            lowercase = all(
-                [item == item.lower() for item in self.items]  # type: ignore
-            )
-        self._lowercase = bool(lowercase)
-
-    @property
-    def lowercase(self) -> bool:
-        """Whether to lowercase a string before searching it."""
-        return self._lowercase
-
-    def is_valid_token(self, token: str, tokens: str, end_index: int) -> bool:
-        """Check whether a token is valid in the current context."""
-        if PUNCT_REGEX.fullmatch(token):
-            return True
-
-        if tokens[end_index + 1] not in ALLOWED:
-            return False
-        if tokens[end_index - len(token)] not in ALLOWED:
-            return False
-
-        return True
-
-    def bow(self, tokens: Tokens, remove_oov: bool = True) -> list[int]:
-        """
-        Create a BOW representation from a string.
-
-        Parameters
-        ----------
-        tokens : str
-            The string from which to extract in-vocabulary tokens.
-        remove_oov : bool
-            Not used.
-
-        Returns
-        -------
-        bow : list
-            A BOW representation of the string.
-
-        """
-        if not isinstance(tokens, str):
-            raise ValueError("You did not pass a string.")
-        out = []
-        tokens = f" {tokens} "
-        if self.lowercase:
-            tokens = tokens.lower()
-        for end_index, (token, index) in self.automaton.iter_long(tokens):
-            if self.is_valid_token(token, tokens, end_index):
-                out.append(index)
-
-        return out
diff --git a/reach/reach.py b/reach/reach.py
index 382f4a2..679e61a 100644
--- a/reach/reach.py
+++ b/reach/reach.py
@@ -1006,8 +1006,9 @@ def save_fast_format(
         metadata = {
             "unk_token": self.unk_token,
             "name": self.name,
-            **(additional_metadata or {}),
         }
+        if additional_metadata is not None:
+            metadata.update(additional_metadata)
 
         items = self.sorted_items
         items_dict = {
diff --git a/requirements.txt b/requirements.txt
index fde6382..4fd953d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,2 @@
 numpy
 tqdm
-pyahocorasick
diff --git a/setup.py b/setup.py
index 5d2b141..90fd40e 100644
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,6 @@
     license="MIT",
     packages=find_packages(include=["reach"]),
     install_requires=["numpy", "tqdm"],
-    extras_require={"auto": ["pyahocorasick"]},
     project_urls={
         "Source Code": "https://github.com/stephantul/reach",
         "Issue Tracker": "https://github.com/stephantul/reach/issues",
diff --git a/tests/test_auto.py b/tests/test_auto.py
deleted file mode 100644
index 8c9baf6..0000000
--- a/tests/test_auto.py
+++ /dev/null
@@ -1,101 +0,0 @@
-import unittest
-
-import numpy as np
-
-from reach import AutoReach, Reach
-
-
-class TestAuto(unittest.TestCase):
-    def data(self) -> tuple[list[str], np.ndarray]:
-        words: list[str] = [
-            "donatello",
-            "leonardo",
-            "raphael",
-            "michelangelo",
-            "splinter",
-            "hideout",
-        ]
-        random_generator = np.random.RandomState(seed=44)
-        vectors = random_generator.standard_normal((6, 50))
-
-        return words, vectors
-
-    def test_load(self) -> None:
-        words, vectors = self.data()
-        instance = AutoReach(vectors, words)
-
-        self.assertEqual(len(instance.automaton), len(words))
-
-        normal_instance = Reach(vectors, words)
-
-        self.assertEqual(instance.items, normal_instance.items)
-        self.assertTrue(np.allclose(instance.vectors, normal_instance.vectors))
-
-    def test_valid(self) -> None:
-        words, vectors = self.data()
-        instance = AutoReach(vectors, words)
-
-        self.assertTrue(
-            instance.is_valid_token("hideout", "the hideout was hidden", 10)
-        )
-        self.assertTrue(
-            instance.is_valid_token("hideout", "the hideout, was hidden", 10)
-        )
-        self.assertTrue(
-            instance.is_valid_token("hideout", "the ,hideout, was hidden", 11)
-        )
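-        # A match that is directly followed by a letter is a substring of a
-        # longer word, so "hideout" inside "hideouts" must be rejected.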
-        self.assertFalse(
-            instance.is_valid_token("hideout", "the hideouts was hidden", 10)
-        )
-
-        # Punctuation tokens are always valid
-        self.assertTrue(instance.is_valid_token(",", "the ,hideouts", 4))
-        self.assertTrue(instance.is_valid_token(",", "the ,,,hideouts", 4))
-
-        # Punctuation is allowed in tokens
-        self.assertTrue(
-            instance.is_valid_token("hide-out", "the hide-out was hidden", 11)
-        )
-        self.assertTrue(
-            instance.is_valid_token("etc.", "we like this and that,etc....", 25)
-        )
-
-    def test_lower(self) -> None:
-        words, vectors = self.data()
-        instance = AutoReach(vectors, words, lowercase=False)
-        self.assertFalse(instance.lowercase)
-
-        instance = AutoReach(vectors, words, lowercase=True)
-        self.assertTrue(instance.lowercase)
-
-        instance = AutoReach(vectors, words, lowercase="auto")
-        self.assertTrue(instance.lowercase)
-
-        words[0] = words[0].title()  # type: ignore
-        instance = AutoReach(vectors, words, lowercase="auto")
-        self.assertFalse(instance.lowercase)
-
-    def test_bow(self) -> None:
-        words, vectors = self.data()
-        instance = AutoReach(vectors, words)
-
-        result = instance.bow(
-            "leonardo, raphael, and the other turtles were in their hideout"
-        )
-        self.assertEqual(len(result), 3)
-        self.assertEqual(result, [1, 2, 5])
-
-    def test_vectorize(self) -> None:
-        words, vectors = self.data()
-        instance = AutoReach(vectors, words)
-
-        result = instance.bow(
-            "leonardo, raphael, and the other turtles were in their hideout"
-        )
-
-        vecs = instance.vectors[result]
-        vecs2 = instance.vectorize(
-            "leonardo, raphael, and the other turtles were in their hideout"
-        )
-
-        self.assertTrue(np.allclose(vecs, vecs2))