oborchers · AleMuzzi · Jan 25, 2022 · Jan 25, 2022 · Jan 26, 2022 · Jan 26, 2022
diff --git a/README.md b/README.md
@@ -151,7 +151,7 @@ gensim.models.keyedvectors.BaseKeyedVectors class, for example *Word2Vec* or *Fa
 ```
 from gensim.models import FastText
 sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
-ft = FastText(sentences, min_count=1, size=10)
+ft = FastText(sentences, min_count=1, vector_size=10)
 
 from fse import Average, IndexedList
 model = Average(ft)

diff --git a/fse/models/Idf.py b/fse/models/Idf.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Author: Alessandro Muzzi <[email protected]>
+# Copyright (C) 2021 Alessandro Muzzi
+from typing import List
+
+from math import log
+from fse.models.average import Average
+
+from gensim.models.keyedvectors import KeyedVectors
+
+from numpy import float32 as REAL
+import numpy as np
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class Idf(Average):
+    """Average model whose word weights are inverse document frequencies."""
+
+    def __init__(self, model: KeyedVectors, sv_mapfile_path: str = None,
+                 wv_mapfile_path: str = None, workers: int = 1, lang_freq: str = None):
+        """ Inverse document frequency (Idf)
+            Because the term "the" is so common, term frequency will tend to incorrectly emphasize documents
+            which happen to use the word "the" more frequently, without giving enough weight to the more meaningful terms "brown" and "cow".
+            The term "the" is not a good keyword to distinguish relevant and non-relevant documents and terms,
+            unlike the less-common words "brown" and "cow". Hence, an inverse document frequency factor is incorporated
+            which diminishes the weight of terms that occur very frequently in the document set and increases
+            the weight of terms that occur rarely. Karen Spärck Jones (1972) conceived a statistical interpretation
+            of term-specificity called Inverse Document Frequency (idf), which became a cornerstone of term weighting:
+                The specificity of a term can be quantified as an inverse function of the number of documents in which it occurs.
+
+        Parameters
+        ----------
+        model : :class:`~gensim.models.keyedvectors.KeyedVectors` or :class:`~gensim.models.word2vec.Word2Vec`
+            This object essentially contains the mapping between words and embeddings. To compute the sentence embeddings
+            the wv.key_to_index and wv.vectors elements are required.
+        sv_mapfile_path : str, optional
+            Optional path to store the sentence-vectors in for very large datasets. Used for memmap.
+        wv_mapfile_path : str, optional
+            Optional path to store the word-vectors in for very large datasets. Used for memmap.
+            Use sv_mapfile_path and wv_mapfile_path to train disk-to-disk without needing much ram.
+        workers : int, optional
+            Number of working threads, used for multithreading. For most tasks (few words in a sentence)
+            a value of 1 should be more than enough.
+        lang_freq : str, optional
+            Some pre-trained embeddings, i.e. "GoogleNews-vectors-negative300.bin", do not contain information about
+            the frequency of a word. As the frequency is required for estimating the word weights, we induce
+            frequencies into the "count" attribute of the word vectors based on :class:`~wordfreq`
+            If no frequency information is available, you can choose the language to estimate the frequency.
+            See https://github.com/LuminosoInsight/wordfreq
+        """
+        # Document frequencies: word -> number of training sentences containing it (see train()).
+        self.vocab = {}
+
+        super().__init__(
+            model=model, sv_mapfile_path=sv_mapfile_path, wv_mapfile_path=wv_mapfile_path,
+            workers=workers, lang_freq=lang_freq)
+
+    def _pre_train_calls(self, **kwargs):
+        """Function calls to perform before training: derive the Idf weights from the statistics """
+        self._compute_idf_weights(kwargs)
+
+    def _check_parameter_sanity(self):
+        """ Check the sanity of all parameters: raise ValueError if any word weight is negative """
+        if not (self.word_weights >= 0.).all():
+            raise ValueError("For Idf, all word weights must be >= 0")
+
+    def _compute_idf_weights(self, statistics):
+        """ Set word_weights to log10(total_sentences / document frequency) for every word
+
+        Words that were never counted keep a neutral weight of 1.
+        `statistics` must provide the key 'total_sentences'.
+        """
+        logger.info(f"pre-computing Idf weights for {len(self.wv)} words")
+
+        # index_to_key is ordered by vector index, so weights[i] lines up with wv.vectors[i].
+        counts = np.array([self.vocab.get(w, 0) for w in self.wv.index_to_key], dtype=np.float64)
+        weights = np.ones(len(counts), dtype=np.float64)
+        seen = counts > 0
+        weights[seen] = np.log10(statistics['total_sentences'] / counts[seen])
+        self.word_weights = weights.astype(REAL)
+
+    def train(self, sentences: List[tuple] = None, update: bool = False, queue_factor: int = 2, report_delay: int = 5) -> [int, int]:
+        """ Count document frequencies, then delegate to the parent train()
+
+        `sentences` is iterated here and passed on to the parent class afterwards, so it must be
+        re-iterable (not a one-shot generator). Counts are reset unless `update` is True.
+        """
+        if not update:
+            self.vocab = {}  # a fresh run must not inherit counts from an earlier train() call
+        if sentences is not None:  # None is handed to the parent class unchanged
+            for sentence, _ in sentences:
+                for word in set(sentence):  # a word counts once per sentence
+                    self.vocab[word] = self.vocab.get(word, 0) + 1
+        return super().train(sentences, update, queue_factor, report_delay)
+
+
diff --git a/fse/models/average.py b/fse/models/average.py
@@ -18,14 +18,14 @@
 
 .. sourcecode:: pycon
 
-        >>> from gensim.models.word2vec import Word2Vec
-        >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
-        >>> model = Word2Vec(sentences, min_count=1, size=20)
-
-        >>> from fse.models.average import Average        
-        >>> avg = Average(model)
-        >>> avg.train([(s, i) for i, s in enumerate(sentences)])
-        >>> avg.sv.vectors.shape
+        >>> from gensim.models.word2vec import Word2Vec
+        >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
+        >>> model = Word2Vec(sentences, min_count=1, vector_size=20)
+
+        >>> from fse.models.average import Average
+        >>> avg = Average(model)
+        >>> avg.train([(s, i) for i, s in enumerate(sentences)])
+        >>> avg.sv.vectors.shape
         (2, 20)
 
 """
@@ -34,8 +34,8 @@
 
 from fse.models.base_s2v import BaseSentence2VecModel
 
-from gensim.models.keyedvectors import BaseKeyedVectors
-from gensim.models.utils_any2vec import ft_ngram_hashes
+from gensim.models.keyedvectors import KeyedVectors
+from gensim.models.fasttext import ft_ngram_hashes
 
 from numpy import (
     ndarray,
@@ -88,7 +88,7 @@ def train_average_np(
 
     """
     size = model.wv.vector_size
-    vocab = model.wv.vocab
+    # vocab = model.wv.vocab
 
     w_vectors = model.wv.vectors
     w_weights = model.word_weights
@@ -121,7 +121,7 @@ def train_average_np(
             sent = obj[0]
             sent_adr = obj[1]
 
-            word_indices = [vocab[word].index for word in sent if word in vocab]
+            word_indices = [model.wv.key_to_index[word] for word in sent if word in model.wv.key_to_index]
             eff_sentences += 1
             if not len(word_indices):
                 continue
@@ -147,11 +147,11 @@ def train_average_np(
             eff_words += len(sent)  # Counts everything in the sentence
 
             for word in sent:
-                if word in vocab:
-                    word_index = vocab[word].index
+                if word in model.wv.key_to_index:
+                    word_index = model.wv.key_to_index[word]
                     mem += w_vectors[word_index] * w_weights[word_index]
                 else:
-                    ngram_hashes = ft_ngram_hashes(word, min_n, max_n, bucket, True)[
+                    ngram_hashes = ft_ngram_hashes(word, min_n, max_n, bucket)[
                         :max_ngrams
                     ]
                     if len(ngram_hashes) == 0:
@@ -191,7 +191,7 @@ class Average(BaseSentence2VecModel):
 
     Attributes
     ----------
-    wv : :class:`~gensim.models.keyedvectors.BaseKeyedVectors`
+    wv : :class:`~gensim.models.keyedvectors.KeyedVectors`
         This object essentially contains the mapping between words and embeddings. After training, it can be used
         directly to query those embeddings in various ways. See the module level docstring for examples.
 
@@ -207,7 +207,7 @@ class Average(BaseSentence2VecModel):
 
     def __init__(
         self,
-        model: BaseKeyedVectors,
+        model: KeyedVectors,
         sv_mapfile_path: str = None,
         wv_mapfile_path: str = None,
         workers: int = 1,
@@ -222,7 +222,7 @@ def __init__(
 
         Parameters
         ----------
-        model : :class:`~gensim.models.keyedvectors.BaseKeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
+        model : :class:`~gensim.models.keyedvectors.KeyedVectors` or :class:`~gensim.models.word2vec.Word2Vec`
             This object essentially contains the mapping between words and embeddings. To compute the sentence embeddings
             the wv.vocab and wv.vector elements are required.
         sv_mapfile_path : str, optional

diff --git a/fse/models/base_s2v.py b/fse/models/base_s2v.py
@@ -11,7 +11,7 @@
 
 Attributes
 ----------
-wv : :class:`~gensim.models.keyedvectors.BaseKeyedVectors`
+wv : :class:`~gensim.models.keyedvectors.KeyedVectors`
     This object essentially contains the mapping between words and embeddings. After training, it can be used
     directly to query those embeddings in various ways. See the module level docstring for examples.
 
@@ -38,8 +38,8 @@
 
 from fse.models.utils import set_madvise_for_mmap
 
-from gensim.models.base_any2vec import BaseWordEmbeddingsModel
-from gensim.models.keyedvectors import BaseKeyedVectors, FastTextKeyedVectors, _l2_norm
+from gensim.models.word2vec import Word2Vec
+from gensim.models.keyedvectors import KeyedVectors
 from gensim.utils import SaveLoad
 from gensim.matutils import zeros_aligned
 
@@ -55,6 +55,7 @@
     ones,
     finfo,
     full,
+    linalg
 )
 
 from wordfreq import available_languages, get_frequency_dict
@@ -81,7 +82,7 @@
 class BaseSentence2VecModel(SaveLoad):
     def __init__(
         self,
-        model: BaseKeyedVectors,
+        model: KeyedVectors,
         sv_mapfile_path: str = None,
         wv_mapfile_path: str = None,
         workers: int = 1,
@@ -96,7 +97,7 @@ def __init__(
 
         Parameters
         ----------
-        model : :class:`~gensim.models.keyedvectors.BaseKeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
+        model : :class:`~gensim.models.keyedvectors.KeyedVectors` or :class:`~gensim.models.word2vec.Word2Vec`
             This object essentially contains the mapping between words and embeddings. To compute the sentence embeddings
             the wv.vocab and wv.vector elements are required.
         sv_mapfile_path : str, optional
@@ -157,7 +158,7 @@ def __init__(
         )
         self.prep = BaseSentence2VecPreparer()
 
-        self.word_weights = ones(len(self.wv.vocab), REAL)
+        self.word_weights = ones(len(self.wv), REAL)
 
     def __str__(self) -> str:
         """Human readable representation of the model's state.
@@ -170,26 +171,26 @@ def __str__(self) -> str:
         """
         return f"{self.__class__.__name__} based on {self.wv.__class__.__name__}, size={len(self.sv)}"
 
-    def _check_and_include_model(self, model: BaseKeyedVectors):
+    def _check_and_include_model(self, model: KeyedVectors):
         """Check if the supplied model is a compatible model. Performs all kinds of checks and small optimizations.
 
         Parameters
         ----------
-        model : :class:`~gensim.models.keyedvectors.BaseKeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
+        model : :class:`~gensim.models.keyedvectors.KeyedVectors` or :class:`~gensim.models.word2vec.Word2Vec`
             The model to inject into this class.
 
         """
-        if isinstance(model, BaseWordEmbeddingsModel):
+        if isinstance(model, Word2Vec):
             self.wv = model.wv
-        elif isinstance(model, BaseKeyedVectors):
+        elif isinstance(model, KeyedVectors):
             self.wv = model
         else:
             raise RuntimeError(
-                f"Model must be child of BaseWordEmbeddingsModel or BaseKeyedVectors. Received {str(model)}"
+                f"Model must be child of BaseWordEmbeddingsModel or KeyedVectors. Received {str(model)}"
             )
         self.wv.vectors_norm = None
 
-        if isinstance(self.wv, FastTextKeyedVectors):
+        if hasattr(self.wv, "vectors_ngrams"):  # FastText-only branch; self.wv is always a KeyedVectors here
             self.wv.vectors_vocab_norm = None  # Save some space
             self.wv.vectors_ngrams_norm = None
             self.wv.vectors_vocab_norm = None
@@ -210,8 +211,8 @@ def _check_and_include_model(self, model: BaseKeyedVectors):
             raise RuntimeError(
                 "Word vectors required for sentence embeddings not found."
             )
-        if not hasattr(self.wv, "vocab"):
-            raise RuntimeError("Vocab required for sentence embeddings not found.")
+        # if not hasattr(self.wv, "vocab"):
+        #     raise RuntimeError("Vocab required for sentence embeddings not found.")
 
     def _check_language_settings(self, lang_freq: str):
         """Check if the supplied language is a compatible with the wordfreq package
@@ -243,9 +244,9 @@ def _induce_frequencies(self, domain: int = 2 ** 31 - 1):
         freq_dict = get_frequency_dict(self.lang_freq, wordlist="best")
         for word in self.wv.index2word:
             if word in freq_dict:
-                self.wv.vocab[word].count = int(freq_dict[word] * domain)
+                self.wv.set_vecattr(word, "count", int(freq_dict[word] * domain))
             else:
-                self.wv.vocab[word].count = int(1e-8 * domain)
+                self.wv.set_vecattr(word, "count", int(1e-8 * domain))
 
     def _check_input_data_sanity(self, data_iterable: tuple):
         """Check if the input data complies with the required formats
@@ -299,7 +300,7 @@ def _check_pre_training_sanity(
 
         """
         if not hasattr(self, "wv") or self.wv is None:
-            raise RuntimeError("you must first load a valid BaseKeyedVectors object")
+            raise RuntimeError("you must first load a valid KeyedVectors object")
         if not len(self.wv.vectors):
             raise RuntimeError(
                 "you must initialize vectors before computing sentence vectors"
@@ -314,7 +315,7 @@ def _check_pre_training_sanity(
                 "you must initialize vectors_vocab before computing sentence vectors"
             )
 
-        if sum([self.wv.vocab[w].count for w in self.wv.vocab]) == len(self.wv.vocab):
+        if sum([self.wv.get_vecattr(w, "count") for w in self.wv.key_to_index]) == len(self.wv):
             logger.warning(
                 "The sum of the word counts is equal to its length (all word counts are 1). "
                 "Make sure to obtain proper word counts by using lang_freq for pretrained embeddings."
@@ -805,7 +806,7 @@ def infer(self, sentences: List[tuple] = None, use_norm=False) -> ndarray:
         self._post_inference_calls(output=output)
 
         if use_norm:
-            output = _l2_norm(output)
+            output /= linalg.norm(output, axis=1, keepdims=True)  # (n, 1) norms so each row is scaled by its own length
         return output
 
     def _train_manager(

diff --git a/fse/models/sentencevectors.py b/fse/models/sentencevectors.py
@@ -11,7 +11,7 @@
 
 from fse.models.utils import set_madvise_for_mmap
 
-from gensim.models.keyedvectors import BaseKeyedVectors
+from gensim.models.keyedvectors import KeyedVectors
 
 from numpy import (
     dot,
@@ -328,7 +328,7 @@ def most_similar(
     def similar_by_word(
         self,
         word: str,
-        wv: BaseKeyedVectors,
+        wv: KeyedVectors,
         indexable: Union[IndexedList, IndexedLineDocument] = None,
         topn: int = 10,
         restrict_size: Union[int, Tuple[int, int]] = None,
@@ -340,7 +340,7 @@ def similar_by_word(
         ----------
         word : str
             Word
-        wv : :class:`~gensim.models.keyedvectors.BaseKeyedVectors`
+        wv : :class:`~gensim.models.keyedvectors.KeyedVectors`
             This object essentially contains the mapping between words and embeddings.
         indexable: list, IndexedList, IndexedLineDocument
             Provides an indexable object from where the most similar sentences are read