Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FSE migration to Gensim >=4 #65

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ gensim.models.keyedvectors.BaseKeyedVectors class, for example *Word2Vec* or *Fa
```
from gensim.models import FastText
sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
ft = FastText(sentences, min_count=1, size=10)
ft = FastText(sentences, min_count=1, vector_size=10)

from fse import Average, IndexedList
model = Average(ft)
Expand Down
101 changes: 101 additions & 0 deletions fse/models/Idf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Author: Alessandro Muzzi <[email protected]>
# Copyright (C) 2021 Alessandro Muzzi
import logging
from math import log
from typing import List, Tuple

import numpy as np
from gensim.models.keyedvectors import KeyedVectors
from numpy import float32 as REAL

from fse.models.average import Average

logger = logging.getLogger(__name__)


class Idf(Average):
    """Sentence embeddings as an inverse-document-frequency weighted average of word vectors."""

    def __init__(self, model: KeyedVectors, sv_mapfile_path: str = None,
                 wv_mapfile_path: str = None, workers: int = 1, lang_freq: str = None):
        """ Inverse document frequency (Idf)

        Very common terms such as "the" carry little information for telling sentences apart,
        whereas rarer terms such as "brown" or "cow" are far more specific. The inverse document
        frequency (Karen Spärck Jones, 1972) quantifies the specificity of a term as an inverse
        function of the number of documents in which it occurs. Here every sentence is treated as
        one document and every word vector is weighted by ``log10(total_sentences / doc_count)``.

        Parameters
        ----------
        model : :class:`~gensim.models.keyedvectors.KeyedVectors` or a gensim model exposing ``.wv``
            This object essentially contains the mapping between words and embeddings.
        sv_mapfile_path : str, optional
            Optional path to store the sentence-vectors in for very large datasets. Used for memmap.
        wv_mapfile_path : str, optional
            Optional path to store the word-vectors in for very large datasets. Used for memmap.
            Use sv_mapfile_path and wv_mapfile_path to train disk-to-disk without needing much ram.
        workers : int, optional
            Number of working threads, used for multithreading. For most tasks (few words in a sentence)
            a value of 1 should be more than enough.
        lang_freq : str, optional
            Some pre-trained embeddings, i.e. "GoogleNews-vectors-negative300.bin", do not contain
            information about the frequency of a word. If no frequency information is available,
            you can choose the language used to estimate it via :class:`~wordfreq`.
            See https://github.com/LuminosoInsight/wordfreq
        """
        # Document frequencies: word -> number of sentences containing it.
        # Must exist before the parent constructor runs, hence assigned first.
        self.vocab = {}

        super().__init__(
            model=model, sv_mapfile_path=sv_mapfile_path, wv_mapfile_path=wv_mapfile_path,
            workers=workers, lang_freq=lang_freq)

    def _pre_train_calls(self, **kwargs):
        """Function calls to perform before training.

        The keyword arguments are forwarded as the ``statistics`` mapping of
        :meth:`_compute_idf_weights`; they must contain ``total_sentences``.
        """
        self._compute_idf_weights(kwargs)

    def _check_parameter_sanity(self):
        """Check the sanity of all parameters.

        Raises
        ------
        ValueError
            If any word weight is negative (or NaN).
        """
        # Idf weights are log10(N / count) and may legitimately exceed 1,
        # so only non-negativity is enforced.
        if not np.all(self.word_weights >= 0.):
            raise ValueError("For Idf, all word weights must be >= 0")

    def _compute_idf_weights(self, statistics):
        """Compute the Idf weight for every word in the vocabulary.

        Stores the result in ``self.word_weights`` (one ``float32`` per key of
        ``self.wv``, in index order).

        Parameters
        ----------
        statistics : dict
            Must provide ``total_sentences``, the number of documents used as
            the numerator of the idf ratio.
        """
        logger.info("pre-computing Idf weights for %d words", len(self.wv))

        # index_to_key is ordered by vector index, so weights[i] lines up with
        # wv.vectors[i] without relying on dict ordering of key_to_index.
        keys = self.wv.index_to_key
        if len(keys) == 0:
            # Nothing to weight: keep word_weights consistent (length 0)
            # instead of returning a value nobody consumes.
            self.word_weights = np.zeros(0, dtype=REAL)
            return

        total_sentences = statistics['total_sentences']
        weights = []
        for word in keys:
            doc_count = self.vocab.get(word, 0)
            # Words never seen in the training sentences get a neutral weight of 1.
            weights.append(log(total_sentences / doc_count, 10) if doc_count else 1)

        self.word_weights = np.array(weights).astype(REAL)

    def train(self, sentences: List[tuple] = None, update: bool = False,
              queue_factor: int = 2, report_delay: int = 5) -> Tuple[int, int]:
        """Count document frequencies, then delegate to the parent ``train``.

        The counts must be available before training starts because
        :meth:`_compute_idf_weights` is invoked through :meth:`_pre_train_calls`.

        Parameters
        ----------
        sentences : iterable of (list of str, int)
            Pairs of tokenized sentence and sentence index. The iterable is
            traversed here and again by the parent class, so it must be
            re-iterable (not a one-shot generator).
        update : bool, optional
            If False, previously collected document frequencies are discarded
            so that repeated calls do not double-count.
        queue_factor : int, optional
            Forwarded unchanged to the parent ``train``.
        report_delay : int, optional
            Forwarded unchanged to the parent ``train``.

        Returns
        -------
        Whatever the parent ``train`` returns.

        Raises
        ------
        TypeError
            If ``sentences`` is None.
        """
        if sentences is None:
            raise TypeError("You must provide sentences to train on, received None")

        if not update:
            # Fresh training run: stale counts would push count above
            # total_sentences and produce negative idf weights.
            self.vocab = {}
        # NOTE(review): with update=True counts accumulate across calls while
        # statistics['total_sentences'] presumably covers only the new batch - confirm.

        for sentence, _ in sentences:
            # set(): a word counts once per sentence (document frequency).
            for word in set(sentence):
                self.vocab[word] = self.vocab.get(word, 0) + 1

        return super().train(sentences, update, queue_factor, report_delay)


36 changes: 18 additions & 18 deletions fse/models/average.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,14 @@

.. sourcecode:: pycon

>>> from gensim.models.word2vec import Word2Vec
>>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
>>> model = Word2Vec(sentences, min_count=1, size=20)

>>> from fse.models.average import Average
>>> avg = Average(model)
>>> avg.train([(s, i) for i, s in enumerate(sentences)])
>>> avg.sv.vectors.shape
# >>> from gensim.models.word2vec import Word2Vec
# >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
# >>> model = Word2Vec(sentences, min_count=1, vector_size=20)
#
# >>> from fse.models.average import Average
# >>> avg = Average(model)
# >>> avg.train([(s, i) for i, s in enumerate(sentences)])
# >>> avg.sv.vectors.shape
(2, 20)

"""
Expand All @@ -34,8 +34,8 @@

from fse.models.base_s2v import BaseSentence2VecModel

from gensim.models.keyedvectors import BaseKeyedVectors
from gensim.models.utils_any2vec import ft_ngram_hashes
from gensim.models.keyedvectors import KeyedVectors
from gensim.models.fasttext import ft_ngram_hashes

from numpy import (
ndarray,
Expand Down Expand Up @@ -88,7 +88,7 @@ def train_average_np(

"""
size = model.wv.vector_size
vocab = model.wv.vocab
# vocab = model.wv.vocab

w_vectors = model.wv.vectors
w_weights = model.word_weights
Expand Down Expand Up @@ -121,7 +121,7 @@ def train_average_np(
sent = obj[0]
sent_adr = obj[1]

word_indices = [vocab[word].index for word in sent if word in vocab]
word_indices = [model.wv.key_to_index[word] for word in sent if word in model.wv.key_to_index]
eff_sentences += 1
if not len(word_indices):
continue
Expand All @@ -147,11 +147,11 @@ def train_average_np(
eff_words += len(sent) # Counts everything in the sentence

for word in sent:
if word in vocab:
word_index = vocab[word].index
if word in model.wv.key_to_index:
word_index = model.wv.key_to_index[word]
mem += w_vectors[word_index] * w_weights[word_index]
else:
ngram_hashes = ft_ngram_hashes(word, min_n, max_n, bucket, True)[
ngram_hashes = ft_ngram_hashes(word, min_n, max_n, bucket)[
:max_ngrams
]
if len(ngram_hashes) == 0:
Expand Down Expand Up @@ -191,7 +191,7 @@ class Average(BaseSentence2VecModel):

Attributes
----------
wv : :class:`~gensim.models.keyedvectors.BaseKeyedVectors`
wv : :class:`~gensim.models.keyedvectors.KeyedVectors`
This object essentially contains the mapping between words and embeddings. After training, it can be used
directly to query those embeddings in various ways. See the module level docstring for examples.

Expand All @@ -207,7 +207,7 @@ class Average(BaseSentence2VecModel):

def __init__(
self,
model: BaseKeyedVectors,
model: KeyedVectors,
sv_mapfile_path: str = None,
wv_mapfile_path: str = None,
workers: int = 1,
Expand All @@ -222,7 +222,7 @@ def __init__(

Parameters
----------
model : :class:`~gensim.models.keyedvectors.BaseKeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
model : :class:`~gensim.models.keyedvectors.KeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
This object essentially contains the mapping between words and embeddings. To compute the sentence embeddings
the wv.vocab and wv.vector elements are required.
sv_mapfile_path : str, optional
Expand Down
39 changes: 20 additions & 19 deletions fse/models/base_s2v.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

Attributes
----------
wv : :class:`~gensim.models.keyedvectors.BaseKeyedVectors`
wv : :class:`~gensim.models.keyedvectors.KeyedVectors`
This object essentially contains the mapping between words and embeddings. After training, it can be used
directly to query those embeddings in various ways. See the module level docstring for examples.

Expand All @@ -38,8 +38,8 @@

from fse.models.utils import set_madvise_for_mmap

from gensim.models.base_any2vec import BaseWordEmbeddingsModel
from gensim.models.keyedvectors import BaseKeyedVectors, FastTextKeyedVectors, _l2_norm
from gensim.models.word2vec import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
from gensim.utils import SaveLoad
from gensim.matutils import zeros_aligned

Expand All @@ -55,6 +55,7 @@
ones,
finfo,
full,
linalg
)

from wordfreq import available_languages, get_frequency_dict
Expand All @@ -81,7 +82,7 @@
class BaseSentence2VecModel(SaveLoad):
def __init__(
self,
model: BaseKeyedVectors,
model: KeyedVectors,
sv_mapfile_path: str = None,
wv_mapfile_path: str = None,
workers: int = 1,
Expand All @@ -96,7 +97,7 @@ def __init__(

Parameters
----------
model : :class:`~gensim.models.keyedvectors.BaseKeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
model : :class:`~gensim.models.keyedvectors.KeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
This object essentially contains the mapping between words and embeddings. To compute the sentence embeddings
the wv.vocab and wv.vector elements are required.
sv_mapfile_path : str, optional
Expand Down Expand Up @@ -157,7 +158,7 @@ def __init__(
)
self.prep = BaseSentence2VecPreparer()

self.word_weights = ones(len(self.wv.vocab), REAL)
self.word_weights = ones(len(self.wv), REAL)

def __str__(self) -> str:
"""Human readable representation of the model's state.
Expand All @@ -170,26 +171,26 @@ def __str__(self) -> str:
"""
return f"{self.__class__.__name__} based on {self.wv.__class__.__name__}, size={len(self.sv)}"

def _check_and_include_model(self, model: BaseKeyedVectors):
def _check_and_include_model(self, model: KeyedVectors):
"""Check if the supplied model is a compatible model. Performs all kinds of checks and small optimizations.

Parameters
----------
model : :class:`~gensim.models.keyedvectors.BaseKeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
model : :class:`~gensim.models.keyedvectors.KeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
The model to inject into this class.

"""
if isinstance(model, BaseWordEmbeddingsModel):
if isinstance(model, Word2Vec):
self.wv = model.wv
elif isinstance(model, BaseKeyedVectors):
elif isinstance(model, KeyedVectors):
self.wv = model
else:
raise RuntimeError(
f"Model must be child of BaseWordEmbeddingsModel or BaseKeyedVectors. Received {str(model)}"
f"Model must be child of BaseWordEmbeddingsModel or KeyedVectors. Received {str(model)}"
)
self.wv.vectors_norm = None

if isinstance(self.wv, FastTextKeyedVectors):
if isinstance(self.wv, KeyedVectors):
self.wv.vectors_vocab_norm = None # Save some space
self.wv.vectors_ngrams_norm = None
self.wv.vectors_vocab_norm = None
Expand All @@ -210,8 +211,8 @@ def _check_and_include_model(self, model: BaseKeyedVectors):
raise RuntimeError(
"Word vectors required for sentence embeddings not found."
)
if not hasattr(self.wv, "vocab"):
raise RuntimeError("Vocab required for sentence embeddings not found.")
# if not hasattr(self.wv, "vocab"):
# raise RuntimeError("Vocab required for sentence embeddings not found.")

def _check_language_settings(self, lang_freq: str):
"""Check if the supplied language is a compatible with the wordfreq package
Expand Down Expand Up @@ -243,9 +244,9 @@ def _induce_frequencies(self, domain: int = 2 ** 31 - 1):
freq_dict = get_frequency_dict(self.lang_freq, wordlist="best")
for word in self.wv.index2word:
if word in freq_dict:
self.wv.vocab[word].count = int(freq_dict[word] * domain)
self.wv.set_vecattr(word, "count", int(freq_dict[word] * domain))
else:
self.wv.vocab[word].count = int(1e-8 * domain)
self.wv.set_vecattr(word, "count", int(1e-8 * domain))

def _check_input_data_sanity(self, data_iterable: tuple):
"""Check if the input data complies with the required formats
Expand Down Expand Up @@ -299,7 +300,7 @@ def _check_pre_training_sanity(

"""
if not hasattr(self, "wv") or self.wv is None:
raise RuntimeError("you must first load a valid BaseKeyedVectors object")
raise RuntimeError("you must first load a valid KeyedVectors object")
if not len(self.wv.vectors):
raise RuntimeError(
"you must initialize vectors before computing sentence vectors"
Expand All @@ -314,7 +315,7 @@ def _check_pre_training_sanity(
"you must initialize vectors_vocab before computing sentence vectors"
)

if sum([self.wv.vocab[w].count for w in self.wv.vocab]) == len(self.wv.vocab):
if sum([self.wv.get_vecattr(w, "count") for w in self.wv.key_to_index]) == len(self.wv):
logger.warning(
"The sum of the word counts is equal to its length (all word counts are 1). "
"Make sure to obtain proper word counts by using lang_freq for pretrained embeddings."
Expand Down Expand Up @@ -805,7 +806,7 @@ def infer(self, sentences: List[tuple] = None, use_norm=False) -> ndarray:
self._post_inference_calls(output=output)

if use_norm:
output = _l2_norm(output)
output /= linalg.norm(output, axis=1)
return output

def _train_manager(
Expand Down
6 changes: 3 additions & 3 deletions fse/models/sentencevectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from fse.models.utils import set_madvise_for_mmap

from gensim.models.keyedvectors import BaseKeyedVectors
from gensim.models.keyedvectors import KeyedVectors

from numpy import (
dot,
Expand Down Expand Up @@ -328,7 +328,7 @@ def most_similar(
def similar_by_word(
self,
word: str,
wv: BaseKeyedVectors,
wv: KeyedVectors,
indexable: Union[IndexedList, IndexedLineDocument] = None,
topn: int = 10,
restrict_size: Union[int, Tuple[int, int]] = None,
Expand All @@ -340,7 +340,7 @@ def similar_by_word(
----------
word : str
Word
wv : :class:`~gensim.models.keyedvectors.BaseKeyedVectors`
wv : :class:`~gensim.models.keyedvectors.KeyedVectors`
This object essentially contains the mapping between words and embeddings.
indexable: list, IndexedList, IndexedLineDocument
Provides an indexable object from where the most similar sentences are read
Expand Down
Loading