diff --git a/docs/releases/unreleased.md b/docs/releases/unreleased.md index 4ed6dd48f1..5ed9315b52 100644 --- a/docs/releases/unreleased.md +++ b/docs/releases/unreleased.md @@ -14,6 +14,10 @@ - Make `drift.ADWIN` comply with the reference MOA implementation. +## feature extraction + +- The mini-batch methods for `feature_extraction.TFIDF` now systematically raise an exception, as they are not implemented. + ## stats - Removed the unexported class `stats.CentralMoments`. diff --git a/river/feature_extraction/vectorize.py b/river/feature_extraction/vectorize.py index 5f68450208..e0a9496730 100644 --- a/river/feature_extraction/vectorize.py +++ b/river/feature_extraction/vectorize.py @@ -451,6 +451,8 @@ def __init__( strip_accents=True, lowercase=True, preprocessor: typing.Callable | None = None, + stop_words: set[str] | None = None, + tokenizer_pattern=r"(?u)\b\w[\w\-]+\b", tokenizer: typing.Callable | None = None, ngram_range=(1, 1), ): @@ -459,6 +461,8 @@ def __init__( strip_accents=strip_accents, lowercase=lowercase, preprocessor=preprocessor, + stop_words=stop_words, + tokenizer_pattern=tokenizer_pattern, tokenizer=tokenizer, ngram_range=ngram_range, ) @@ -489,3 +493,12 @@ def transform_one(self, x): norm = math.sqrt(sum(tfidf**2 for tfidf in tfidfs.values())) return {term: tfidf / norm for term, tfidf in tfidfs.items()} return tfidfs + + # Mini-batch methods should be done well™ and not just be a loop over the *_one equivalent. + def learn_many(self, X): + "Not available, will raise an exception." + raise NotImplementedError + + def transform_many(self, X): + "Not available, will raise an exception." + raise NotImplementedError