diff --git a/dhlab/text/conc_coll.py b/dhlab/text/conc_coll.py
index 8513d43..e852965 100644
--- a/dhlab/text/conc_coll.py
+++ b/dhlab/text/conc_coll.py
@@ -168,11 +168,12 @@ def from_df(cls, df):
 class Counts(DhlabObj):
     """Provide counts for a corpus - shouldn't be too large"""
 
-    def __init__(self, corpus=None, words=None):
+    def __init__(self, corpus=None, words=None, cutoff=0):
         """Get frequency list for Corpus
 
         :param corpus: target Corpus, defaults to None
         :param words: list of words to be counted, defaults to None
+        :param cutoff: frequency cutoff, will not include words with frequency <= cutoff
         """
         if corpus is None and words is None:
             self.freq = pd.DataFrame()
@@ -189,7 +190,7 @@ def __init__(self, corpus=None, words=None):
             # count - if words is none result will be as if counting all words
             # in the corpus
             self.freq = get_document_frequencies(
-                urns=urnlist(corpus), cutoff=0, words=words
+                urns=urnlist(corpus), cutoff=cutoff, words=words
             )
 
             # Include dhlab and title link in object
diff --git a/dhlab/text/corpus.py b/dhlab/text/corpus.py
index e7f7c05..0426102 100644
--- a/dhlab/text/corpus.py
+++ b/dhlab/text/corpus.py
@@ -218,13 +218,13 @@ def coll(
             ignore_caps=ignore_caps,
         )
 
-    def count(self, words=None):
+    def count(self, words=None, cutoff=0):
         """Get word frequencies for corpus"""
-        return dh.Counts(self, words)
+        return dh.Counts(self, words, cutoff)
 
-    def freq(self, words=None):
+    def freq(self, words=None, cutoff=0):
         """Get word frequencies for corpus"""
-        return dh.Counts(self, words)
+        return dh.Counts(self, words, cutoff)
 
     @staticmethod
     def _is_Corpus(corpus: "Corpus") -> bool: