From 1f3ac136cca8877d226a67cf224938a257894933 Mon Sep 17 00:00:00 2001 From: Magnus Breder Birkenes Date: Wed, 26 Jun 2024 13:55:53 +0200 Subject: [PATCH 1/4] fix docstring --- dhlab/text/conc_coll.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhlab/text/conc_coll.py b/dhlab/text/conc_coll.py index e852965..cf805e7 100644 --- a/dhlab/text/conc_coll.py +++ b/dhlab/text/conc_coll.py @@ -173,7 +173,7 @@ def __init__(self, corpus=None, words=None, cutoff=0): :param corpus: target Corpus, defaults to None :param words: list of words to be counted, defaults to None - :param cutoff: frequency cutoff, will not include words with frequency <= cutoff + :param cutoff: frequency cutoff, will not include words with frequency < cutoff """ if corpus is None and words is None: self.freq = pd.DataFrame() From 45792220575719669df2be2e493b7e5f4b12f1ab Mon Sep 17 00:00:00 2001 From: Magnus Breder Birkenes Date: Wed, 26 Jun 2024 16:26:09 +0200 Subject: [PATCH 2/4] feature: add sparse matrix option for counts --- dhlab/api/dhlab_api.py | 37 +++++++++++++++++++++++++++++++++++-- dhlab/text/conc_coll.py | 5 +++-- dhlab/text/corpus.py | 8 ++++---- 3 files changed, 42 insertions(+), 8 deletions(-) diff --git a/dhlab/api/dhlab_api.py b/dhlab/api/dhlab_api.py index 83ced28..49482ec 100644 --- a/dhlab/api/dhlab_api.py +++ b/dhlab/api/dhlab_api.py @@ -8,6 +8,7 @@ from pandas import DataFrame, Series from dhlab.constants import BASE_URL +from scipy.sparse import dok_matrix pd.options.display.max_rows = 100 @@ -512,9 +513,35 @@ def ngram_news( # df.index = df.index.map(pd.Timestamp) return df +def create_sparse_matrix(structure): + """Create a sparse matrix from an API counts object""" + + # fetch all words + words = list(set(word for dct in structure.values() for word in dct)) + # fetch all dhlabids + dhlabids = list(structure.keys()) + # create an int/dhlabid mapping + dhlabid_to_col = {dhlabid: idx for idx, dhlabid in enumerate(dhlabids)} + # create an int/word mapping + word_to_row = {word: idx for idx, word in enumerate(words)} + + # construct the matrix with each word as a row and each dhlabid as a column (DTM) + num_cols = len(dhlabids) + num_rows = len(words) + sparse_matrix = dok_matrix((num_rows, num_cols), dtype=int) + + # incrementally fill the sparse matrix from dictionary + for col_idx, dhlabid in enumerate(dhlabids): + dct = structure[dhlabid] + for word, value in dct.items(): + row_idx = word_to_row[word] + sparse_matrix[row_idx, col_idx] = value + + df_sparse = pd.DataFrame.sparse.from_spmatrix(sparse_matrix, index=words, columns=dhlabids) + return df_sparse def get_document_frequencies( - urns: List[str] = None, cutoff: int = 0, words: List[str] = None + urns: List[str] = None, cutoff: int = 0, words: List[str] = None, sparse: bool = False ) -> DataFrame: """Fetch frequency counts of ``words`` in documents (``urns``). @@ -525,6 +552,7 @@ def get_document_frequencies( ``["URN:NBN:no-nb_digibok_2008051404065", "URN:NBN:no-nb_digibok_2010092120011"]`` :param int cutoff: minimum frequency of a word to be counted :param list words: a list of words to be counted - if left None, whole document is returned. If not None both the counts and their relative frequency is returned. + :param bool sparse: create a sparse matrix for memory efficiency """ params = locals() r = requests.post(f"{BASE_URL}/frequencies", json=params) @@ -537,7 +565,12 @@ def get_document_frequencies( structure[u[0][0]] = dict([(x[1], x[2]) for x in u]) except IndexError: pass - df = pd.DataFrame(structure) + + if sparse == True: + df = create_sparse_matrix(structure) + else: + df = pd.DataFrame(structure) + df = df.sort_values(by=df.columns[0], ascending=False).fillna(0) else: df = pd.DataFrame(result) diff --git a/dhlab/text/conc_coll.py b/dhlab/text/conc_coll.py index cf805e7..6e3e634 100644 --- a/dhlab/text/conc_coll.py +++ b/dhlab/text/conc_coll.py @@ -168,12 +168,13 @@ def from_df(cls, df): class Counts(DhlabObj): """Provide counts for a corpus - shouldn't be too large""" - def __init__(self, corpus=None, words=None, cutoff=0): + def __init__(self, corpus=None, words=None, cutoff=0, sparse=False): """Get frequency list for Corpus :param corpus: target Corpus, defaults to None :param words: list of words to be counted, defaults to None :param cutoff: frequency cutoff, will not include words with frequency < cutoff + :param sparse: return a sparse matrix for memory efficiency """ if corpus is None and words is None: self.freq = pd.DataFrame() @@ -190,7 +191,7 @@ def __init__(self, corpus=None, words=None, cutoff=0): # count - if words is none result will be as if counting all words # in the corpus self.freq = get_document_frequencies( - urns=urnlist(corpus), cutoff=cutoff, words=words + urns=urnlist(corpus), cutoff=cutoff, words=words, sparse=sparse ) # Include dhlab and title link in object diff --git a/dhlab/text/corpus.py b/dhlab/text/corpus.py index 0426102..361ab44 100644 --- a/dhlab/text/corpus.py +++ b/dhlab/text/corpus.py @@ -218,13 +218,13 @@ def coll( ignore_caps=ignore_caps, ) - def count(self, words=None, cutoff=0): + def count(self, words=None, cutoff=0, sparse=False): """Get word frequencies for corpus""" - return dh.Counts(self, words, cutoff) + return dh.Counts(self, words, cutoff, sparse) - def freq(self, words=None, cutoff=0): + def freq(self, words=None, cutoff=0, sparse=False): """Get word frequencies for corpus""" - return dh.Counts(self, words, cutoff) + return dh.Counts(self, words, cutoff, sparse) @staticmethod def _is_Corpus(corpus: "Corpus") -> bool: From f1ce7e8fbb7a5b72ff79ae043b22f170de0f8c8f Mon Sep 17 00:00:00 2001 From: Magnus Breder Birkenes Date: Wed, 26 Jun 2024 22:24:59 +0200 Subject: [PATCH 3/4] add sparsity check; sum functionality for sparse matrices --- dhlab/text/conc_coll.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/dhlab/text/conc_coll.py b/dhlab/text/conc_coll.py index 6e3e634..4d476f9 100644 --- a/dhlab/text/conc_coll.py +++ b/dhlab/text/conc_coll.py @@ -209,12 +209,34 @@ def __init__(self, corpus=None, words=None, cutoff=0, sparse=False): super().__init__(self.freq) + def is_sparse(self): + """Function to report sparsity of counts frame""" + try: + density = self.freq.sparse.density + if density: + sparse = True + else: + sparse = False + except: + sparse = False + return sparse + def sum(self): """Summarize Corpus frequencies - :return: frequency list for Corpus """ - return self.from_df(self.counts.sum(axis=1).to_frame("freq")) + + # Needed since Pandas seems to make sparse matrices dense when summing + if self.is_sparse() == True: + freqDict = dict() + for x in self.freq.iterrows(): + freqDict[x[0]] = x[1].sum() + + df = pd.DataFrame(freqDict.items(), columns=["word", "freq"]).set_index("word") + df.index.name = None + return self.from_df(df) + else: + return self.from_df(self.counts.sum(axis=1).to_frame("freq")) def display_names(self): "Display data with record names as column titles." From cece7fe36f92738d22236f42ea7ffdd38e82c35c Mon Sep 17 00:00:00 2001 From: Magnus Breder Birkenes Date: Wed, 26 Jun 2024 23:12:08 +0200 Subject: [PATCH 4/4] summing over coo matrices (memory improvement) --- dhlab/text/conc_coll.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/dhlab/text/conc_coll.py b/dhlab/text/conc_coll.py index 4d476f9..5091660 100644 --- a/dhlab/text/conc_coll.py +++ b/dhlab/text/conc_coll.py @@ -228,11 +228,23 @@ def sum(self): # Needed since Pandas seems to make sparse matrices dense when summing if self.is_sparse() == True: + # convert to coo matrix for iteration + coo_matrix = self.freq.sparse.to_coo() + + # get the words and their indices + rowidx = {idx: word for idx, word in enumerate(list(self.freq.index))} + + # build a freq dictionary by looping freqDict = dict() - for x in self.freq.iterrows(): - freqDict[x[0]] = x[1].sum() - df = pd.DataFrame(freqDict.items(), columns=["word", "freq"]).set_index("word") + for i,j,v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data): + word = rowidx[i] + if word in freqDict: + freqDict[word] += v + else: + freqDict[word] = v + + df = pd.DataFrame(freqDict.items(), columns=["word", "freq"]).set_index("word").sort_values(by="freq", ascending=False) df.index.name = None return self.from_df(df) else: