From 1f3ac136cca8877d226a67cf224938a257894933 Mon Sep 17 00:00:00 2001
From: Magnus Breder Birkenes <magnus.birkenes@nb.no>
Date: Wed, 26 Jun 2024 13:55:53 +0200
Subject: [PATCH 1/4] fix docstring

---
 dhlab/text/conc_coll.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dhlab/text/conc_coll.py b/dhlab/text/conc_coll.py
index e852965..cf805e7 100644
--- a/dhlab/text/conc_coll.py
+++ b/dhlab/text/conc_coll.py
@@ -173,7 +173,7 @@ def __init__(self, corpus=None, words=None, cutoff=0):
 
         :param corpus: target Corpus, defaults to None
         :param words: list of words to be counted, defaults to None
-        :param cutoff: frequency cutoff, will not include words with frequency <= cutoff
+        :param cutoff: frequency cutoff, will not include words with frequency < cutoff
         """
         if corpus is None and words is None:
             self.freq = pd.DataFrame()

From 45792220575719669df2be2e493b7e5f4b12f1ab Mon Sep 17 00:00:00 2001
From: Magnus Breder Birkenes <magnus.birkenes@nb.no>
Date: Wed, 26 Jun 2024 16:26:09 +0200
Subject: [PATCH 2/4] feature: add sparse matrix option for counts

---
 dhlab/api/dhlab_api.py  | 37 +++++++++++++++++++++++++++++++++++--
 dhlab/text/conc_coll.py |  5 +++--
 dhlab/text/corpus.py    |  8 ++++----
 3 files changed, 42 insertions(+), 8 deletions(-)

diff --git a/dhlab/api/dhlab_api.py b/dhlab/api/dhlab_api.py
index 83ced28..49482ec 100644
--- a/dhlab/api/dhlab_api.py
+++ b/dhlab/api/dhlab_api.py
@@ -8,6 +8,7 @@
 from pandas import DataFrame, Series
 
 from dhlab.constants import BASE_URL
+from scipy.sparse import dok_matrix
 
 pd.options.display.max_rows = 100
 
@@ -512,9 +513,35 @@ def ngram_news(
     # df.index = df.index.map(pd.Timestamp)
     return df
 
+def create_sparse_matrix(structure):
+    """Build a sparse word-by-document DataFrame from an API counts object.
+
+    Rows are words and columns are dhlabids, as in ``pd.DataFrame(structure)``;
+    a word that is absent from a document is stored implicitly as 0.
+
+    :param dict structure: maps each dhlabid to a ``{word: count}`` dict
+    :return: sparse DataFrame indexed by word (first-seen order) with one
+        column per dhlabid
+    """
+    dhlabids = list(structure)
+    # dict.fromkeys dedupes like set() but keeps first-seen order, so the
+    # row order is reproducible between runs (set order of str is not)
+    words = list(dict.fromkeys(w for dct in structure.values() for w in dct))
+    word_to_row = {word: idx for idx, word in enumerate(words)}
+
+    sparse_matrix = dok_matrix((len(words), len(dhlabids)), dtype=int)
+
+    # fill cell by cell; column index follows the order of ``dhlabids``
+    for col_idx, dct in enumerate(structure.values()):
+        for word, value in dct.items():
+            sparse_matrix[word_to_row[word], col_idx] = value
+
+    return pd.DataFrame.sparse.from_spmatrix(
+        sparse_matrix, index=words, columns=dhlabids
+    )
 
 def get_document_frequencies(
-    urns: List[str] = None, cutoff: int = 0, words: List[str] = None
+    urns: List[str] = None, cutoff: int = 0, words: List[str] = None, sparse: bool = False
 ) -> DataFrame:
     """Fetch frequency counts of ``words`` in documents (``urns``).
 
@@ -525,6 +552,7 @@ def get_document_frequencies(
         ``["URN:NBN:no-nb_digibok_2008051404065", "URN:NBN:no-nb_digibok_2010092120011"]``
     :param int cutoff: minimum frequency of a word to be counted
     :param list words: a list of words to be counted - if left None, whole document is returned. If not None both the counts and their relative frequency is returned.
+    :param bool sparse: create a sparse matrix for memory efficiency
     """
     params = locals()
     r = requests.post(f"{BASE_URL}/frequencies", json=params)
@@ -537,7 +565,12 @@ def get_document_frequencies(
                 structure[u[0][0]] = dict([(x[1], x[2]) for x in u])
             except IndexError:
                 pass
-        df = pd.DataFrame(structure)
+
+        if sparse == True:
+            df = create_sparse_matrix(structure)
+        else:
+            df = pd.DataFrame(structure)
+        
         df = df.sort_values(by=df.columns[0], ascending=False).fillna(0)
     else:
         df = pd.DataFrame(result)
diff --git a/dhlab/text/conc_coll.py b/dhlab/text/conc_coll.py
index cf805e7..6e3e634 100644
--- a/dhlab/text/conc_coll.py
+++ b/dhlab/text/conc_coll.py
@@ -168,12 +168,13 @@ def from_df(cls, df):
 class Counts(DhlabObj):
     """Provide counts for a corpus - shouldn't be too large"""
 
-    def __init__(self, corpus=None, words=None, cutoff=0):
+    def __init__(self, corpus=None, words=None, cutoff=0, sparse=False):
         """Get frequency list for Corpus
 
         :param corpus: target Corpus, defaults to None
         :param words: list of words to be counted, defaults to None
         :param cutoff: frequency cutoff, will not include words with frequency < cutoff
+        :param sparse: return a sparse matrix for memory efficiency
         """
         if corpus is None and words is None:
             self.freq = pd.DataFrame()
@@ -190,7 +191,7 @@ def __init__(self, corpus=None, words=None, cutoff=0):
             # count - if words is none result will be as if counting all words
             # in the corpus
             self.freq = get_document_frequencies(
-                urns=urnlist(corpus), cutoff=cutoff, words=words
+                urns=urnlist(corpus), cutoff=cutoff, words=words, sparse=sparse
             )
 
             # Include dhlab and title link in object
diff --git a/dhlab/text/corpus.py b/dhlab/text/corpus.py
index 0426102..361ab44 100644
--- a/dhlab/text/corpus.py
+++ b/dhlab/text/corpus.py
@@ -218,13 +218,13 @@ def coll(
             ignore_caps=ignore_caps,
         )
 
-    def count(self, words=None, cutoff=0):
+    def count(self, words=None, cutoff=0, sparse=False):
         """Get word frequencies for corpus"""
-        return dh.Counts(self, words, cutoff)
+        return dh.Counts(self, words, cutoff, sparse)
 
-    def freq(self, words=None, cutoff=0):
+    def freq(self, words=None, cutoff=0, sparse=False):
         """Get word frequencies for corpus"""
-        return dh.Counts(self, words, cutoff)
+        return dh.Counts(self, words, cutoff, sparse)
 
     @staticmethod
     def _is_Corpus(corpus: "Corpus") -> bool:

From f1ce7e8fbb7a5b72ff79ae043b22f170de0f8c8f Mon Sep 17 00:00:00 2001
From: Magnus Breder Birkenes <magnus.birkenes@nb.no>
Date: Wed, 26 Jun 2024 22:24:59 +0200
Subject: [PATCH 3/4] add sparsity check; sum functionality for sparse matrices

---
 dhlab/text/conc_coll.py | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/dhlab/text/conc_coll.py b/dhlab/text/conc_coll.py
index 6e3e634..4d476f9 100644
--- a/dhlab/text/conc_coll.py
+++ b/dhlab/text/conc_coll.py
@@ -209,12 +209,34 @@ def __init__(self, corpus=None, words=None, cutoff=0, sparse=False):
 
         super().__init__(self.freq)
 
+    def is_sparse(self):
+        """Report whether the counts frame holds sparse data.
+
+        :return: True if ``self.freq`` exposes the pandas ``.sparse``
+            accessor with a truthy density, otherwise False
+        """
+        try:
+            density = self.freq.sparse.density
+        except AttributeError:  # pandas: no .sparse accessor on dense data
+            return False
+        return bool(density)
+
     def sum(self):
         """Summarize Corpus frequencies
-
         :return: frequency list for Corpus
         """
-        return self.from_df(self.counts.sum(axis=1).to_frame("freq"))
+
+        # Needed since Pandas seems to make sparse matrices dense when summing
+        if self.is_sparse() == True:
+            freqDict = dict()
+            for x in self.freq.iterrows():
+                freqDict[x[0]] = x[1].sum()
+
+            df = pd.DataFrame(freqDict.items(), columns=["word", "freq"]).set_index("word")
+            df.index.name = None
+            return self.from_df(df)
+        else:
+            return self.from_df(self.counts.sum(axis=1).to_frame("freq"))
 
     def display_names(self):
         "Display data with record names as column titles."

From cece7fe36f92738d22236f42ea7ffdd38e82c35c Mon Sep 17 00:00:00 2001
From: Magnus Breder Birkenes <magnus.birkenes@nb.no>
Date: Wed, 26 Jun 2024 23:12:08 +0200
Subject: [PATCH 4/4] summing over coo matrices (memory improvement)

---
 dhlab/text/conc_coll.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/dhlab/text/conc_coll.py b/dhlab/text/conc_coll.py
index 4d476f9..5091660 100644
--- a/dhlab/text/conc_coll.py
+++ b/dhlab/text/conc_coll.py
@@ -228,11 +228,23 @@ def sum(self):
 
         # Needed since Pandas seems to make sparse matrices dense when summing
         if self.is_sparse() == True:
+            # convert to coo matrix for iteration
+            coo_matrix = self.freq.sparse.to_coo()
+
+            # get the words and their indices
+            rowidx = {idx: word for idx, word in enumerate(list(self.freq.index))}
+
+            # build a freq dictionary by looping
             freqDict = dict()
-            for x in self.freq.iterrows():
-                freqDict[x[0]] = x[1].sum()
 
-            df = pd.DataFrame(freqDict.items(), columns=["word", "freq"]).set_index("word")
+            for i,j,v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data):
+                word = rowidx[i]
+                if word in freqDict:
+                    freqDict[word] += v
+                else:
+                    freqDict[word] = v
+
+            df = pd.DataFrame(freqDict.items(), columns=["word", "freq"]).set_index("word").sort_values(by="freq", ascending=False)
             df.index.name = None
             return self.from_df(df)
         else: