Merge pull request #209 from NationalLibraryOfNorway/feat/sparse
Feat/sparse
magbb authored Jun 27, 2024
2 parents 4960f04 + cece7fe commit 0c9b97a
Showing 3 changed files with 79 additions and 11 deletions.
37 changes: 35 additions & 2 deletions dhlab/api/dhlab_api.py
@@ -8,6 +8,7 @@
from pandas import DataFrame, Series

from dhlab.constants import BASE_URL
from scipy.sparse import dok_matrix

pd.options.display.max_rows = 100

@@ -512,9 +513,35 @@ def ngram_news(
# df.index = df.index.map(pd.Timestamp)
return df

def create_sparse_matrix(structure):
"""Create a sparse matrix from an API counts object"""

# fetch all words
words = list(set(word for dct in structure.values() for word in dct))
# fetch all dhlabids
dhlabids = list(structure.keys())
# create an int/dhlabid mapping
dhlabid_to_col = {dhlabid: idx for idx, dhlabid in enumerate(dhlabids)}
# create an int/word mapping
word_to_row = {word: idx for idx, word in enumerate(words)}

# construct the matrix with each word as a row and each dhlabid as a column (DTM)
num_cols = len(dhlabids)
num_rows = len(words)
sparse_matrix = dok_matrix((num_rows, num_cols), dtype=int)

# incrementally fill the sparse matrix from dictionary
for col_idx, dhlabid in enumerate(dhlabids):
dct = structure[dhlabid]
for word, value in dct.items():
row_idx = word_to_row[word]
sparse_matrix[row_idx, col_idx] = value

df_sparse = pd.DataFrame.sparse.from_spmatrix(sparse_matrix, index=words, columns=dhlabids)
return df_sparse
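
For orientation, a minimal sketch (not part of the commit) of how this helper is meant to be used, assuming the counts structure built in get_document_frequencies below, i.e. a dict mapping each dhlabid to a word-to-count dict; the ids, words and counts here are invented:

# Illustrative only: toy input for create_sparse_matrix.
structure = {
    100001: {"det": 12, "hus": 3},
    100002: {"det": 7, "båt": 1},
}
df_sparse = create_sparse_matrix(structure)
# One row per word, one column per dhlabid; absent cells are implicit zeros.
print(df_sparse.shape)            # (3, 2)
print(df_sparse.sparse.density)   # 4 stored cells out of 6 ≈ 0.67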

def get_document_frequencies(
-     urns: List[str] = None, cutoff: int = 0, words: List[str] = None
+     urns: List[str] = None, cutoff: int = 0, words: List[str] = None, sparse: bool = False
) -> DataFrame:
"""Fetch frequency counts of ``words`` in documents (``urns``).
@@ -525,6 +552,7 @@ def get_document_frequencies(
``["URN:NBN:no-nb_digibok_2008051404065", "URN:NBN:no-nb_digibok_2010092120011"]``
:param int cutoff: minimum frequency of a word to be counted
:param list words: a list of words to be counted - if left None, whole document is returned. If not None both the counts and their relative frequency is returned.
:param bool sparse: create a sparse matrix for memory efficiency
"""
params = locals()
r = requests.post(f"{BASE_URL}/frequencies", json=params)
@@ -537,7 +565,12 @@ def get_document_frequencies(
structure[u[0][0]] = dict([(x[1], x[2]) for x in u])
except IndexError:
pass
- df = pd.DataFrame(structure)
+
+ if sparse == True:
+     df = create_sparse_matrix(structure)
+ else:
+     df = pd.DataFrame(structure)

df = df.sort_values(by=df.columns[0], ascending=False).fillna(0)
else:
df = pd.DataFrame(result)
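A hedged usage sketch of the new flag, reusing the example URNs from the docstring above; what comes back depends on the dhlab API, so the last line only shows what is expected:

from dhlab.api.dhlab_api import get_document_frequencies

urns = [
    "URN:NBN:no-nb_digibok_2008051404065",
    "URN:NBN:no-nb_digibok_2010092120011",
]
df_dense = get_document_frequencies(urns=urns)                # plain DataFrame, as before
df_sparse = get_document_frequencies(urns=urns, sparse=True)  # DataFrame backed by SparseDtype
print(df_sparse.dtypes.head())                                # expected: Sparse[...] columns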
45 changes: 40 additions & 5 deletions dhlab/text/conc_coll.py
@@ -168,12 +168,13 @@ def from_df(cls, df):
class Counts(DhlabObj):
"""Provide counts for a corpus - shouldn't be too large"""

- def __init__(self, corpus=None, words=None, cutoff=0):
+ def __init__(self, corpus=None, words=None, cutoff=0, sparse=False):
"""Get frequency list for Corpus
:param corpus: target Corpus, defaults to None
:param words: list of words to be counted, defaults to None
- :param cutoff: frequency cutoff, will not include words with frequency <= cutoff
+ :param cutoff: frequency cutoff, will not include words with frequency < cutoff
+ :param sparse: return a sparse matrix for memory efficiency
"""
if corpus is None and words is None:
self.freq = pd.DataFrame()
@@ -190,7 +191,7 @@ def __init__(self, corpus=None, words=None, cutoff=0):
# count - if words is none result will be as if counting all words
# in the corpus
self.freq = get_document_frequencies(
- urns=urnlist(corpus), cutoff=cutoff, words=words
+ urns=urnlist(corpus), cutoff=cutoff, words=words, sparse=sparse
)

# Include dhlab and title link in object
@@ -208,12 +209,46 @@ def __init__(self, corpus=None, words=None, cutoff=0):

super().__init__(self.freq)

def is_sparse(self):
"""Function to report sparsity of counts frame"""
try:
density = self.freq.sparse.density
if density:
sparse = True
else:
sparse = False
except:
sparse = False
return sparse
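
For context on the try/except: pandas only exposes the .sparse accessor when every column has a SparseDtype and raises AttributeError otherwise, which the bare except above turns into False. A small sketch, not part of the commit:

import pandas as pd
from scipy.sparse import dok_matrix

dense_df = pd.DataFrame({"a": [1, 0]})
m = dok_matrix((2, 1), dtype=int)
m[0, 0] = 5
sparse_df = pd.DataFrame.sparse.from_spmatrix(m)

print(sparse_df.sparse.density)   # 0.5: one of two cells explicitly stored
try:
    dense_df.sparse.density
except AttributeError:
    print("dense frame: no .sparse accessor")   # the case is_sparse() reports as False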

def sum(self):
"""Summarize Corpus frequencies
:return: frequency list for Corpus
"""
- return self.from_df(self.counts.sum(axis=1).to_frame("freq"))

# Needed since Pandas seems to make sparse matrices dense when summing
if self.is_sparse() == True:
# convert to coo matrix for iteration
coo_matrix = self.freq.sparse.to_coo()

# get the words and their indices
rowidx = {idx: word for idx, word in enumerate(list(self.freq.index))}

# build a freq dictionary by looping
freqDict = dict()

for i,j,v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data):
word = rowidx[i]
if word in freqDict:
freqDict[word] += v
else:
freqDict[word] = v

df = pd.DataFrame(freqDict.items(), columns=["word", "freq"]).set_index("word").sort_values(by="freq", ascending=False)
df.index.name = None
return self.from_df(df)
else:
return self.from_df(self.counts.sum(axis=1).to_frame("freq"))
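
A cross-check sketch (not part of the commit) of what the loop computes: accumulating the explicitly stored cells per word is a row sum over the COO matrix, which scipy can also produce directly; `counts` below stands for any Counts object built with sparse=True:

import numpy as np
import pandas as pd

coo = counts.freq.sparse.to_coo()                 # same conversion as in sum() above
row_totals = np.asarray(coo.sum(axis=1)).ravel()  # one total per word (row)
expected = pd.Series(row_totals, index=counts.freq.index, name="freq")
# `expected` should agree with counts.sum().freq["freq"], up to the ordering of ties.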

def display_names(self):
"Display data with record names as column titles."
8 changes: 4 additions & 4 deletions dhlab/text/corpus.py
@@ -218,13 +218,13 @@ def coll(
ignore_caps=ignore_caps,
)

- def count(self, words=None, cutoff=0):
+ def count(self, words=None, cutoff=0, sparse=False):
      """Get word frequencies for corpus"""
-     return dh.Counts(self, words, cutoff)
+     return dh.Counts(self, words, cutoff, sparse)

- def freq(self, words=None, cutoff=0):
+ def freq(self, words=None, cutoff=0, sparse=False):
      """Get word frequencies for corpus"""
-     return dh.Counts(self, words, cutoff)
+     return dh.Counts(self, words, cutoff, sparse)

@staticmethod
def _is_Corpus(corpus: "Corpus") -> bool:
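Finally, an end-to-end sketch of the feature from the Corpus side; the Corpus constructor arguments are assumptions, not taken from this diff, so adjust them to however you normally build a corpus:

import dhlab as dh

corpus = dh.Corpus(doctype="digibok", limit=5)   # assumed constructor arguments
counts = corpus.count(sparse=True)               # sparse-backed document-term counts
print(counts.is_sparse())                        # expected: True
print(counts.sum().freq.head())                  # per-word totals via the COO loop above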
