Merge pull request #209 from NationalLibraryOfNorway/feat/sparse
Feat/sparse
magbb authored Jun 27, 2024
2 parents 4960f04 + cece7fe commit 0c9b97a
Showing 3 changed files with 79 additions and 11 deletions.
37 changes: 35 additions & 2 deletions dhlab/api/dhlab_api.py
@@ -8,6 +8,7 @@
from pandas import DataFrame, Series

from dhlab.constants import BASE_URL
from scipy.sparse import dok_matrix

pd.options.display.max_rows = 100

@@ -512,9 +513,35 @@ def ngram_news(
# df.index = df.index.map(pd.Timestamp)
return df

def create_sparse_matrix(structure):
"""Create a sparse matrix from an API counts object"""

# fetch all words
words = list(set(word for dct in structure.values() for word in dct))
# fetch all dhlabids
dhlabids = list(structure.keys())
# create an int/dhlabid mapping
dhlabid_to_col = {dhlabid: idx for idx, dhlabid in enumerate(dhlabids)}
# create an int/word mapping
word_to_row = {word: idx for idx, word in enumerate(words)}

# construct the matrix with each word as a row and each dhlabid as a column (DTM)
num_cols = len(dhlabids)
num_rows = len(words)
sparse_matrix = dok_matrix((num_rows, num_cols), dtype=int)

# incrementally fill the sparse matrix from dictionary
for col_idx, dhlabid in enumerate(dhlabids):
dct = structure[dhlabid]
for word, value in dct.items():
row_idx = word_to_row[word]
sparse_matrix[row_idx, col_idx] = value

df_sparse = pd.DataFrame.sparse.from_spmatrix(sparse_matrix, index=words, columns=dhlabids)
return df_sparse
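
For orientation, a minimal sketch (not part of the commit) of how this helper is meant to be used, assuming the counts structure built in get_document_frequencies below, i.e. a dict mapping each dhlabid to a word-to-count dict; the ids, words and counts here are invented:

# Illustrative only: toy input for create_sparse_matrix.
structure = {
    100001: {"det": 12, "hus": 3},
    100002: {"det": 7, "båt": 1},
}
df_sparse = create_sparse_matrix(structure)
# One row per word, one column per dhlabid; absent cells are implicit zeros.
print(df_sparse.shape)            # (3, 2)
print(df_sparse.sparse.density)   # 4 stored cells out of 6 ≈ 0.67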

def get_document_frequencies(
-     urns: List[str] = None, cutoff: int = 0, words: List[str] = None
+     urns: List[str] = None, cutoff: int = 0, words: List[str] = None, sparse: bool = False
) -> DataFrame:
"""Fetch frequency counts of ``words`` in documents (``urns``).
@@ -525,6 +552,7 @@ def get_document_frequencies(
``["URN:NBN:no-nb_digibok_2008051404065", "URN:NBN:no-nb_digibok_2010092120011"]``
:param int cutoff: minimum frequency of a word to be counted
:param list words: a list of words to be counted - if left None, whole document is returned. If not None both the counts and their relative frequency is returned.
:param bool sparse: create a sparse matrix for memory efficiency
"""
params = locals()
r = requests.post(f"{BASE_URL}/frequencies", json=params)
@@ -537,7 +565,12 @@ def get_document_frequencies(
structure[u[0][0]] = dict([(x[1], x[2]) for x in u])
except IndexError:
pass
- df = pd.DataFrame(structure)
+
+ if sparse == True:
+     df = create_sparse_matrix(structure)
+ else:
+     df = pd.DataFrame(structure)

df = df.sort_values(by=df.columns[0], ascending=False).fillna(0)
else:
df = pd.DataFrame(result)
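A hedged usage sketch of the new flag, reusing the example URNs from the docstring above; what comes back depends on the dhlab API, so the last line only shows what is expected:

from dhlab.api.dhlab_api import get_document_frequencies

urns = [
    "URN:NBN:no-nb_digibok_2008051404065",
    "URN:NBN:no-nb_digibok_2010092120011",
]
df_dense = get_document_frequencies(urns=urns)                # plain DataFrame, as before
df_sparse = get_document_frequencies(urns=urns, sparse=True)  # DataFrame backed by SparseDtype
print(df_sparse.dtypes.head())                                # expected: Sparse[...] columns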
45 changes: 40 additions & 5 deletions dhlab/text/conc_coll.py
@@ -168,12 +168,13 @@ def from_df(cls, df):
class Counts(DhlabObj):
"""Provide counts for a corpus - shouldn't be too large"""

- def __init__(self, corpus=None, words=None, cutoff=0):
+ def __init__(self, corpus=None, words=None, cutoff=0, sparse=False):
"""Get frequency list for Corpus
:param corpus: target Corpus, defaults to None
:param words: list of words to be counted, defaults to None
- :param cutoff: frequency cutoff, will not include words with frequency <= cutoff
+ :param cutoff: frequency cutoff, will not include words with frequency < cutoff
+ :param sparse: return a sparse matrix for memory efficiency
"""
if corpus is None and words is None:
self.freq = pd.DataFrame()
@@ -190,7 +191,7 @@ def __init__(self, corpus=None, words=None, cutoff=0):
# count - if words is none result will be as if counting all words
# in the corpus
self.freq = get_document_frequencies(
- urns=urnlist(corpus), cutoff=cutoff, words=words
+ urns=urnlist(corpus), cutoff=cutoff, words=words, sparse=sparse
)

# Include dhlab and title link in object
@@ -208,12 +209,46 @@ def __init__(self, corpus=None, words=None, cutoff=0):

super().__init__(self.freq)

def is_sparse(self):
"""Function to report sparsity of counts frame"""
try:
density = self.freq.sparse.density
if density:
sparse = True
else:
sparse = False
except:
sparse = False
return sparse
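
For context on the try/except: pandas only exposes the .sparse accessor when every column has a SparseDtype and raises AttributeError otherwise, which the bare except above turns into False. A small sketch, not part of the commit:

import pandas as pd
from scipy.sparse import dok_matrix

dense_df = pd.DataFrame({"a": [1, 0]})
m = dok_matrix((2, 1), dtype=int)
m[0, 0] = 5
sparse_df = pd.DataFrame.sparse.from_spmatrix(m)

print(sparse_df.sparse.density)   # 0.5: one of two cells explicitly stored
try:
    dense_df.sparse.density
except AttributeError:
    print("dense frame: no .sparse accessor")   # the case is_sparse() reports as False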

def sum(self):
"""Summarize Corpus frequencies
:return: frequency list for Corpus
"""
- return self.from_df(self.counts.sum(axis=1).to_frame("freq"))

# Needed since Pandas seems to make sparse matrices dense when summing
if self.is_sparse() == True:
# convert to coo matrix for iteration
coo_matrix = self.freq.sparse.to_coo()

# get the words and their indices
rowidx = {idx: word for idx, word in enumerate(list(self.freq.index))}

# build a freq dictionary by looping
freqDict = dict()

for i,j,v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data):
word = rowidx[i]
if word in freqDict:
freqDict[word] += v
else:
freqDict[word] = v

df = pd.DataFrame(freqDict.items(), columns=["word", "freq"]).set_index("word").sort_values(by="freq", ascending=False)
df.index.name = None
return self.from_df(df)
else:
return self.from_df(self.counts.sum(axis=1).to_frame("freq"))
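
A cross-check sketch (not part of the commit) of what the loop computes: accumulating the explicitly stored cells per word is a row sum over the COO matrix, which scipy can also produce directly; `counts` below stands for any Counts object built with sparse=True:

import numpy as np
import pandas as pd

coo = counts.freq.sparse.to_coo()                 # same conversion as in sum() above
row_totals = np.asarray(coo.sum(axis=1)).ravel()  # one total per word (row)
expected = pd.Series(row_totals, index=counts.freq.index, name="freq")
# `expected` should agree with counts.sum().freq["freq"], up to the ordering of ties.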

def display_names(self):
"Display data with record names as column titles."
8 changes: 4 additions & 4 deletions dhlab/text/corpus.py
@@ -218,13 +218,13 @@ def coll(
ignore_caps=ignore_caps,
)

- def count(self, words=None, cutoff=0):
+ def count(self, words=None, cutoff=0, sparse=False):
      """Get word frequencies for corpus"""
-     return dh.Counts(self, words, cutoff)
+     return dh.Counts(self, words, cutoff, sparse)

- def freq(self, words=None, cutoff=0):
+ def freq(self, words=None, cutoff=0, sparse=False):
      """Get word frequencies for corpus"""
-     return dh.Counts(self, words, cutoff)
+     return dh.Counts(self, words, cutoff, sparse)

@staticmethod
def _is_Corpus(corpus: "Corpus") -> bool:
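Finally, an end-to-end sketch of the feature from the Corpus side; the Corpus constructor arguments are assumptions, not taken from this diff, so adjust them to however you normally build a corpus:

import dhlab as dh

corpus = dh.Corpus(doctype="digibok", limit=5)   # assumed constructor arguments
counts = corpus.count(sparse=True)               # sparse-backed document-term counts
print(counts.is_sparse())                        # expected: True
print(counts.sum().freq.head())                  # per-word totals via the COO loop above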
