Skip to content

Commit

Permalink
summing over coo matrices (memory improvement)
Browse files (browse the repository at this point in the history)
  • Loading branch information
magbb committed Jun 26, 2024
1 parent f1ce7e8 commit cece7fe
Showing 1 changed file with 15 additions and 3 deletions.
18 changes: 15 additions & 3 deletions dhlab/text/conc_coll.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -228,11 +228,23 @@ def sum(self):

# Needed since Pandas seems to make sparse matrices dense when summing
if self.is_sparse() == True:
# convert to coo matrix for iteration
coo_matrix = self.freq.sparse.to_coo()

# get the words and their indices
rowidx = {idx: word for idx, word in enumerate(list(self.freq.index))}

# build a freq dictionary by looping
freqDict = dict()
for x in self.freq.iterrows():
freqDict[x[0]] = x[1].sum()

df = pd.DataFrame(freqDict.items(), columns=["word", "freq"]).set_index("word")
for i,j,v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data):
word = rowidx[i]
if word in freqDict:
freqDict[word] += v
else:
freqDict[word] = v

df = pd.DataFrame(freqDict.items(), columns=["word", "freq"]).set_index("word").sort_values(by="freq", ascending=False)
df.index.name = None
return self.from_df(df)
else:
Expand Down

0 comments on commit cece7fe

Please sign in to comment.