Skip to content

Commit

Permalink
Add chunk sizing options; ensure we sum duplicates.
Browse files Browse the repository at this point in the history
  • Loading branch information
lmcinnes committed Jun 25, 2020
1 parent 5c67a32 commit 50f309e
Showing 1 changed file with 14 additions and 0 deletions.
14 changes: 14 additions & 0 deletions vectorizers/_vectorizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -967,6 +967,11 @@ def token_cooccurrence_matrix(
symmetric: counts tokens occurring before and after as the same tokens
directional: counts tokens before and after as different and returns both counts.
chunk_size: int (optional, default=1048576)
When processing token sequences, break the list of sequences into
chunks of this size to stream the data through, rather than storing all
the results at once. This saves on peak memory usage.
Returns
-------
cooccurrence_matrix: scipy.sparse.csr_matrix
Expand Down Expand Up @@ -1004,6 +1009,7 @@ def token_cooccurrence_matrix(
shape=(n_unique_tokens, n_unique_tokens),
dtype=np.float32,
)
cooccurrence_matrix.sum_duplicates()

if window_orientation == "before":
cooccurrence_matrix = cooccurrence_matrix.transpose()
Expand Down Expand Up @@ -1205,6 +1211,11 @@ class TokenCooccurrenceVectorizer(BaseEstimator, TransformerMixin):
symmetric: counts tokens occurring before and after as the same tokens
directional: counts tokens before and after as different and returns both counts.
chunk_size: int (optional, default=1048576)
When processing token sequences, break the list of sequences into
chunks of this size to stream the data through, rather than storing all
the results at once. This saves on peak memory usage.
validate_data: bool (optional, default=True)
Check whether the data is valid (e.g. of homogeneous token type).
"""
Expand All @@ -1226,6 +1237,7 @@ def __init__(
kernel_function="flat",
window_radius=5,
window_orientation="directional",
chunk_size=1<<20,
validate_data=True,
):
self.token_dictionary = token_dictionary
Expand All @@ -1245,6 +1257,7 @@ def __init__(
self.window_radius = window_radius

self.window_orientation = window_orientation
self.chunk_size = chunk_size
self.validate_data = validate_data

def fit_transform(self, X, y=None, **fit_params):
Expand Down Expand Up @@ -1309,6 +1322,7 @@ def fit_transform(self, X, y=None, **fit_params):
window_args=(self._window_size, self._token_frequencies_),
kernel_args=(self.window_radius,),
window_orientation=self.window_orientation,
chunk_size=self.chunk_size,
)
self.cooccurrences_.eliminate_zeros()

Expand Down

0 comments on commit 50f309e

Please sign in to comment.