Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use rng.choice without unpacked in subsample_without_replacement with 64-bit support #935

Merged
merged 15 commits into from
Jul 28, 2023
1 change: 1 addition & 0 deletions ChangeLog.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ biom 2.1.15-dev
Performance improvements:

* Revise `Table._fast_merge` to use COO directly. For very large tables, this reduces runtime by ~50x and memory by ~5x. See PR [#913](https://github.com/biocore/biom-format/pull/933).
* Drastically reduce the memory needs of subsampling when sums are large. Also adds 64-bit support. See PR [#935](https://github.com/biocore/biom-format/pull/935).

biom 2.1.15
-----------
Expand Down
159 changes: 126 additions & 33 deletions biom/_subsample.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,23 @@
# The full license is in the file COPYING.txt, distributed with this software.
# -----------------------------------------------------------------------------


import numpy as np
cimport numpy as cnp


def _subsample(arr, n, with_replacement, rng):
"""Subsample non-zero values of a sparse array
cdef _subsample_with_replacement(cnp.ndarray[cnp.float64_t, ndim=1] data,
cnp.ndarray[cnp.int32_t, ndim=1] indptr,
cnp.int64_t n,
object rng):
"""Subsample non-zero values of a sparse array with replacement

Parameters
----------
arr : {csr_matrix, csc_matrix}
A 1xM sparse vector
data : {csr_matrix, csc_matrix}.data
A 1xM sparse vector data
indptr : {csr_matrix, csc_matrix}.indptr
A 1xM sparse vector indptr
n : int
Number of items to subsample from `arr`
with_replacement : bool
Whether to permute or use multinomial sampling
rng : Generator instance
A random generator. This will likely be an instance returned
by np.random.default_rng
Expand All @@ -38,34 +39,126 @@ def _subsample(arr, n, with_replacement, rng):
"""
cdef:
cnp.int64_t counts_sum
cnp.ndarray[cnp.float64_t, ndim=1] data = arr.data
cnp.ndarray[cnp.int64_t, ndim=1] data_i = arr.data.astype(np.int64)
cnp.ndarray[cnp.float64_t, ndim=1] result
cnp.ndarray[cnp.int32_t, ndim=1] indices = arr.indices
cnp.ndarray[cnp.int32_t, ndim=1] indptr = arr.indptr
cnp.ndarray[cnp.int32_t, ndim=1] permuted, unpacked, r
cnp.float64_t cnt
Py_ssize_t i, j, length
cnp.int32_t start,end,length
Py_ssize_t i
cnp.ndarray[cnp.float64_t, ndim=1] pvals

for i in range(indptr.shape[0] - 1):
start, end = indptr[i], indptr[i+1]
length = end - start
counts_sum = data[start:end].sum()

if with_replacement:
pvals = data[start:end] / counts_sum
data[start:end] = rng.multinomial(n, pvals)
else:
if counts_sum < n:
data[start:end] = 0
continue

r = np.arange(length, dtype=np.int32)
unpacked = np.repeat(r, data_i[start:end])
permuted = rng.permutation(unpacked)[:n]

result = np.zeros(length, dtype=np.float64)
for idx in range(permuted.shape[0]):
result[permuted[idx]] += 1

data[start:end] = result
pvals = data[start:end] / counts_sum
data[start:end] = rng.multinomial(n, pvals)


cdef _subsample_without_replacement(cnp.ndarray[cnp.float64_t, ndim=1] data,
cnp.ndarray[cnp.int32_t, ndim=1] indptr,
cnp.int64_t n,
object rng):
"""Subsample non-zero values of a sparse array w/out replacement

Parameters
----------
data : {csr_matrix, csc_matrix}.data
A 1xM sparse vector data
indptr : {csr_matrix, csc_matrix}.indptr
A 1xM sparse vector indptr
n : int
Number of items to subsample from `arr`
rng : Generator instance
A random generator. This will likely be an instance returned
by np.random.default_rng

Returns
-------
ndarray
Subsampled data

Notes
-----
This code was adapted from scikit-bio (`skbio.math._subsample`)

"""
cdef:
cnp.int64_t counts_sum, count_el, perm_count_el
cnp.int64_t count_rem
cnp.ndarray[cnp.int64_t, ndim=1] permuted
Py_ssize_t i, idx
cnp.int32_t length,el,start,end

for i in range(indptr.shape[0] - 1):
start, end = indptr[i], indptr[i+1]
length = end - start
counts_sum = data[start:end].sum()

if counts_sum < n:
data[start:end] = 0
continue

permuted = rng.choice(counts_sum, n, replace=False, shuffle=False)
permuted.sort()

# now need to do reverse mapping
# since I am not using np.repeat anymore
# reminder, old logic was
# r = np.arange(length)
# unpacked = np.repeat(r, data_i[start:end])
# permuted_unpacked = rng.choice(unpacked, n, replace=False, shuffle=False)
sfiligoi marked this conversation as resolved.
Show resolved Hide resolved
#
# specifically, what we're going to do here is randomly pick what elements within
# each sample to keep. this is analogous issuing the prior np.repeat call, and obtaining
# a random set of index positions for that resulting array. however, we do not need to
# perform the np.repeat call as we know the length of that resulting vector already,
# and additionally, we can compute the sample associated with an index in that array
# without constructing it.

el = 0 # index in result/data
count_el = 0 # index in permutted
count_rem = long(data[start]) # since each data has multiple els, sub count there
data[start] = 0.0
for idx in range(n):
perm_count_el = permuted[idx]
sfiligoi marked this conversation as resolved.
Show resolved Hide resolved
# the array is sorted, so just jump ahead
while (perm_count_el - count_el) >= count_rem:
count_el += count_rem
el += 1
count_rem = long(data[start+el])
data[start+el] = 0.0
count_rem -= (perm_count_el - count_el)
count_el = perm_count_el

data[start+el] += 1
# clean up tail elements
data[start+el+1:end] = 0.0


def _subsample(arr, n, with_replacement, rng):
"""Subsample non-zero values of a sparse array

Parameters
----------
arr : {csr_matrix, csc_matrix}
A 1xM sparse vector
n : int
Number of items to subsample from `arr`
with_replacement : bool
Whether to permute or use multinomial sampling
rng : Generator instance
A random generator. This will likely be an instance returned
by np.random.default_rng

Returns
-------
ndarray
Subsampled data

Notes
-----
This code was adapted from scikit-bio (`skbio.math._subsample`)

"""
if (with_replacement):
return _subsample_with_replacement(arr.data, arr.indptr, n, rng)
else:
return _subsample_without_replacement(arr.data, arr.indptr, n, rng)
Loading