Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement subsample_without_replacement using C++17 std::sample #934

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 99 additions & 28 deletions biom/_subsample.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,28 @@
# The full license is in the file COPYING.txt, distributed with this software.
# -----------------------------------------------------------------------------


# distutils: language = c++
#
import numpy as np
cimport numpy as cnp

cdef extern from "_subsample_cpp.cpp":
pass

def _subsample(arr, n, with_replacement, rng):
"""Subsample non-zero values of a sparse array
cdef extern from "_subsample_cpp.hpp":
cdef cppclass WeightedSample:
WeightedSample(unsigned int _max_count, unsigned long _n, unsigned int random_seed)
void do_sample(double* data_arr, int start, int end)

def _subsample_with_replacement(arr, n, rng):
"""Subsample non-zero values of a sparse array with replacement

Parameters
----------
arr : {csr_matrix, csc_matrix}
A 1xM sparse vector
n : int
Number of items to subsample from `arr`
with_replacement : bool
Whether to permute or use multinomial sampling
rng : Generator instance
A random generator. This will likely be an instance returned
by np.random.default_rng
Expand All @@ -39,33 +45,98 @@ def _subsample(arr, n, with_replacement, rng):
cdef:
cnp.int64_t counts_sum
cnp.ndarray[cnp.float64_t, ndim=1] data = arr.data
cnp.ndarray[cnp.int64_t, ndim=1] data_i = arr.data.astype(np.int64)
cnp.ndarray[cnp.float64_t, ndim=1] result
cnp.ndarray[cnp.int32_t, ndim=1] indices = arr.indices
cnp.ndarray[cnp.int32_t, ndim=1] indptr = arr.indptr
cnp.ndarray[cnp.int32_t, ndim=1] permuted, unpacked, r
cnp.float64_t cnt
Py_ssize_t i, j, length
Py_ssize_t i, length

for i in range(indptr.shape[0] - 1):
start, end = indptr[i], indptr[i+1]
length = end - start
counts_sum = data[start:end].sum()

if with_replacement:
pvals = data[start:end] / counts_sum
data[start:end] = rng.multinomial(n, pvals)
else:
if counts_sum < n:
data[start:end] = 0
continue

r = np.arange(length, dtype=np.int32)
unpacked = np.repeat(r, data_i[start:end])
permuted = rng.permutation(unpacked)[:n]

result = np.zeros(length, dtype=np.float64)
for idx in range(permuted.shape[0]):
result[permuted[idx]] += 1

data[start:end] = result
pvals = data[start:end] / counts_sum
data[start:end] = rng.multinomial(n, pvals)


def _subsample_without_replacement(arr, n, rng):
    """Subsample non-zero values of a sparse array w/out replacement

    Parameters
    ----------
    arr : {csr_matrix, csc_matrix}
        A sparse matrix. Every vector delimited by consecutive entries of
        ``arr.indptr`` is subsampled independently.
    n : int
        Number of items to subsample from each vector of `arr`
    rng : Generator instance
        A random generator. This will likely be an instance returned
        by np.random.default_rng. It is only used to draw a single 32 bit
        seed for the C++ sampler.

    Returns
    -------
    None
        ``arr.data`` is overwritten in place. Vectors whose counts sum to
        less than `n` are set to zero.

    Notes
    -----
    This code was adapted from scikit-bio (`skbio.math._subsample`)

    """
    cdef:
        cnp.int64_t counts_sum
        cnp.ndarray[cnp.float64_t, ndim=1] data = arr.data
        cnp.ndarray[cnp.int32_t, ndim=1] indptr = arr.indptr
        cnp.ndarray[cnp.int32_t, ndim=1] lengths
        Py_ssize_t i
        Py_ssize_t n_vectors = indptr.shape[0] - 1
        cnp.uint32_t length, max_len
        cnp.uint64_t cn = n
        WeightedSample *sample_data

    # Nothing to do for a matrix without vectors; this also avoids calling
    # max() on an empty array below, which would raise.
    if n_vectors <= 0:
        return

    # First pass: record the length of every vector that can be sampled and
    # zero out the ones that do not hold enough counts.
    lengths = np.empty(n_vectors, dtype=np.int32)
    for i in range(n_vectors):
        start, end = indptr[i], indptr[i+1]
        length = end - start
        counts_sum = data[start:end].sum()
        if counts_sum < n:
            data[start:end] = 0
            length = 0  # special value to signal to skip
        lengths[i] = length

    # The sampler's scratch buffers are sized once, for the longest vector.
    max_len = lengths.max()

    sample_data = new WeightedSample(max_len, cn,
                                     rng.integers(0, 2**32, dtype=np.uint32))
    try:
        # Second pass: sample every vector that was not skipped above.
        for i in range(n_vectors):
            if lengths[i] == 0:
                continue
            sample_data.do_sample(&data[0], indptr[i], indptr[i+1])
    finally:
        # The sampler is heap allocated; release it even if sampling fails.
        del sample_data


def _subsample(arr, n, with_replacement, rng):
    """Subsample non-zero values of a sparse array

    Thin dispatcher that forwards to the with-replacement or the
    without-replacement implementation.

    Parameters
    ----------
    arr : {csr_matrix, csc_matrix}
        The sparse data to subsample
    n : int
        Number of items to subsample from `arr`
    with_replacement : bool
        If truthy, `_subsample_with_replacement` is used, otherwise
        `_subsample_without_replacement`
    rng : Generator instance
        A random generator. This will likely be an instance returned
        by np.random.default_rng

    Returns
    -------
    object
        Whatever the selected implementation returns

    Notes
    -----
    This code was adapted from scikit-bio (`skbio.math._subsample`)

    """
    sampler = (_subsample_with_replacement if with_replacement
               else _subsample_without_replacement)
    return sampler(arr, n, rng)
107 changes: 107 additions & 0 deletions biom/_subsample_cpp.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
// -----------------------------------------------------------------------------
// Copyright (c) 2023-2023, The BIOM Format Development Team.
//
// Distributed under the terms of the Modified BSD License.
//
// The full license is in the file COPYING.txt, distributed with this software.
// -----------------------------------------------------------------------------

#include "_subsample_cpp.hpp"
#include <algorithm>

// Adapted from unifrac-binaries code:
// https://github.com/biocore/unifrac-binaries/blob/ba11b1b80c56ae13dff6b6e364352abb2f2b0faa/src/biom_subsampled.cpp#L115

// Equivalent to iterator over np.repeat
// https://github.com/biocore/biom-format/blob/b0e71a00ecb349a6f5f1ca64a23d71f380ddc19c/biom/_subsample.pyx#LL64C24-L64C55
// Read-only iterator over a weighted index sequence: walking it yields index
// 0 data_in[0] times, then index 1 data_in[1] times, and so on, without ever
// materialising the repeated sequence.
//
// A position is the pair (idx, cnt): `idx` is the index currently being
// repeated and `cnt` counts how many repetitions of it were already passed.
//
// NOTE: an entry whose count is zero is still visited once by operator++,
// while operator- counts it as zero elements.
class WeightedSampleIterator
{
public:
    // Only dereference, increment, (in)equality and difference are
    // implemented. The random-access category is advertised anyway because
    // we want library code to obtain distances through operator-.
    using iterator_category = std::random_access_iterator_tag;
    using difference_type   = int64_t;
    using value_type        = uint32_t;
    using pointer           = const uint32_t*;
    using reference         = const uint32_t&;

    WeightedSampleIterator(uint64_t *_data_in, uint32_t _idx, uint64_t _cnt)
        : data_in(_data_in), idx(_idx), cnt(_cnt)
    {}

    // The value of the iterator is the index it currently points at.
    reference operator*() const { return idx; }
    pointer operator->() const { return &idx; }

    // Advance by one repetition; move on to the next index once the
    // current one has been repeated data_in[idx] times.
    WeightedSampleIterator& operator++()
    {
        ++cnt;
        if (cnt < data_in[idx]) return *this;

        cnt = 0;
        ++idx;
        return *this;
    }

    WeightedSampleIterator operator++(int)
    {
        const WeightedSampleIterator before(*this);
        ++(*this);
        return before;
    }

    // Two iterators are equal when they refer to the same buffer and the
    // same (idx, cnt) position.
    friend bool operator==(const WeightedSampleIterator& lhs, const WeightedSampleIterator& rhs)
    {
        return lhs.data_in == rhs.data_in && lhs.idx == rhs.idx && lhs.cnt == rhs.cnt;
    }

    friend bool operator!=(const WeightedSampleIterator& lhs, const WeightedSampleIterator& rhs)
    {
        return !(lhs == rhs);
    }

    // Number of increments needed to get from `a` to `b`.
    // Both must refer to the same buffer and `a` must not be past `b`.
    friend int64_t operator-(const WeightedSampleIterator& b, const WeightedSampleIterator& a)
    {
        // Sum the full counts of every index in [a.idx, b.idx) ...
        uint64_t whole = 0;
        uint32_t k = a.idx;
        while (k < b.idx) {
            whole += a.data_in[k];
            ++k;
        }

        // ... then correct for the partial progress inside each end.
        return static_cast<int64_t>(whole + b.cnt - a.cnt);
    }

private:
    uint64_t *data_in; // per-index repetition counts
    uint32_t idx;      // index of data_in
    uint64_t cnt;      // how deep in data_in[idx] are we (must be < data_in[idx])
};


// Set up a sampler that draws _n items per call. All scratch buffers are
// allocated here, once, so that do_sample does not have to allocate:
// `data` and `data_out` get _max_count entries, `sample_out` gets _n entries.
// The random generator is seeded with random_seed.
WeightedSample::WeightedSample(uint32_t _max_count, uint64_t _n, uint32_t random_seed)
    : max_count(_max_count),
      n(_n),
      generator(random_seed),
      data(_max_count),
      sample_out(_n),
      data_out(_max_count)
{
}

// Subsample, in place and without replacement, the counts stored in
// data_base[start, end).
//
// Each entry of the slice is treated as a weight (number of copies of its
// index). `n` copies are drawn at random and the slice is overwritten with
// how many draws landed on each entry.
//
// Precondition: the counts in the slice sum to at least `n`; the caller
// filters out slices that do not. If the slice holds fewer items, only the
// items actually drawn are tallied.
//
// NOTE(review): WeightedSampleIterator still visits an entry whose count is
// zero once, so slices are presumably expected to contain no explicit zeros;
// confirm against the caller.
void WeightedSample::do_sample(double* data_base, int start, int end) {
    // Empty (or inverted) slice: nothing to sample, and the unsigned length
    // below must not wrap around.
    if (end <= start) return;

    double* const data_arr = data_base + start;
    const auto length = static_cast<uint32_t>(end - start);

    // The scratch buffers were sized from the constructor's max_count; grow
    // them instead of writing past their end if a longer slice shows up.
    if (length > data.size()) data.resize(length);
    if (length > data_out.size()) data_out.resize(length);

    // Counts arrive as doubles; the iterator needs integer weights.
    for (uint32_t j = 0; j < length; j++) data[j] = static_cast<uint64_t>(data_arr[j]);
    std::fill(data_out.begin(), data_out.begin() + length, 0u);

    // std::sample writes min(n, population size) indices and returns the end
    // of what it wrote; tally only that range so stale entries left over
    // from a previous call are never counted.
    const auto drawn_end = std::sample(WeightedSampleIterator(data.data(), 0, 0),
                                       WeightedSampleIterator(data.data(), length, 0),
                                       sample_out.begin(), n,
                                       generator);

    for (auto it = sample_out.begin(); it != drawn_end; ++it) data_out[*it]++;

    // Publish the per-entry draw counts back into the caller's array.
    for (uint32_t j = 0; j < length; j++) data_arr[j] = data_out[j];
}

32 changes: 32 additions & 0 deletions biom/_subsample_cpp.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// -----------------------------------------------------------------------------
// Copyright (c) 2023-2023, The BIOM Format Development Team.
//
// Distributed under the terms of the Modified BSD License.
//
// The full license is in the file COPYING.txt, distributed with this software.
// -----------------------------------------------------------------------------

#ifndef _SUBSAMPLE_HPP
#define _SUBSAMPLE_HPP

#include <random>
#include <vector>

// Reusable sampler that draws a fixed number of items, without replacement,
// from slices of an array of counts. The buffers it needs are allocated once
// in the constructor and reused by every do_sample call.
class WeightedSample
{
public:
// _max_count:  number of entries allocated for the per-slice buffers
//              (`data` and `data_out`)
// _n:          number of items drawn by each do_sample call; also the size
//              of `sample_out`
// random_seed: seed for the random generator
WeightedSample(uint32_t _max_count, uint64_t _n, uint32_t random_seed);
// Subsample the slice data_arr[start, end) in place: on return every entry
// holds the number of draws that landed on it.
void do_sample(double* data_arr, int start, int end);

private:
// NOTE: the constructor's initializer list reads max_count and n while
// building the vectors below, so these two must stay declared first.
uint32_t max_count;
uint64_t n;
std::mt19937 generator;

// use persistent buffer to minimize allocation costs
std::vector<uint64_t> data; // original values
std::vector<uint32_t> sample_out; // random output buffer
std::vector<uint32_t> data_out; // computed values
};

#endif
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ def run_tests(self):
include_dirs=[np.get_include()]),
Extension("biom._subsample",
["biom/_subsample" + ext],
extra_compile_args=["-std=c++17"],
include_dirs=[np.get_include()])]
extensions = cythonize(extensions)

Expand Down