biocore · sfiligoi · Jun 9, 2023 · Jun 12, 2023 · Jun 13, 2023
diff --git a/biom/_subsample.pyx b/biom/_subsample.pyx
@@ -6,22 +6,28 @@
 # The full license is in the file COPYING.txt, distributed with this software.
 # -----------------------------------------------------------------------------
 
-
+# distutils: language = c++
+#
 import numpy as np
 cimport numpy as cnp
 
+cdef extern from "_subsample_cpp.cpp":
+    pass
 
-def _subsample(arr, n, with_replacement, rng):
-    """Subsample non-zero values of a sparse array
+cdef extern from "_subsample_cpp.hpp":  # `except +` turns C++ exceptions (e.g. bad_alloc) into Python ones
+    cdef cppclass WeightedSample:  # _n is uint64_t in C++; plain `unsigned long` is only 32 bits on LLP64
+        WeightedSample(unsigned int _max_count, unsigned long long _n, unsigned int random_seed) except +
+        void do_sample(double* data_arr, int start, int end) except +
+
+def _subsample_with_replacement(arr, n, rng):
+    """Subsample non-zero values of a sparse array with replacement
 
     Parameters
     ----------
     arr : {csr_matrix, csc_matrix}
         A 1xM sparse vector
     n : int
         Number of items to subsample from `arr`
-    with_replacement : bool
-        Whether to permute or use multinomial sampling
     rng : Generator instance
         A random generator. This will likely be an instance returned 
         by np.random.default_rng
@@ -39,33 +45,98 @@ def _subsample(arr, n, with_replacement, rng):
     cdef:
         cnp.int64_t counts_sum
         cnp.ndarray[cnp.float64_t, ndim=1] data = arr.data
-        cnp.ndarray[cnp.int64_t, ndim=1] data_i = arr.data.astype(np.int64)
-        cnp.ndarray[cnp.float64_t, ndim=1] result
-        cnp.ndarray[cnp.int32_t, ndim=1] indices = arr.indices
         cnp.ndarray[cnp.int32_t, ndim=1] indptr = arr.indptr
-        cnp.ndarray[cnp.int32_t, ndim=1] permuted, unpacked, r
-        cnp.float64_t cnt
-        Py_ssize_t i, j, length
+        Py_ssize_t i, length
 
     for i in range(indptr.shape[0] - 1):
         start, end = indptr[i], indptr[i+1]
         length = end - start
         counts_sum = data[start:end].sum()
 
-        if with_replacement:
-            pvals = data[start:end] / counts_sum
-            data[start:end] = rng.multinomial(n, pvals)
-        else:
-            if counts_sum < n:
-                data[start:end] = 0
-                continue
-
-            r = np.arange(length, dtype=np.int32)
-            unpacked = np.repeat(r, data_i[start:end])
-            permuted = rng.permutation(unpacked)[:n]
-
-            result = np.zeros(length, dtype=np.float64)
-            for idx in range(permuted.shape[0]):
-                result[permuted[idx]] += 1
-
-            data[start:end] = result
+        pvals = data[start:end] / counts_sum
+        data[start:end] = rng.multinomial(n, pvals)
+
+
+def _subsample_without_replacement(arr, n, rng):
+    """Subsample non-zero values of a sparse array w/out replacement
+
+    Parameters
+    ----------
+    arr : {csr_matrix, csc_matrix}
+        A 1xM sparse vector
+    n : int
+        Number of items to draw; vectors holding fewer are zeroed
+    rng : Generator instance
+        A random generator, e.g. one returned by np.random.default_rng
+
+    Returns
+    -------
+    None
+        `arr.data` is overwritten in place
+
+    Notes
+    -----
+    This code was adapted from scikit-bio (`skbio.math._subsample`)
+
+    """
+    cdef:
+        cnp.int64_t counts_sum
+        cnp.ndarray[cnp.float64_t, ndim=1] data = arr.data
+        cnp.ndarray[cnp.int32_t, ndim=1] indptr = arr.indptr
+        cnp.ndarray[cnp.int32_t, ndim=1] lengths
+        Py_ssize_t i
+        cnp.uint32_t length,max_len
+        cnp.uint64_t cn = n
+        WeightedSample *sample_data
+
+    lengths = np.empty(indptr.shape[0] - 1, dtype=np.int32)
+    for i in range(indptr.shape[0] - 1):
+        start, end = indptr[i], indptr[i+1]
+        length = end - start
+        counts_sum = data[start:end].sum()
+        if counts_sum < n:
+            data[start:end] = 0
+            length = 0 # special value to signal to skip
+        lengths[i] = length
+
+    # with no vectors at all, max() of the empty array would raise
+    max_len = lengths.max() if lengths.shape[0] > 0 else 0
+    sample_data = new WeightedSample(max_len, cn,
+                                     rng.integers(0,2**32, dtype=np.uint32))
+    try:
+        for i in range(indptr.shape[0] - 1):
+            if lengths[i] != 0:
+                sample_data.do_sample(&data[0], indptr[i], indptr[i+1])
+    finally:
+        del sample_data  # allocated with `new`; leaked on every call otherwise
+
+
+def _subsample(arr, n, with_replacement, rng):
+    """Subsample a sparse array, dispatching on the sampling mode
+
+    Parameters
+    ----------
+    arr : {csr_matrix, csc_matrix}
+        A 1xM sparse vector
+    n : int
+        Number of items to subsample from `arr`
+    with_replacement : bool
+        If true, delegate to `_subsample_with_replacement`; otherwise
+        delegate to `_subsample_without_replacement`
+    rng : Generator instance
+        A random generator. This will likely be an instance returned
+        by np.random.default_rng
+
+    Returns
+    -------
+    object
+        Whatever the selected helper returns
+
+    Notes
+    -----
+    This code was adapted from scikit-bio (`skbio.math._subsample`)
+
+    """
+    sampler = (_subsample_with_replacement if with_replacement
+               else _subsample_without_replacement)
+    return sampler(arr, n, rng)
diff --git a/biom/_subsample_cpp.cpp b/biom/_subsample_cpp.cpp
@@ -0,0 +1,107 @@
+// -----------------------------------------------------------------------------
+// Copyright (c) 2023-2023, The BIOM Format Development Team.
+//
+// Distributed under the terms of the Modified BSD License.
+//
+// The full license is in the file COPYING.txt, distributed with this software.
+// -----------------------------------------------------------------------------
+
+#include "_subsample_cpp.hpp"
+#include <algorithm>
+
+// Adapted from unifrac-binaries code:
+// https://github.com/biocore/unifrac-binaries/blob/ba11b1b80c56ae13dff6b6e364352abb2f2b0faa/src/biom_subsampled.cpp#L115
+
+// Equivalent to iterator over np.repeat
+// https://github.com/biocore/biom-format/blob/b0e71a00ecb349a6f5f1ca64a23d71f380ddc19c/biom/_subsample.pyx#LL64C24-L64C55
+class WeightedSampleIterator  // read-only iterator that yields each index i data_in[i] times, in index order
+  {
+  public:
+    // While we do not implement the whole random_access_iterator interface,
+    // we want the algorithm implementations to use operator-, and that requires the random-access tag
+    using iterator_category = std::random_access_iterator_tag;
+    using difference_type   = int64_t;
+    using value_type        = uint32_t;
+    using pointer           = const uint32_t*;
+    using reference         = const uint32_t&;
+
+    WeightedSampleIterator(uint64_t *_data_in, uint32_t _idx, uint64_t _cnt)
+    : data_in(_data_in)
+    , idx(_idx)
+    , cnt(_cnt)
+    {}
+
+    reference operator*() const { return idx; }  // the value is the index itself, not the count
+    pointer operator->() const { return &idx; }
+
+    WeightedSampleIterator& operator++()
+    {  
+       cnt++;  // NOTE(review): a zero data_in[idx] is still visited once here, yet adds 0 in operator-
+       if (cnt>=data_in[idx]) {  // all copies of idx consumed: move on to the next index
+         cnt = 0;
+         idx++;
+       }
+       return *this;
+    }
+
+    WeightedSampleIterator operator++(int) { WeightedSampleIterator tmp = *this; ++(*this); return tmp; }
+
+    friend bool operator== (const WeightedSampleIterator& a, const WeightedSampleIterator& b)
+    {
+       return (a.data_in == b.data_in) && (a.idx == b.idx) && (a.cnt==b.cnt);
+    };
+
+    friend bool operator!= (const WeightedSampleIterator& a, const WeightedSampleIterator& b)
+    {
+       return !((a.data_in == b.data_in) && (a.idx == b.idx) && (a.cnt==b.cnt));
+    };
+
+    friend int64_t operator-(const WeightedSampleIterator& b, const WeightedSampleIterator& a)
+    {
+       int64_t diff = 0;
+       // Preconditions (unchecked): a.data_in == b.data_in, and a does not come
+       // after b, i.e. a.idx <= b.idx and, when a.idx == b.idx, a.cnt <= b.cnt.
+       // The cost is linear in (b.idx - a.idx): the counts in between are summed.
+
+       for (uint32_t i = a.idx; i<b.idx; i++) {
+          diff += a.data_in[i];
+       }
+
+       return diff + b.cnt - a.cnt;
+    };
+
+  private:
+
+    uint64_t *data_in;  // per-index counts; not owned by the iterator
+    uint32_t idx; // index of data_in
+    uint64_t cnt; // how deep in data_in[idx] are we (must be < data_in[idx])
+};
+
+
+WeightedSample::WeightedSample(uint32_t _max_count, uint64_t _n, uint32_t random_seed)
+    : max_count(_max_count)      // capacity of the data and data_out buffers
+    , n(_n)                      // number of items drawn by each do_sample call
+    , generator(random_seed)     // seeded once; its state carries over between calls
+    , data(_max_count)           // buffers are sized here from the arguments and then reused
+    , sample_out(_n)
+    , data_out(_max_count)
+{}
+
+void WeightedSample::do_sample(double* data_base, int start, int end) {
+        // Resample, in place, the counts stored in data_base[start, end).
+        // note: We are assuming sum(counts)>=n and (end-start)<=max_count
+        //      Enforced by the caller (via filtering)
+        double* const counts = data_base+start;
+        const unsigned int length = end-start;
+        // Integer copy of the counts; index i is implicitly repeated data[i] times
+        std::copy(counts, counts+length, data.begin());
+        std::sample(WeightedSampleIterator(data.data(),0,0),
+                    WeightedSampleIterator(data.data(),length,0),
+                    sample_out.begin(), n,
+                    generator);
+        // Tally how often each index was drawn, then write the tallies back
+        std::fill_n(data_out.begin(), length, 0);
+        for (uint64_t k=0; k<n; k++) data_out[sample_out[k]]++;
+        std::copy(data_out.begin(), data_out.begin()+length, counts);
+}
+
diff --git a/biom/_subsample_cpp.hpp b/biom/_subsample_cpp.hpp
@@ -0,0 +1,32 @@
+// -----------------------------------------------------------------------------
+// Copyright (c) 2023-2023, The BIOM Format Development Team.
+//
+// Distributed under the terms of the Modified BSD License.
+//
+// The full license is in the file COPYING.txt, distributed with this software.
+// -----------------------------------------------------------------------------
+
+#ifndef _SUBSAMPLE_HPP
+#define _SUBSAMPLE_HPP
+
+#include <random>
+#include <vector>
+
+class WeightedSample  // draws n items, without replacement, from a vector of counts
+  {
+  public:
+   WeightedSample(uint32_t _max_count, uint64_t _n, uint32_t random_seed);  // sizes the buffers below
+   void do_sample(double* data_arr, int start, int end);  // rewrites data_arr[start, end) in place
+
+private:
+    uint32_t max_count;      // length of `data` and `data_out`
+    uint64_t n;              // items drawn per do_sample call; length of `sample_out`
+    std::mt19937 generator;  // seeded from random_seed at construction
+
+    // use persistent buffer to minimize allocation costs
+    std::vector<uint64_t> data;  // counts of the current vector, converted to integers
+    std::vector<uint32_t> sample_out;     // indices drawn by std::sample
+    std::vector<uint32_t> data_out; // per-index tally of the draws, copied back to the caller
+};
+
+#endif
diff --git a/setup.py b/setup.py
@@ -108,6 +108,7 @@ def run_tests(self):
                         include_dirs=[np.get_include()]),
               Extension("biom._subsample",
                         ["biom/_subsample" + ext],
+                        extra_compile_args=["-std=c++17"],
                         include_dirs=[np.get_include()])]
 extensions = cythonize(extensions)