Faster merge (#933)

* PERF: list-based bisection for merge
* Add some comments
* Use coo
* STY: flake8
* DOC: perf mention
biocore · May 12, 2023 · b0e71a0 · b0e71a0
1 parent ded189b
commit b0e71a0
Show file tree

Hide file tree

Showing 2 changed files with 53 additions and 43 deletions.
diff --git a/ChangeLog.md b/ChangeLog.md
@@ -1,6 +1,13 @@
 BIOM-Format ChangeLog
 =====================
 
+biom 2.1.15-dev
+---------------
+
+Performance improvements:
+
+* Revise `Table._fast_merge` to use COO directly. For very large tables, this reduces runtime by ~50x and memory by ~5x. See PR [#933](https://github.com/biocore/biom-format/pull/933).
+
 biom 2.1.15
 -----------
 

diff --git a/biom/table.py b/biom/table.py
@@ -178,7 +178,7 @@
 from datetime import datetime
 from json import dumps as _json_dumps, JSONEncoder
 from functools import reduce, partial
-from operator import itemgetter, or_
+from operator import itemgetter
 from collections import defaultdict
 from collections.abc import Hashable, Iterable
 from numpy import ndarray, asarray, zeros, newaxis
@@ -3638,54 +3638,57 @@ def _fast_merge(self, others):
         tables = [self] + others
 
         # gather all identifiers across tables
-        all_features = reduce(or_, [set(t.ids(axis='observation'))
-                                    for t in tables])
-        all_samples = reduce(or_, [set(t.ids()) for t in tables])
+        all_features = set(np.hstack([t.ids(axis='observation')
+                                      for t in tables]))
+        all_samples = set(np.hstack([t.ids() for t in tables]))
+
+        # produce a new stable order
+        feature_order = sorted(all_features)
+        sample_order = sorted(all_samples)
 
         # generate unique integer ids for the identifiers, and let's order
         # it to be polite
-        feature_map = {i: idx for idx, i in enumerate(sorted(all_features))}
-        sample_map = {i: idx for idx, i in enumerate(sorted(all_samples))}
+        feature_map = {i: idx for idx, i in enumerate(feature_order)}
+        sample_map = {i: idx for idx, i in enumerate(sample_order)}
 
-        # produce a new stable order
-        get1 = lambda x: x[1]  # noqa
-        feature_order = [k for k, v in sorted(feature_map.items(), key=get1)]
-        sample_order = [k for k, v in sorted(sample_map.items(), key=get1)]
+        ntuples = sum([t.nnz for t in tables])
 
-        mi = []
-        values = []
+        # we're going to aggregate in COO. per scipy, it is efficient for
+        # construction of large matrices. importantly, it allows for
+        # duplicates which in this case correspond to multiple values for
+        # the same sample/feature across tables. the duplicates are summed
+        # implicitly on conversion to csr/csc.
+        rows = np.empty(ntuples, dtype=np.int32)
+        cols = np.empty(ntuples, dtype=np.int32)
+        data = np.empty(ntuples, dtype=self.matrix_data.dtype)
+
+        offset = 0
         for table in tables:
-            # these data are effectively [((row_index, col_index), value), ]
-            data_as_dok = table.matrix_data.todok()
-
-            # construct a map of the feature integer index to what it is in
-            # the full table
-            feat_ids = table.ids(axis='observation')
-            samp_ids = table.ids()
-            table_features = {idx: feature_map[i]
-                              for idx, i in enumerate(feat_ids)}
-            table_samples = {idx: sample_map[i]
-                             for idx, i in enumerate(samp_ids)}
-
-            for (f, s), v in data_as_dok.items():
-                # collect the indices and values, adjusting the indices as we
-                # go
-                mi.append((table_features[f], table_samples[s]))
-                values.append(v)
-
-        # construct a multiindex of the indices where the outer index is the
-        # feature and the inner index is the sample
-        mi = pd.MultiIndex.from_tuples(mi)
-        grouped = pd.Series(values, index=mi)
-
-        # aggregate the values where the outer and inner values in the
-        # multiindex are the same
-        collapsed_rcv = grouped.groupby(level=[0, 1]).sum()
-
-        # convert into a representation understood by the Table constructor
-        list_list = [[r, c, v] for (r, c), v in collapsed_rcv.items()]
-
-        return self.__class__(list_list, feature_order, sample_order)
+            t_nnz = table.nnz
+
+            coo = table.matrix_data.tocoo()
+
+            # we need to map the index positions in the current table to the
+            # index positions in the full matrix
+            row_map = np.array([feature_map[i]
+                                for i in table.ids(axis='observation')],
+                               dtype=np.int32)
+            col_map = np.array([sample_map[i]
+                                for i in table.ids()],
+                               dtype=np.int32)
+            coo.row = row_map[coo.row]
+            coo.col = col_map[coo.col]
+
+            # store our coo data
+            rows[offset:offset + t_nnz] = coo.row
+            cols[offset:offset + t_nnz] = coo.col
+            data[offset:offset + t_nnz] = coo.data
+            offset += t_nnz
+
+        coo = coo_matrix((data, (rows, cols)),
+                         shape=(len(feature_order), len(sample_order)))
+
+        return self.__class__(coo.tocsr(), feature_order, sample_order)
 
     def merge(self, other, sample='union', observation='union',
               sample_metadata_f=prefer_self,