From a54cd4b5b1649b626b6ad991197b340010f6f9e3 Mon Sep 17 00:00:00 2001
From: Scott Gigante
Date: Wed, 6 Feb 2019 10:52:49 -0500
Subject: [PATCH] deprecate adaptive_k and matrix theta, fix tests

---
 graphtools/base.py   |  28 +++++++++++
 graphtools/graphs.py | 112 +++++--------------------------------------
 test/test_mnn.py     |  83 +++++++++++++++-----------------
 3 files changed, 79 insertions(+), 144 deletions(-)

diff --git a/graphtools/base.py b/graphtools/base.py
index 55d72c5..2dc8736 100644
--- a/graphtools/base.py
+++ b/graphtools/base.py
@@ -65,6 +65,9 @@ def _get_param_names(cls):
         return parameters
 
     def set_params(self, **kwargs):
+        # for k in kwargs:
+        #     raise TypeError("set_params() got an unexpected "
+        #                     "keyword argument '{}'".format(k))
         return self
@@ -866,3 +869,28 @@ def interpolate(self, transform, transitions=None, Y=None):
             transitions = self.extend_to_data(Y)
         Y_transform = transitions.dot(transform)
         return Y_transform
+
+    def set_params(self, **params):
+        """Set parameters on this object
+
+        Safe setter method - attributes should not be modified directly as some
+        changes are not valid.
+        Valid parameters:
+        - n_jobs
+        - verbose
+
+        Parameters
+        ----------
+        params : key-value pairs of parameter name and new values
+
+        Returns
+        -------
+        self
+        """
+        if 'n_jobs' in params:
+            self.n_jobs = params['n_jobs']
+        if 'verbose' in params:
+            self.verbose = params['verbose']
+            tasklogger.set_level(self.verbose)
+        super().set_params(**params)
+        return self
diff --git a/graphtools/graphs.py b/graphtools/graphs.py
index 29653a3..298bc7e 100644
--- a/graphtools/graphs.py
+++ b/graphtools/graphs.py
@@ -2,12 +2,12 @@
 from builtins import super
 import numpy as np
 from sklearn.neighbors import NearestNeighbors
-from scipy.spatial.distance import pdist, cdist
-from scipy.spatial.distance import squareform
 from sklearn.utils.extmath import randomized_svd
 from sklearn.preprocessing import normalize
 from sklearn.cluster import MiniBatchKMeans
 from sklearn.utils.graph import graph_shortest_path
+from scipy.spatial.distance import pdist, cdist
+from scipy.spatial.distance import squareform
 from scipy import sparse
 import numbers
 import warnings
@@ -1038,8 +1038,8 @@ class MNNGraph(DataGraph):
 
     def __init__(self, data, sample_idx,
                  knn=5, beta=1, n_pca=None,
-                 adaptive_k=None,
                  decay=None,
+                 adaptive_k=None,
                  bandwidth=None,
                  distance='euclidean',
                  thresh=1e-4,
@@ -1049,14 +1049,12 @@ def __init__(self, data, sample_idx,
         self.sample_idx = sample_idx
         self.samples, self.n_cells = np.unique(
             self.sample_idx, return_counts=True)
-        self.adaptive_k = adaptive_k
         self.knn = knn
         self.decay = decay
         self.distance = distance
         self.bandwidth = bandwidth
         self.thresh = thresh
         self.n_jobs = n_jobs
-        self.weighted_knn = self._weight_knn()
 
         if sample_idx is None:
             raise ValueError("sample_idx must be given. For a graph without"
@@ -1068,78 +1066,25 @@ def __init__(self, data, sample_idx,
         elif len(self.samples) == 1:
             raise ValueError(
                 "sample_idx must contain more than one unique value")
+        if adaptive_k is not None:
+            warnings.warn("`adaptive_k` has been deprecated. Using fixed knn.",
+                          DeprecationWarning)
         super().__init__(data, n_pca=n_pca, **kwargs)
 
     def _check_symmetrization(self, kernel_symm, theta):
         if kernel_symm == 'theta' and theta is not None and \
                 not isinstance(theta, numbers.Number):
-            # matrix theta
-            try:
-                theta.shape
-            except AttributeError:
-                raise ValueError("theta {} not recognized. "
" - "Expected a float between 0 and 1 " - "or a [n_batch,n_batch] matrix of " - "floats between 0 and 1".format(theta)) - if not np.shape(theta) == (len(self.samples), - len(self.samples)): - raise ValueError( - "Matrix theta must be of shape " - "({}), got ({})".format( - (len(self.samples), - len(self.samples)), theta.shape)) - elif np.max(theta) > 1 or np.min(theta) < 0: - raise ValueError( - "Values in matrix theta must be between" - " 0 and 1, got values between {} and {}".format( - np.max(theta), np.min(theta))) - elif np.any(theta != theta.T): - raise ValueError("theta must be a symmetric matrix") + raise TypeError("Expected `theta` as a float. " + "Got {}.".format(type(theta))) else: super()._check_symmetrization(kernel_symm, theta) - def _weight_knn(self, sample_size=None): - """Select adaptive values of knn - - Parameters - ---------- - - sample_size : `int` or `None` - Number of cells in the sample in question. Used only for - out-of-sample extension. If `None`, calculates within-sample - knn values. - - Returns - ------- - - knn : array-like or `int`, weighted knn values - """ - if sample_size is None: - # calculate within sample knn values - sample_size = self.n_cells - if self.adaptive_k == 'min': - # the smallest sample has k - knn_weight = self.n_cells / np.min(self.n_cells) - elif self.adaptive_k == 'mean': - # the average sample has k - knn_weight = self.n_cells / np.mean(self.n_cells) - elif self.adaptive_k == 'sqrt': - # the samples are sqrt'd first, then smallest has k - knn_weight = np.sqrt(self.n_cells / np.min(self.n_cells)) - elif self.adaptive_k is None: - knn_weight = np.repeat(1, len(self.n_cells)) - weighted_knn = np.round(self.knn * knn_weight).astype(np.int32) - if len(weighted_knn) == 1: - weighted_knn = weighted_knn[0] - return weighted_knn - def get_params(self): """Get parameters from this object """ params = super().get_params() params.update({'beta': self.beta, - 'adaptive_k': self.adaptive_k, 'knn': self.knn, 'decay': self.decay, 'bandwidth': self.bandwidth, @@ -1176,9 +1121,6 @@ def set_params(self, **params): # mnn specific arguments if 'beta' in params and params['beta'] != self.beta: raise ValueError("Cannot update beta. Please create a new graph") - if 'adaptive_k' in params and params['adaptive_k'] != self.adaptive_k: - raise ValueError( - "Cannot update adaptive_k. 
Please create a new graph") # knn arguments knn_kernel_args = ['knn', 'decay', 'distance', 'thresh', 'bandwidth'] @@ -1216,12 +1158,12 @@ def build_kernel(self): tasklogger.log_debug("subgraph {}: sample {}, " "n = {}, knn = {}".format( i, idx, np.sum(self.sample_idx == idx), - self.weighted_knn[i])) + self.knn)) # select data for sample data = self.data_nu[self.sample_idx == idx] # build a kNN graph for cells within sample graph = Graph(data, n_pca=None, - knn=self.weighted_knn[i], + knn=self.knn, decay=self.decay, bandwidth=self.bandwidth, distance=self.distance, @@ -1229,6 +1171,7 @@ def build_kernel(self): verbose=self.verbose, random_state=self.random_state, n_jobs=self.n_jobs, + kernel_symm='+', initialize=True) self.subgraphs.append(graph) # append to list of subgraphs tasklogger.log_complete("subgraphs") @@ -1251,7 +1194,7 @@ def build_kernel(self): self.samples[j])) Kij = Y.build_kernel_to_data( X.data_nu, - knn=self.weighted_knn[i]) + knn=self.knn) between_batch_norm = np.array(np.sum(Kij, 1)).flatten() scale = np.minimum(1, within_batch_norm / between_batch_norm) * self.beta @@ -1267,37 +1210,6 @@ def build_kernel(self): tasklogger.log_complete("MNN kernel") return K - def symmetrize_kernel(self, K): - if self.kernel_symm == 'theta' and self.theta is not None and \ - not isinstance(self.theta, numbers.Number): - # matrix theta - # Theta can be a matrix with specific values transitions for - # each batch. This allows for technical replicates and - # experimental samples to be corrected simultaneously - tasklogger.log_debug("Using theta symmetrization. " - "Theta:\n{}".format(self.theta)) - for i, sample_i in enumerate(self.samples): - for j, sample_j in enumerate(self.samples): - if j < i: - continue - Kij = K[np.ix_(self.sample_idx == sample_i, - self.sample_idx == sample_j)] - Kji = K[np.ix_(self.sample_idx == sample_j, - self.sample_idx == sample_i)] - Kij_symm = self.theta[i, j] * \ - elementwise_minimum(Kij, Kji.T) + \ - (1 - self.theta[i, j]) * \ - elementwise_maximum(Kij, Kji.T) - K = set_submatrix(K, self.sample_idx == sample_i, - self.sample_idx == sample_j, Kij_symm) - if not i == j: - K = set_submatrix(K, self.sample_idx == sample_j, - self.sample_idx == sample_i, - Kij_symm.T) - else: - K = super().symmetrize_kernel(K) - return K - def build_kernel_to_data(self, Y, theta=None): """Build transition matrix from new data to the graph diff --git a/test/test_mnn.py b/test/test_mnn.py index 5344e25..3cbdb2c 100644 --- a/test/test_mnn.py +++ b/test/test_mnn.py @@ -51,8 +51,8 @@ def test_build_mnn_with_precomputed(): build_graph(data, n_pca=None, graphtype='mnn', precomputed='distance') -@raises(ValueError) -def test_mnn_with_square_theta_wrong_length(): +@raises(TypeError) +def test_mnn_with_matrix_theta(): n_sample = len(np.unique(digits['target'])) # square matrix theta of the wrong size build_graph( @@ -60,11 +60,11 @@ def test_mnn_with_square_theta_wrong_length(): decay=10, knn=5, random_state=42, sample_idx=digits['target'], kernel_symm='theta', - theta=np.tile(np.linspace(0, 1, n_sample - 1), - n_sample).reshape(n_sample - 1, n_sample)) + theta=np.tile(np.linspace(0, 1, n_sample), + n_sample).reshape(n_sample, n_sample)) -@raises(ValueError) +@raises(TypeError) def test_mnn_with_vector_theta(): n_sample = len(np.unique(digits['target'])) # vector theta @@ -86,7 +86,7 @@ def test_mnn_with_unbounded_theta(): theta=2) -@raises(ValueError) +@raises(TypeError) def test_mnn_with_string_theta(): build_graph( data, thresh=0, n_pca=20, @@ -135,6 +135,16 @@ def 
         kernel_symm='theta')
 
 
+@warns(DeprecationWarning)
+def test_mnn_adaptive_k():
+    build_graph(
+        data, thresh=0, n_pca=20,
+        decay=10, knn=5, random_state=42,
+        sample_idx=digits['target'],
+        kernel_symm='theta',
+        theta=0.9, adaptive_k='sqrt')
+
+
 def test_mnn_with_non_zero_indexed_sample_idx():
     X, sample_idx = generate_swiss_roll()
     G = build_graph(X, sample_idx=sample_idx,
@@ -171,14 +181,13 @@ def test_mnn_with_string_sample_idx():
 
 #####################################################
 # Check kernel
 #####################################################
 
-
-def test_mnn_graph_float_theta():
+def test_mnn_graph_no_decay():
     X, sample_idx = generate_swiss_roll()
     theta = 0.9
     k = 10
-    a = 20
+    a = None
     metric = 'euclidean'
-    beta = 0.5
+    beta = 0.2
     samples = np.unique(sample_idx)
 
     K = np.zeros((len(X), len(X)))
     K = pd.DataFrame(K)
     for si in samples:
         X_i = X[sample_idx == si]  # get observations in sample i
         for sj in samples:
+            batch_k = k + 1 if si == sj else k
             X_j = X[sample_idx == sj]  # get observation in sample j
             pdx_ij = cdist(X_i, X_j, metric=metric)  # pairwise distances
             kdx_ij = np.sort(pdx_ij, axis=1)  # get kNN
-            e_ij = kdx_ij[:, k]  # dist to kNN
-            pdxe_ij = pdx_ij / e_ij[:, np.newaxis]  # normalize
-            k_ij = np.exp(-1 * (pdxe_ij ** a))  # apply alpha-decaying kernel
+            e_ij = kdx_ij[:, batch_k - 1]  # dist to kNN
+            k_ij = np.where(pdx_ij <= e_ij[:, None], 1, 0)  # apply knn kernel
             if si == sj:
                 K.iloc[sample_idx == si, sample_idx == sj] = (
                     k_ij + k_ij.T) / 2
             else:
                 # fill out values in K for NN on diagonal
                 K.iloc[sample_idx == si, sample_idx == sj] = k_ij
+
     Kn = K.copy()
     for i in samples:
         curr_K = K.iloc[sample_idx == i, sample_idx == i]
         i_norm = norm(curr_K, 1, axis=1)
         for j in samples:
             if i == j:
                 continue
             else:
                 curr_K = K.iloc[sample_idx == i, sample_idx == j]
                 curr_norm = norm(curr_K, 1, axis=1)
-                scale = np.minimum(
-                    np.ones(len(curr_norm)), i_norm / curr_norm) * beta
-                Kn.iloc[sample_idx == i, sample_idx == j] = (
-                    curr_K.T * scale).T
+                scale = np.minimum(1, i_norm / curr_norm) * beta
+                Kn.iloc[sample_idx == i,
+                        sample_idx == j] = curr_K.values * scale[:, None]
     K = Kn
 
     W = np.array((theta * np.minimum(K, K.T)) +
                  ((1 - theta) * np.maximum(K, K.T)))
     np.fill_diagonal(W, 0)
     G = pygsp.graphs.Graph(W)
     G2 = graphtools.Graph(X, knn=k, decay=a, beta=beta,
                           kernel_symm='theta', theta=theta,
                           distance=metric, sample_idx=sample_idx,
                           use_pygsp=True)
     assert G.N == G2.N
     np.testing.assert_array_equal(G.dw, G2.dw)
     assert isinstance(G2, graphtools.graphs.MNNGraph)
 
 
-def test_mnn_graph_matrix_theta():
+def test_mnn_graph_decay():
     X, sample_idx = generate_swiss_roll()
-    bs = 0.8
-    theta = np.array([[1, bs],  # 0
-                      [bs,  1]])  # 3
+    theta = 0.9
     k = 10
     a = 20
     metric = 'euclidean'
-    beta = 0
+    beta = 0.2
     samples = np.unique(sample_idx)
 
     K = np.zeros((len(X), len(X)))
     K = pd.DataFrame(K)
     for si in samples:
         X_i = X[sample_idx == si]  # get observations in sample i
         for sj in samples:
+            batch_k = k if si == sj else k - 1
             X_j = X[sample_idx == sj]  # get observation in sample j
             pdx_ij = cdist(X_i, X_j, metric=metric)  # pairwise distances
             kdx_ij = np.sort(pdx_ij, axis=1)  # get kNN
-            e_ij = kdx_ij[:, k]  # dist to kNN
+            e_ij = kdx_ij[:, batch_k]  # dist to kNN
             pdxe_ij = pdx_ij / e_ij[:, np.newaxis]  # normalize
             k_ij = np.exp(-1 * (pdxe_ij ** a))  # apply alpha-decaying kernel
             if si == sj:
                 K.iloc[sample_idx == si, sample_idx == sj] = (
                     k_ij + k_ij.T) / 2
             else:
                 # fill out values in K for NN on diagonal
                 K.iloc[sample_idx == si, sample_idx == sj] = k_ij
+
     Kn = K.copy()
     for i in samples:
         curr_K = K.iloc[sample_idx == i, sample_idx == i]
         i_norm = norm(curr_K, 1, axis=1)
         for j in samples:
             if i == j:
                 continue
             else:
                 curr_K = K.iloc[sample_idx == i, sample_idx == j]
                 curr_norm = norm(curr_K, 1, axis=1)
-                scale = np.minimum(
-                    np.ones(len(curr_norm)), i_norm / curr_norm) * beta
-                Kn.iloc[sample_idx == i, sample_idx == j] = (
-                    curr_K.T * scale).T
+                scale = np.minimum(1, i_norm / curr_norm) * beta
+                Kn.iloc[sample_idx == i,
+                        sample_idx == j] = curr_K.values * scale[:, None]
     K = Kn
-
-    K = np.array(K)
-
-    matrix_theta = pd.DataFrame(np.zeros((len(sample_idx), len(sample_idx))))
-    for ix, si in enumerate(set(sample_idx)):
-        for jx, sj in enumerate(set(sample_idx)):
-            matrix_theta.iloc[sample_idx == si,
-                              sample_idx == sj] = theta[ix, jx]
-
-    W = np.array((matrix_theta * np.minimum(K, K.T)) +
-                 ((1 - matrix_theta) * np.maximum(K, K.T)))
+
+    W = np.array((theta * np.minimum(K, K.T)) +
+                 ((1 - theta) * np.maximum(K, K.T)))
     np.fill_diagonal(W, 0)
     G = pygsp.graphs.Graph(W)
     G2 = graphtools.Graph(X, knn=k, decay=a, beta=beta,
                           kernel_symm='theta', theta=theta,
                           distance=metric, sample_idx=sample_idx,
                           use_pygsp=True)
     assert G.N == G2.N
     np.testing.assert_array_equal(G.dw, G2.dw)
-    assert (G.W != G2.W).nnz == 0
-    assert (G2.W != G.W).sum() == 0
+    np.testing.assert_array_equal((G.W - G2.W).data, 0)
     assert isinstance(G2, graphtools.graphs.MNNGraph)
@@ -329,7 +327,6 @@ def test_set_params():
         'theta': 0.5,
         'anisotropy': 0,
         'beta': 1,
-        'adaptive_k': None,
         'knn': 3,
         'decay': 10,
         'bandwidth': None,
@@ -356,10 +353,8 @@ def test_set_params():
     assert_raises(ValueError, G.set_params, distance='manhattan')
     assert_raises(ValueError, G.set_params, thresh=1e-3)
     assert_raises(ValueError, G.set_params, beta=0.2)
-    assert_raises(ValueError, G.set_params, adaptive_k='min')
     G.set_params(knn=G.knn,
                  decay=G.decay,
                  thresh=G.thresh,
                  distance=G.distance,
-                 beta=G.beta,
-                 adaptive_k=G.adaptive_k)
+                 beta=G.beta)
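
Usage note (illustrative sketch, not part of the patch): after this change,
`theta` must be a single float shared by all batches (a matrix or any other
non-float raises TypeError), `adaptive_k` is accepted only to emit a
DeprecationWarning before falling back to the fixed `knn`, and `n_jobs` and
`verbose` can be updated in place through the `set_params` method added to the
base class in graphtools/base.py. The sketch below uses the public
`graphtools.Graph` factory as exercised in the tests; the data and parameter
values are made up.

    import numpy as np
    import graphtools

    # two synthetic batches of 3-D observations (illustrative data)
    X = np.random.normal(size=(200, 3))
    sample_idx = np.repeat([0, 1], 100)

    # scalar theta is now the only accepted MNN symmetrization weight
    G = graphtools.Graph(X, sample_idx=sample_idx, graphtype='mnn',
                         knn=5, decay=10, kernel_symm='theta', theta=0.9)

    # adaptive_k is still accepted but only warns:
    # DeprecationWarning: `adaptive_k` has been deprecated. Using fixed knn.
    G = graphtools.Graph(X, sample_idx=sample_idx, graphtype='mnn',
                         knn=5, decay=10, kernel_symm='theta', theta=0.9,
                         adaptive_k='sqrt')

    # the new base-class setter updates n_jobs and verbose in place
    G.set_params(n_jobs=2, verbose=0)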