From c7fe35ecc63693ca6243e6aa0b5e7928945a1167 Mon Sep 17 00:00:00 2001
From: Matthieu Heitz <heitz.matthieu@gmail.com>
Date: Thu, 11 Mar 2021 14:48:00 -0800
Subject: [PATCH] Implement masking to control how embedded points are updated

---
 umap/layouts.py         | 307 ++++++++++++++++++++++++++++++++++++++++
 umap/parametric_umap.py |   8 +-
 umap/umap_.py           | 104 +++++++++++---
 3 files changed, 395 insertions(+), 24 deletions(-)

diff --git a/umap/layouts.py b/umap/layouts.py
index c3d79fc7..6d447627 100644
--- a/umap/layouts.py
+++ b/umap/layouts.py
@@ -181,6 +181,135 @@ def _optimize_layout_euclidean_single_epoch(
             )
 
 
+def _optimize_layout_euclidean_masked_single_epoch(
+    head_embedding,
+    tail_embedding,
+    head,
+    tail,
+    mask,
+    n_vertices,
+    epochs_per_sample,
+    a,
+    b,
+    rng_state,
+    gamma,
+    dim,
+    move_other,
+    alpha,
+    epochs_per_negative_sample,
+    epoch_of_next_negative_sample,
+    epoch_of_next_sample,
+    n,
+    densmap_flag,
+    dens_phi_sum,
+    dens_re_sum,
+    dens_re_cov,
+    dens_re_std,
+    dens_re_mean,
+    dens_lambda,
+    dens_R,
+    dens_mu,
+    dens_mu_tot,
+):
+    for i in numba.prange(epochs_per_sample.shape[0]):
+        if epoch_of_next_sample[i] <= n:
+            j = head[i]
+            k = tail[i]
+
+            current = head_embedding[j]
+            other = tail_embedding[k]
+
+            current_mask = mask[j]
+            other_mask = mask[k]
+
+            dist_squared = rdist(current, other)
+
+            if densmap_flag:
+                phi = 1.0 / (1.0 + a * pow(dist_squared, b))
+                dphi_term = (
+                    a * b * pow(dist_squared, b - 1) / (1.0 + a * pow(dist_squared, b))
+                )
+
+                q_jk = phi / dens_phi_sum[k]
+                q_kj = phi / dens_phi_sum[j]
+
+                drk = q_jk * (
+                    (1.0 - b * (1 - phi)) / np.exp(dens_re_sum[k]) + dphi_term
+                )
+                drj = q_kj * (
+                    (1.0 - b * (1 - phi)) / np.exp(dens_re_sum[j]) + dphi_term
+                )
+
+                re_std_sq = dens_re_std * dens_re_std
+                weight_k = (
+                    dens_R[k]
+                    - dens_re_cov * (dens_re_sum[k] - dens_re_mean) / re_std_sq
+                )
+                weight_j = (
+                    dens_R[j]
+                    - dens_re_cov * (dens_re_sum[j] - dens_re_mean) / re_std_sq
+                )
+
+                grad_cor_coeff = (
+                    dens_lambda
+                    * dens_mu_tot
+                    * (weight_k * drk + weight_j * drj)
+                    / (dens_mu[i] * dens_re_std)
+                    / n_vertices
+                )
+
+            if dist_squared > 0.0:
+                grad_coeff = -2.0 * a * b * pow(dist_squared, b - 1.0)
+                grad_coeff /= a * pow(dist_squared, b) + 1.0
+            else:
+                grad_coeff = 0.0
+
+            for d in range(dim):
+                grad_d = clip(grad_coeff * (current[d] - other[d]))
+
+                if densmap_flag:
+                    grad_d += clip(2 * grad_cor_coeff * (current[d] - other[d]))
+
+                current[d] += current_mask * grad_d * alpha
+                if move_other:
+                    other[d] += - other_mask * grad_d * alpha
+
+            epoch_of_next_sample[i] += epochs_per_sample[i]
+
+            n_neg_samples = int(
+                (n - epoch_of_next_negative_sample[i]) / epochs_per_negative_sample[i]
+            )
+
+            for p in range(n_neg_samples):
+                k = tau_rand_int(rng_state) % n_vertices
+
+                other = tail_embedding[k]
+
+                dist_squared = rdist(current, other)
+
+                if dist_squared > 0.0:
+                    grad_coeff = 2.0 * gamma * b
+                    grad_coeff /= (0.001 + dist_squared) * (
+                        a * pow(dist_squared, b) + 1
+                    )
+                elif j == k:
+                    continue
+                else:
+                    grad_coeff = 0.0
+
+                for d in range(dim):
+                    if grad_coeff > 0.0:
+                        grad_d = clip(grad_coeff * (current[d] - other[d]))
+                    else:
+                        grad_d = 4.0
+                    current[d] += current_mask * grad_d * alpha
+
+            epoch_of_next_negative_sample[i] += (
+                n_neg_samples * epochs_per_negative_sample[i]
+            )
+
+
+
 def _optimize_layout_euclidean_densmap_epoch_init(
     head_embedding, tail_embedding, head, tail, a, b, re_sum, phi_sum,
 ):
@@ -379,6 +508,184 @@ def optimize_layout_euclidean(
     return head_embedding
 
 
+def optimize_layout_euclidean_masked(
+    head_embedding,
+    tail_embedding,
+    head,
+    tail,
+    mask,
+    n_epochs,
+    n_vertices,
+    epochs_per_sample,
+    a,
+    b,
+    rng_state,
+    gamma=1.0,
+    initial_alpha=1.0,
+    negative_sample_rate=5.0,
+    parallel=False,
+    verbose=False,
+    densmap=False,
+    densmap_kwds={},
+):
+    """Improve an embedding using stochastic gradient descent to minimize the
+    fuzzy set cross entropy between the 1-skeletons of the high dimensional
+    and low dimensional fuzzy simplicial sets. In practice this is done by
+    sampling edges based on their membership strength (with the (1-p) terms
+    coming from negative sampling similar to word2vec).
+    Parameters
+    ----------
+    head_embedding: array of shape (n_samples, n_components)
+        The initial embedding to be improved by SGD.
+    tail_embedding: array of shape (source_samples, n_components)
+        The reference embedding of embedded points. If not embedding new
+        previously unseen points with respect to an existing embedding this
+        is simply the head_embedding (again); otherwise it provides the
+        existing embedding to embed with respect to.
+    head: array of shape (n_1_simplices)
+        The indices of the heads of 1-simplices with non-zero membership.
+    tail: array of shape (n_1_simplices)
+        The indices of the tails of 1-simplices with non-zero membership.
+    mask: array of shape (n_samples)
+        The weights (in [0,1]) assigned to each sample, defining how much they
+        should be updated. 0 means the point will not move at all, 1 means
+        they are updated normally. In-between values allow for fine-tuning.
+    n_epochs: int
+        The number of training epochs to use in optimization.
+    n_vertices: int
+        The number of vertices (0-simplices) in the dataset.
+    epochs_per_samples: array of shape (n_1_simplices)
+        A float value of the number of epochs per 1-simplex. 1-simplices with
+        weaker membership strength will have more epochs between being sampled.
+    a: float
+        Parameter of differentiable approximation of right adjoint functor
+    b: float
+        Parameter of differentiable approximation of right adjoint functor
+    rng_state: array of int64, shape (3,)
+        The internal state of the rng
+    gamma: float (optional, default 1.0)
+        Weight to apply to negative samples.
+    initial_alpha: float (optional, default 1.0)
+        Initial learning rate for the SGD.
+    negative_sample_rate: int (optional, default 5)
+        Number of negative samples to use per positive sample.
+    parallel: bool (optional, default False)
+        Whether to run the computation using numba parallel.
+        Running in parallel is non-deterministic, and is not used
+        if a random seed has been set, to ensure reproducibility.
+    verbose: bool (optional, default False)
+        Whether to report information on the current progress of the algorithm.
+    densmap: bool (optional, default False)
+        Whether to use the density-augmented densMAP objective
+    densmap_kwds: dict (optional, default {})
+        Auxiliary data for densMAP
+    Returns
+    -------
+    embedding: array of shape (n_samples, n_components)
+        The optimized embedding.
+    """
+
+    dim = head_embedding.shape[1]
+    move_other = head_embedding.shape[0] == tail_embedding.shape[0]
+    alpha = initial_alpha
+
+    epochs_per_negative_sample = epochs_per_sample / negative_sample_rate
+    epoch_of_next_negative_sample = epochs_per_negative_sample.copy()
+    epoch_of_next_sample = epochs_per_sample.copy()
+
+    optimize_fn = numba.njit(
+        _optimize_layout_euclidean_masked_single_epoch, fastmath=True, parallel=parallel
+    )
+
+    if densmap:
+        dens_init_fn = numba.njit(
+            _optimize_layout_euclidean_densmap_epoch_init,
+            fastmath=True,
+            parallel=parallel,
+        )
+
+        dens_mu_tot = np.sum(densmap_kwds["mu_sum"]) / 2
+        dens_lambda = densmap_kwds["lambda"]
+        dens_R = densmap_kwds["R"]
+        dens_mu = densmap_kwds["mu"]
+        dens_phi_sum = np.zeros(n_vertices, dtype=np.float32)
+        dens_re_sum = np.zeros(n_vertices, dtype=np.float32)
+        dens_var_shift = densmap_kwds["var_shift"]
+    else:
+        dens_mu_tot = 0
+        dens_lambda = 0
+        dens_R = np.zeros(1, dtype=np.float32)
+        dens_mu = np.zeros(1, dtype=np.float32)
+        dens_phi_sum = np.zeros(1, dtype=np.float32)
+        dens_re_sum = np.zeros(1, dtype=np.float32)
+
+    for n in range(n_epochs):
+
+        densmap_flag = (
+            densmap
+            and (densmap_kwds["lambda"] > 0)
+            and (((n + 1) / float(n_epochs)) > (1 - densmap_kwds["frac"]))
+        )
+
+        if densmap_flag:
+            dens_init_fn(
+                head_embedding,
+                tail_embedding,
+                head,
+                tail,
+                a,
+                b,
+                dens_re_sum,
+                dens_phi_sum,
+            )
+
+            dens_re_std = np.sqrt(np.var(dens_re_sum) + dens_var_shift)
+            dens_re_mean = np.mean(dens_re_sum)
+            dens_re_cov = np.dot(dens_re_sum, dens_R) / (n_vertices - 1)
+        else:
+            dens_re_std = 0
+            dens_re_mean = 0
+            dens_re_cov = 0
+
+        optimize_fn(
+            head_embedding,
+            tail_embedding,
+            head,
+            tail,
+            mask,
+            n_vertices,
+            epochs_per_sample,
+            a,
+            b,
+            rng_state,
+            gamma,
+            dim,
+            move_other,
+            alpha,
+            epochs_per_negative_sample,
+            epoch_of_next_negative_sample,
+            epoch_of_next_sample,
+            n,
+            densmap_flag,
+            dens_phi_sum,
+            dens_re_sum,
+            dens_re_cov,
+            dens_re_std,
+            dens_re_mean,
+            dens_lambda,
+            dens_R,
+            dens_mu,
+            dens_mu_tot,
+        )
+
+        alpha = initial_alpha * (1.0 - (float(n) / float(n_epochs)))
+
+        if verbose and n % int(n_epochs / 10) == 0:
+            print("\tcompleted ", n, " / ", n_epochs, "epochs")
+
+    return head_embedding
+
+
 @numba.njit(fastmath=True)
 def optimize_layout_generic(
     head_embedding,
diff --git a/umap/parametric_umap.py b/umap/parametric_umap.py
index ee0f2f88..9ffed9d0 100644
--- a/umap/parametric_umap.py
+++ b/umap/parametric_umap.py
@@ -358,7 +358,7 @@ def _compile_model(self):
             run_eagerly=self.run_eagerly,
         )
 
-    def _fit_embed_data(self, X, n_epochs, init, random_state):
+    def _fit_embed_data(self, X, n_epochs, init, random_state, pin_mask):
 
         if self.metric == "precomputed":
             X = self._X
@@ -371,6 +371,12 @@ def _fit_embed_data(self, X, n_epochs, init, random_state):
             if len(self.dims) > 1:
                 X = np.reshape(X, [len(X)] + list(self.dims))
 
+        if pin_mask is not None:
+            warn(
+                "Pinning is not yet supported by Parametric UMAP.\
+                Ignoring the pin_mask."
+            )
+
         if self.parametric_reconstruction and (np.max(X) > 1.0 or np.min(X) < 0.0):
             warn(
                 "Data should be scaled to the range 0-1 for cross-entropy reconstruction loss."
diff --git a/umap/umap_.py b/umap/umap_.py
index 6d1db932..1a64b762 100644
--- a/umap/umap_.py
+++ b/umap/umap_.py
@@ -40,6 +40,7 @@
 from umap.spectral import spectral_layout
 from umap.layouts import (
     optimize_layout_euclidean,
+    optimize_layout_euclidean_masked,
     optimize_layout_generic,
     optimize_layout_inverse,
 )
@@ -927,6 +928,7 @@ def simplicial_set_embedding(
     a,
     b,
     gamma,
+    pin_mask,
     negative_sample_rate,
     n_epochs,
     init,
@@ -972,6 +974,15 @@ def simplicial_set_embedding(
     gamma: float
         Weight to apply to negative samples.
 
+    pin_mask : array, shape (n_samples) or None
+        A mask used for pinning points in the embedding. It should be an array
+        of weights in [0,1] (one weight per point), defining how much points
+        will be updated from their initial position: 0 means the point will be
+        pinned (fixed), 1 means it will be updated normally, and in-between
+        values allow for soft-pinning. This argument is useful when providing a
+        numpy array for the initial embedding positions (``init`` parameter of
+        the ``UMAP`` class).
+
     negative_sample_rate: int (optional, default 5)
         The number of negative samples to select per positive sample
         in the optimization process. Increasing this value will result
@@ -1148,25 +1159,47 @@ def simplicial_set_embedding(
     ).astype(np.float32, order="C")
 
     if euclidean_output:
-        embedding = optimize_layout_euclidean(
-            embedding,
-            embedding,
-            head,
-            tail,
-            n_epochs,
-            n_vertices,
-            epochs_per_sample,
-            a,
-            b,
-            rng_state,
-            gamma,
-            initial_alpha,
-            negative_sample_rate,
-            parallel=parallel,
-            verbose=verbose,
-            densmap=densmap,
-            densmap_kwds=densmap_kwds,
-        )
+        if pin_mask is not None:
+            embedding = optimize_layout_euclidean_masked(
+                embedding,
+                embedding,
+                head,
+                tail,
+                pin_mask,
+                n_epochs,
+                n_vertices,
+                epochs_per_sample,
+                a,
+                b,
+                rng_state,
+                gamma,
+                initial_alpha,
+                negative_sample_rate,
+                parallel=parallel,
+                verbose=verbose,
+                densmap=densmap,
+                densmap_kwds=densmap_kwds,
+            )
+        else:
+            embedding = optimize_layout_euclidean(
+                embedding,
+                embedding,
+                head,
+                tail,
+                n_epochs,
+                n_vertices,
+                epochs_per_sample,
+                a,
+                b,
+                rng_state,
+                gamma,
+                initial_alpha,
+                negative_sample_rate,
+                parallel=parallel,
+                verbose=verbose,
+                densmap=densmap,
+                densmap_kwds=densmap_kwds,
+            )
     else:
         embedding = optimize_layout_generic(
             embedding,
@@ -1997,6 +2030,7 @@ def __mul__(self, other):
             np.mean(result._a),
             np.mean(result._b),
             np.mean(result.repulsion_strength),
+            None,
             np.mean(result.negative_sample_rate),
             n_epochs,
             init,
@@ -2066,6 +2100,7 @@ def __add__(self, other):
             np.mean(result._a),
             np.mean(result._b),
             np.mean(result.repulsion_strength),
+            None,
             np.mean(result.negative_sample_rate),
             n_epochs,
             init,
@@ -2137,6 +2172,7 @@ def __sub__(self, other):
             np.mean(result._a),
             np.mean(result._b),
             np.mean(result.repulsion_strength),
+            None,
             np.mean(result.negative_sample_rate),
             n_epochs,
             init,
@@ -2156,7 +2192,7 @@ def __sub__(self, other):
 
         return result
 
-    def fit(self, X, y=None):
+    def fit(self, X, y=None, pin_mask=None):
         """Fit X into an embedded space.
 
         Optionally use y for supervised dimension reduction.
@@ -2174,6 +2210,15 @@ def fit(self, X, y=None):
             handled is determined by parameters UMAP was instantiated with.
             The relevant attributes are ``target_metric`` and
             ``target_metric_kwds``.
+
+        pin_mask : array, shape (n_samples) or None
+            A mask used for pinning points in the embedding. It should be an array
+            of weights in [0,1] (one weight per point), defining how much points
+            will be updated from their initial position: 0 means the point will be
+            pinned (fixed), 1 means it will be updated normally, and in-between
+            values allow for soft-pinning. This argument is useful when providing a
+            numpy array for the initial embedding positions (``init`` parameter of
+            the ``UMAP`` class).
         """
 
         X = check_array(X, dtype=np.float32, accept_sparse="csr", order="C")
@@ -2584,6 +2629,7 @@ def fit(self, X, y=None):
                 self.n_epochs,
                 init,
                 random_state,  # JH why raw data?
+                pin_mask
             )
             # Assign any points that are fully disconnected from our manifold(s) to have embedding
             # coordinates of np.nan.  These will be filtered by our plotting functions automatically.
@@ -2608,7 +2654,7 @@ def fit(self, X, y=None):
 
         return self
 
-    def _fit_embed_data(self, X, n_epochs, init, random_state):
+    def _fit_embed_data(self, X, n_epochs, init, random_state, pin_mask):
         """A method wrapper for simplicial_set_embedding that can be
         replaced by subclasses.
         """
@@ -2620,6 +2666,7 @@ def _fit_embed_data(self, X, n_epochs, init, random_state):
             self._a,
             self._b,
             self.repulsion_strength,
+            pin_mask,
             self.negative_sample_rate,
             n_epochs,
             init,
@@ -2636,7 +2683,7 @@ def _fit_embed_data(self, X, n_epochs, init, random_state):
             self.verbose,
         )
 
-    def fit_transform(self, X, y=None):
+    def fit_transform(self, X, y=None, pin_mask=None):
         """Fit X into an embedded space and return that transformed
         output.
 
@@ -2652,6 +2699,15 @@ def fit_transform(self, X, y=None):
             The relevant attributes are ``target_metric`` and
             ``target_metric_kwds``.
 
+        pin_mask : array, shape (n_samples) or None
+            A mask used for pinning points in the embedding. It should be an array
+            of weights in [0,1] (one weight per point), defining how much points
+            will be updated from their initial position: 0 means the point will be
+            pinned (fixed), 1 means it will be updated normally, and in-between
+            values allow for soft-pinning. This argument is useful when providing a
+            numpy array for the initial embedding positions (``init`` parameter of
+            the ``UMAP`` class).
+
         Returns
         -------
         X_new : array, shape (n_samples, n_components)
@@ -2666,7 +2722,7 @@ def fit_transform(self, X, y=None):
         r_emb: array, shape (n_samples)
             Local radii of data points in the embedding (log-transformed).
         """
-        self.fit(X, y)
+        self.fit(X, y, pin_mask)
         if self.transform_mode == "embedding":
             if self.output_dens:
                 return self.embedding_, self.rad_orig_, self.rad_emb_
@@ -3161,6 +3217,7 @@ def update(self, X):
                 self._a,
                 self._b,
                 self.repulsion_strength,
+                None,
                 self.negative_sample_rate,
                 n_epochs,
                 init,
@@ -3226,6 +3283,7 @@ def update(self, X):
                 self._a,
                 self._b,
                 self.repulsion_strength,
+                None,
                 self.negative_sample_rate,
                 n_epochs,
                 init,