From c7fe35ecc63693ca6243e6aa0b5e7928945a1167 Mon Sep 17 00:00:00 2001
From: Matthieu Heitz
Date: Thu, 11 Mar 2021 14:48:00 -0800
Subject: [PATCH] Implement masking to control how embedded points are updated

---
Review notes (text in this section is ignored when the patch is applied):
  * layouts.py: the masked kernel reads mask[k] only when move_other is
    set; optimize_layout_euclidean_masked converts and validates ``mask``,
    no longer uses a mutable default for densmap_kwds, and no longer takes
    n modulo zero when verbose=True and n_epochs < 10.
  * parametric_umap.py: pin_mask defaults to None and the warning text no
    longer contains the indentation of its continuation line.
  * The blob ids on the ``index`` lines of these two files are those of the
    original submission and were not recomputed.

 umap/layouts.py         | 343 ++++++++++++++++++++++++++++++++++++++++
 umap/parametric_umap.py |   8 +-
 umap/umap_.py           | 104 +++++++++++---
 3 files changed, 431 insertions(+), 24 deletions(-)

diff --git a/umap/layouts.py b/umap/layouts.py
index c3d79fc7..6d447627 100644
--- a/umap/layouts.py
+++ b/umap/layouts.py
@@ -181,6 +181,146 @@ def _optimize_layout_euclidean_single_epoch(
             )
 
 
+def _optimize_layout_euclidean_masked_single_epoch(
+    head_embedding,
+    tail_embedding,
+    head,
+    tail,
+    mask,
+    n_vertices,
+    epochs_per_sample,
+    a,
+    b,
+    rng_state,
+    gamma,
+    dim,
+    move_other,
+    alpha,
+    epochs_per_negative_sample,
+    epoch_of_next_negative_sample,
+    epoch_of_next_sample,
+    n,
+    densmap_flag,
+    dens_phi_sum,
+    dens_re_sum,
+    dens_re_cov,
+    dens_re_std,
+    dens_re_mean,
+    dens_lambda,
+    dens_R,
+    dens_mu,
+    dens_mu_tot,
+):
+    """Run one SGD epoch in which each point update is scaled by ``mask``.
+
+    Every 1-simplex that is due at epoch ``n`` gets a positive-sample
+    update between its head and tail points, followed by negative-sample
+    updates of the head point against randomly drawn vertices. Each
+    displacement is multiplied by the moved point's entry in ``mask``, so
+    a weight of 0 leaves that point in place. ``head_embedding``,
+    ``tail_embedding`` (only when ``move_other`` is set) and the
+    ``epoch_of_next_*`` arrays are modified in place; nothing is returned.
+    """
+    for i in numba.prange(epochs_per_sample.shape[0]):
+        if epoch_of_next_sample[i] <= n:
+            j = head[i]
+            k = tail[i]
+
+            current = head_embedding[j]
+            other = tail_embedding[k]
+
+            # mask[k] is only read further down, under move_other: k indexes
+            # tail_embedding, and mask is only guaranteed to cover it when the
+            # head and tail embeddings have the same number of rows.
+            current_mask = mask[j]
+
+            dist_squared = rdist(current, other)
+
+            if densmap_flag:
+                phi = 1.0 / (1.0 + a * pow(dist_squared, b))
+                dphi_term = (
+                    a * b * pow(dist_squared, b - 1) / (1.0 + a * pow(dist_squared, b))
+                )
+
+                q_jk = phi / dens_phi_sum[k]
+                q_kj = phi / dens_phi_sum[j]
+
+                drk = q_jk * (
+                    (1.0 - b * (1 - phi)) / np.exp(dens_re_sum[k]) + dphi_term
+                )
+                drj = q_kj * (
+                    (1.0 - b * (1 - phi)) / np.exp(dens_re_sum[j]) + dphi_term
+                )
+
+                re_std_sq = dens_re_std * dens_re_std
+                weight_k = (
+                    dens_R[k]
+                    - dens_re_cov * (dens_re_sum[k] - dens_re_mean) / re_std_sq
+                )
+                weight_j = (
+                    dens_R[j]
+                    - dens_re_cov * (dens_re_sum[j] - dens_re_mean) / re_std_sq
+                )
+
+                grad_cor_coeff = (
+                    dens_lambda
+                    * dens_mu_tot
+                    * (weight_k * drk + weight_j * drj)
+                    / (dens_mu[i] * dens_re_std)
+                    / n_vertices
+                )
+
+            if dist_squared > 0.0:
+                grad_coeff = -2.0 * a * b * pow(dist_squared, b - 1.0)
+                grad_coeff /= a * pow(dist_squared, b) + 1.0
+            else:
+                grad_coeff = 0.0
+
+            for d in range(dim):
+                grad_d = clip(grad_coeff * (current[d] - other[d]))
+
+                if densmap_flag:
+                    grad_d += clip(2 * grad_cor_coeff * (current[d] - other[d]))
+
+                current[d] += current_mask * grad_d * alpha
+                if move_other:
+                    other[d] += -mask[k] * grad_d * alpha
+
+            epoch_of_next_sample[i] += epochs_per_sample[i]
+
+            n_neg_samples = int(
+                (n - epoch_of_next_negative_sample[i]) / epochs_per_negative_sample[i]
+            )
+
+            for p in range(n_neg_samples):
+                k = tau_rand_int(rng_state) % n_vertices
+
+                other = tail_embedding[k]
+
+                dist_squared = rdist(current, other)
+
+                if dist_squared > 0.0:
+                    grad_coeff = 2.0 * gamma * b
+                    grad_coeff /= (0.001 + dist_squared) * (
+                        a * pow(dist_squared, b) + 1
+                    )
+                elif j == k:
+                    continue
+                else:
+                    grad_coeff = 0.0
+
+                for d in range(dim):
+                    if grad_coeff > 0.0:
+                        grad_d = clip(grad_coeff * (current[d] - other[d]))
+                    else:
+                        grad_d = 4.0
+                    current[d] += current_mask * grad_d * alpha
+
+            epoch_of_next_negative_sample[i] += (
+                n_neg_samples * epochs_per_negative_sample[i]
+            )
+
+
 def _optimize_layout_euclidean_densmap_epoch_init(
     head_embedding, tail_embedding, head, tail, a, b, re_sum, phi_sum,
 ):
@@ -379,6 +519,209 @@ def optimize_layout_euclidean(
     return head_embedding
 
 
+def optimize_layout_euclidean_masked(
+    head_embedding,
+    tail_embedding,
+    head,
+    tail,
+    mask,
+    n_epochs,
+    n_vertices,
+    epochs_per_sample,
+    a,
+    b,
+    rng_state,
+    gamma=1.0,
+    initial_alpha=1.0,
+    negative_sample_rate=5.0,
+    parallel=False,
+    verbose=False,
+    densmap=False,
+    densmap_kwds=None,
+):
+    """Improve an embedding using stochastic gradient descent to minimize the
+    fuzzy set cross entropy between the 1-skeletons of the high dimensional
+    and low dimensional fuzzy simplicial sets. In practice this is done by
+    sampling edges based on their membership strength (with the (1-p) terms
+    coming from negative sampling similar to word2vec). Every displacement
+    applied to a point is scaled by that point's weight in ``mask``.
+
+    Parameters
+    ----------
+    head_embedding: array of shape (n_samples, n_components)
+        The initial embedding to be improved by SGD.
+    tail_embedding: array of shape (source_samples, n_components)
+        The reference embedding of embedded points. If not embedding new
+        previously unseen points with respect to an existing embedding this
+        is simply the head_embedding (again); otherwise it provides the
+        existing embedding to embed with respect to.
+    head: array of shape (n_1_simplices)
+        The indices of the heads of 1-simplices with non-zero membership.
+    tail: array of shape (n_1_simplices)
+        The indices of the tails of 1-simplices with non-zero membership.
+    mask: array-like of shape (n_samples)
+        The weights (in [0,1]) assigned to each sample, defining how much they
+        should be updated. 0 means the point will not move at all, 1 means
+        they are updated normally. In-between values allow for fine-tuning.
+        Converted to a float32 array before optimization.
+    n_epochs: int
+        The number of training epochs to use in optimization.
+    n_vertices: int
+        The number of vertices (0-simplices) in the dataset.
+    epochs_per_sample: array of shape (n_1_simplices)
+        A float value of the number of epochs per 1-simplex. 1-simplices with
+        weaker membership strength will have more epochs between being sampled.
+    a: float
+        Parameter of differentiable approximation of right adjoint functor
+    b: float
+        Parameter of differentiable approximation of right adjoint functor
+    rng_state: array of int64, shape (3,)
+        The internal state of the rng
+    gamma: float (optional, default 1.0)
+        Weight to apply to negative samples.
+    initial_alpha: float (optional, default 1.0)
+        Initial learning rate for the SGD.
+    negative_sample_rate: int (optional, default 5)
+        Number of negative samples to use per positive sample.
+    parallel: bool (optional, default False)
+        Whether to run the computation using numba parallel.
+        Running in parallel is non-deterministic, and is not used
+        if a random seed has been set, to ensure reproducibility.
+    verbose: bool (optional, default False)
+        Whether to report information on the current progress of the algorithm.
+    densmap: bool (optional, default False)
+        Whether to use the density-augmented densMAP objective
+    densmap_kwds: dict (optional, default None)
+        Auxiliary data for densMAP. Only read when ``densmap`` is True.
+
+    Returns
+    -------
+    embedding: array of shape (n_samples, n_components)
+        The optimized embedding.
+
+    Raises
+    ------
+    ValueError
+        If ``mask`` is not 1-D or does not hold one weight per row of
+        ``head_embedding``.
+    """
+
+    if densmap_kwds is None:
+        densmap_kwds = {}
+
+    # Give the compiled kernel a float32 array whatever the caller passed
+    # (list, bool array, ...) and reject a mask that does not cover every
+    # head point, since numba does not bounds-check indexing by default.
+    mask = np.asarray(mask, dtype=np.float32)
+    if mask.ndim != 1 or mask.shape[0] != head_embedding.shape[0]:
+        raise ValueError(
+            "mask must be a 1-D array with one weight per row of head_embedding"
+        )
+
+    dim = head_embedding.shape[1]
+    move_other = head_embedding.shape[0] == tail_embedding.shape[0]
+    alpha = initial_alpha
+
+    epochs_per_negative_sample = epochs_per_sample / negative_sample_rate
+    epoch_of_next_negative_sample = epochs_per_negative_sample.copy()
+    epoch_of_next_sample = epochs_per_sample.copy()
+
+    # At least 1, so runs of fewer than 10 epochs do not take n modulo zero.
+    report_every = max(1, n_epochs // 10)
+
+    optimize_fn = numba.njit(
+        _optimize_layout_euclidean_masked_single_epoch, fastmath=True, parallel=parallel
+    )
+
+    if densmap:
+        dens_init_fn = numba.njit(
+            _optimize_layout_euclidean_densmap_epoch_init,
+            fastmath=True,
+            parallel=parallel,
+        )
+
+        dens_mu_tot = np.sum(densmap_kwds["mu_sum"]) / 2
+        dens_lambda = densmap_kwds["lambda"]
+        dens_R = densmap_kwds["R"]
+        dens_mu = densmap_kwds["mu"]
+        dens_phi_sum = np.zeros(n_vertices, dtype=np.float32)
+        dens_re_sum = np.zeros(n_vertices, dtype=np.float32)
+        dens_var_shift = densmap_kwds["var_shift"]
+    else:
+        dens_mu_tot = 0
+        dens_lambda = 0
+        dens_R = np.zeros(1, dtype=np.float32)
+        dens_mu = np.zeros(1, dtype=np.float32)
+        dens_phi_sum = np.zeros(1, dtype=np.float32)
+        dens_re_sum = np.zeros(1, dtype=np.float32)
+
+    for n in range(n_epochs):
+
+        densmap_flag = (
+            densmap
+            and (densmap_kwds["lambda"] > 0)
+            and (((n + 1) / float(n_epochs)) > (1 - densmap_kwds["frac"]))
+        )
+
+        if densmap_flag:
+            dens_init_fn(
+                head_embedding,
+                tail_embedding,
+                head,
+                tail,
+                a,
+                b,
+                dens_re_sum,
+                dens_phi_sum,
+            )
+
+            dens_re_std = np.sqrt(np.var(dens_re_sum) + dens_var_shift)
+            dens_re_mean = np.mean(dens_re_sum)
+            dens_re_cov = np.dot(dens_re_sum, dens_R) / (n_vertices - 1)
+        else:
+            dens_re_std = 0
+            dens_re_mean = 0
+            dens_re_cov = 0
+
+        optimize_fn(
+            head_embedding,
+            tail_embedding,
+            head,
+            tail,
+            mask,
+            n_vertices,
+            epochs_per_sample,
+            a,
+            b,
+            rng_state,
+            gamma,
+            dim,
+            move_other,
+            alpha,
+            epochs_per_negative_sample,
+            epoch_of_next_negative_sample,
+            epoch_of_next_sample,
+            n,
+            densmap_flag,
+            dens_phi_sum,
+            dens_re_sum,
+            dens_re_cov,
+            dens_re_std,
+            dens_re_mean,
+            dens_lambda,
+            dens_R,
+            dens_mu,
+            dens_mu_tot,
+        )
+
+        alpha = initial_alpha * (1.0 - (float(n) / float(n_epochs)))
+
+        if verbose and n % report_every == 0:
+            print("\tcompleted ", n, " / ", n_epochs, "epochs")
+
+    return head_embedding
+
+
 @numba.njit(fastmath=True)
 def optimize_layout_generic(
     head_embedding,
diff --git a/umap/parametric_umap.py b/umap/parametric_umap.py
index ee0f2f88..9ffed9d0 100644
--- a/umap/parametric_umap.py
+++ b/umap/parametric_umap.py
@@ -358,7 +358,7 @@ def _compile_model(self):
             run_eagerly=self.run_eagerly,
         )
 
-    def _fit_embed_data(self, X, n_epochs, init, random_state):
+    def _fit_embed_data(self, X, n_epochs, init, random_state, pin_mask=None):
 
         if self.metric == "precomputed":
             X = self._X
@@ -371,6 +371,12 @@ def _fit_embed_data(self, X, n_epochs, init, random_state):
             if len(self.dims) > 1:
                 X = np.reshape(X, [len(X)] + list(self.dims))
 
+        if pin_mask is not None:
+            warn(
+                "Pinning is not yet supported by Parametric UMAP. "
+                "Ignoring the pin_mask."
+            )
+
         if self.parametric_reconstruction and (np.max(X) > 1.0 or np.min(X) < 0.0):
             warn(
                 "Data should be scaled to the range 0-1 for cross-entropy reconstruction loss."
diff --git a/umap/umap_.py b/umap/umap_.py
index 6d1db932..1a64b762 100644
--- a/umap/umap_.py
+++ b/umap/umap_.py
@@ -40,6 +40,7 @@
 from umap.spectral import spectral_layout
 from umap.layouts import (
     optimize_layout_euclidean,
+    optimize_layout_euclidean_masked,
     optimize_layout_generic,
     optimize_layout_inverse,
 )
@@ -927,6 +928,7 @@ def simplicial_set_embedding(
     a,
     b,
     gamma,
+    pin_mask,
     negative_sample_rate,
     n_epochs,
     init,
@@ -972,6 +974,15 @@ def simplicial_set_embedding(
     gamma: float
         Weight to apply to negative samples.
 
+    pin_mask : array, shape (n_samples) or None
+        A mask used for pinning points in the embedding. It should be an array
+        of weights in [0,1] (one weight per point), defining how much points
+        will be updated from their initial position: 0 means the point will be
+        pinned (fixed), 1 means it will be updated normally, and in-between
+        values allow for soft-pinning. This argument is useful when providing a
+        numpy array for the initial embedding positions (``init`` parameter of
+        the ``UMAP`` class).
+
     negative_sample_rate: int (optional, default 5)
         The number of negative samples to select per positive sample
         in the optimization process. Increasing this value will result
@@ -1148,25 +1159,47 @@ def simplicial_set_embedding(
     ).astype(np.float32, order="C")
 
     if euclidean_output:
-        embedding = optimize_layout_euclidean(
-            embedding,
-            embedding,
-            head,
-            tail,
-            n_epochs,
-            n_vertices,
-            epochs_per_sample,
-            a,
-            b,
-            rng_state,
-            gamma,
-            initial_alpha,
-            negative_sample_rate,
-            parallel=parallel,
-            verbose=verbose,
-            densmap=densmap,
-            densmap_kwds=densmap_kwds,
-        )
+        if pin_mask is not None:
+            embedding = optimize_layout_euclidean_masked(
+                embedding,
+                embedding,
+                head,
+                tail,
+                pin_mask,
+                n_epochs,
+                n_vertices,
+                epochs_per_sample,
+                a,
+                b,
+                rng_state,
+                gamma,
+                initial_alpha,
+                negative_sample_rate,
+                parallel=parallel,
+                verbose=verbose,
+                densmap=densmap,
+                densmap_kwds=densmap_kwds,
+            )
+        else:
+            embedding = optimize_layout_euclidean(
+                embedding,
+                embedding,
+                head,
+                tail,
+                n_epochs,
+                n_vertices,
+                epochs_per_sample,
+                a,
+                b,
+                rng_state,
+                gamma,
+                initial_alpha,
+                negative_sample_rate,
+                parallel=parallel,
+                verbose=verbose,
+                densmap=densmap,
+                densmap_kwds=densmap_kwds,
+            )
     else:
         embedding = optimize_layout_generic(
             embedding,
@@ -1997,6 +2030,7 @@ def __mul__(self, other):
             np.mean(result._a),
             np.mean(result._b),
             np.mean(result.repulsion_strength),
+            None,
             np.mean(result.negative_sample_rate),
             n_epochs,
             init,
@@ -2066,6 +2100,7 @@ def __add__(self, other):
             np.mean(result._a),
             np.mean(result._b),
             np.mean(result.repulsion_strength),
+            None,
             np.mean(result.negative_sample_rate),
             n_epochs,
             init,
@@ -2137,6 +2172,7 @@ def __sub__(self, other):
             np.mean(result._a),
             np.mean(result._b),
             np.mean(result.repulsion_strength),
+            None,
             np.mean(result.negative_sample_rate),
             n_epochs,
             init,
@@ -2156,7 +2192,7 @@ def __sub__(self, other):
 
         return result
 
-    def fit(self, X, y=None):
+    def fit(self, X, y=None, pin_mask=None):
         """Fit X into an embedded space.
 
         Optionally use y for supervised dimension reduction.
@@ -2174,6 +2210,15 @@ def fit(self, X, y=None):
             handled is determined by parameters UMAP was instantiated with.
             The relevant attributes are ``target_metric`` and
             ``target_metric_kwds``.
+
+        pin_mask : array, shape (n_samples) or None
+            A mask used for pinning points in the embedding. It should be an array
+            of weights in [0,1] (one weight per point), defining how much points
+            will be updated from their initial position: 0 means the point will be
+            pinned (fixed), 1 means it will be updated normally, and in-between
+            values allow for soft-pinning. This argument is useful when providing a
+            numpy array for the initial embedding positions (``init`` parameter of
+            the ``UMAP`` class).
         """
 
         X = check_array(X, dtype=np.float32, accept_sparse="csr", order="C")
@@ -2584,6 +2629,7 @@ def fit(self, X, y=None):
                 self.n_epochs,
                 init,
                 random_state,  # JH why raw data?
+                pin_mask,
             )
             # Assign any points that are fully disconnected from our manifold(s) to have embedding
             # coordinates of np.nan. These will be filtered by our plotting functions automatically.
@@ -2608,7 +2654,7 @@ def fit(self, X, y=None):
 
         return self
 
-    def _fit_embed_data(self, X, n_epochs, init, random_state):
+    def _fit_embed_data(self, X, n_epochs, init, random_state, pin_mask=None):
         """A method wrapper for simplicial_set_embedding that can be
         replaced by subclasses.
         """
@@ -2620,6 +2666,7 @@ def _fit_embed_data(self, X, n_epochs, init, random_state):
             self._a,
             self._b,
             self.repulsion_strength,
+            pin_mask,
             self.negative_sample_rate,
             n_epochs,
             init,
@@ -2636,7 +2683,7 @@ def _fit_embed_data(self, X, n_epochs, init, random_state):
             self.verbose,
         )
 
-    def fit_transform(self, X, y=None):
+    def fit_transform(self, X, y=None, pin_mask=None):
         """Fit X into an embedded space and return that transformed
         output.
 
@@ -2652,6 +2699,15 @@ def fit_transform(self, X, y=None):
             The relevant attributes are ``target_metric`` and
             ``target_metric_kwds``.
 
+        pin_mask : array, shape (n_samples) or None
+            A mask used for pinning points in the embedding. It should be an array
+            of weights in [0,1] (one weight per point), defining how much points
+            will be updated from their initial position: 0 means the point will be
+            pinned (fixed), 1 means it will be updated normally, and in-between
+            values allow for soft-pinning. This argument is useful when providing a
+            numpy array for the initial embedding positions (``init`` parameter of
+            the ``UMAP`` class).
+
         Returns
         -------
         X_new : array, shape (n_samples, n_components)
@@ -2666,7 +2722,7 @@ def fit_transform(self, X, y=None):
         r_emb: array, shape (n_samples)
             Local radii of data points in the embedding (log-transformed).
         """
-        self.fit(X, y)
+        self.fit(X, y, pin_mask=pin_mask)
         if self.transform_mode == "embedding":
             if self.output_dens:
                 return self.embedding_, self.rad_orig_, self.rad_emb_
@@ -3161,6 +3217,7 @@ def update(self, X):
                 self._a,
                 self._b,
                 self.repulsion_strength,
+                None,
                 self.negative_sample_rate,
                 n_epochs,
                 init,
@@ -3226,6 +3283,7 @@ def update(self, X):
                 self._a,
                 self._b,
                 self.repulsion_strength,
+                None,
                 self.negative_sample_rate,
                 n_epochs,
                 init,