Some kind of documentation

ge25duq · ge25duq · commit 172bf221feab · 2022-11-17T16:52:11.000+01:00
diff --git a/use_cases/hand_written_digits.py b/use_cases/hand_written_digits.py
@@ -14,10 +14,12 @@
 from datafold.pcfold.kernels import PCManifoldKernel
 
 import full_matrix
-from plot_embedding import plot_embedding
+from utils import plot_embedding
+from utils import sort_eigen_pairs
 
 # Source code taken and adapted from https://scikit-learn.org/stable/auto_examples/manifold/plot_lle_digits.html
 
+"""Input variables required for the instantiation of FullMatrix"""
 executable = "./test_gofmm"
 problem_size = 1024
 max_leaf_node_size = 512
@@ -30,19 +32,17 @@
 matrix_type = "dense"
 kernel_type = "gaussian"
 
+"""Loading the hand written digits from sklearn"""
 digits = datasets.load_digits(n_class=6)
 X = digits.data[0:problem_size,:]
 y = digits.target[0:problem_size]
 images = digits.images[0:problem_size,:,:]
-print("X",X)
-print("images",images)
-print("images shape",images.shape)
-print("target",y)
 
 X_train, X_test, y_train, y_test, images_train, images_test = train_test_split(
     X, y, images, train_size=2 / 3, test_size=1 / 3
 )
 
+"""Instantiate the point cloud data and find the manifold using DiffusionMaps"""
 X_pcm = pfold.PCManifold(X)
 X_pcm.optimize_parameters(result_scaling=2)
 
@@ -59,38 +59,22 @@
 dmap = dmap.set_target_coords([1, 2])
 X_dmap = dmap.transform(X_pcm)
 
-# Mapping of diffusion maps
-plot_embedding(
-    X_dmap,
-    y,
-    images,
-    title="Diffusion map embedding of the digits (time %.2fs)" % (time.time() - t0),
-)
-
 dmap = DiffusionMaps(
     kernel=pfold.GaussianKernel(epsilon=X_pcm.kernel.epsilon),
     n_eigenpairs=6,
     dist_kwargs=dict(cut_off=X_pcm.cut_off),
 )
 dmap = dmap.fit(X_pcm)
-plot_pairwise_eigenvector(
-    eigenvectors=dmap.eigenvectors_[:, 1:],
-    n=0,
-    idx_start=1,
-    fig_params=dict(figsize=(10, 10)),
-    scatter_params=dict(c=y),
-)
-
-plt.savefig('hr_digits_dmap.png')
 
+"""Compute the same kernel matrix with the same optimized datafold parameters, for the instantiation of FullMatrix"""
 pcm = pfold.PCManifold(X, 
                         kernel=pfold.DmapKernelFixed(internal_kernel=pfold.GaussianKernel(epsilon=378.0533464967807), is_stochastic=True, alpha=1, symmetrize_kernel=True),
                         dist_kwargs=dict(cut_off=83.45058418010026, kmin=0, backend= "guess_optimal"))
 
 kernel_output = pcm.compute_kernel_matrix()
 ( kernel_matrix, cdist_kwargs, ret_extra, ) = PCManifoldKernel.read_kernel_output(kernel_output=kernel_output)
 
-
+"""Convert the kernel matrix to dense matrix type"""
 kernel_matrix_sparse = kernel_matrix.copy()
 kernel_matrix_sparse = kernel_matrix_sparse.asfptype()
 kernel_matrix = kernel_matrix.todense()
@@ -99,11 +83,10 @@
 weights = np.ones((problem_size, num_rhs))      
 
 
+"""Instantiation of FullMatrix"""
 kernel_matrix_OP = full_matrix.FullMatrix( executable, problem_size, max_leaf_node_size,
                             num_of_neighbors, max_off_diagonal_ranks, num_rhs, user_tolerance, computation_budget,
                             distance_type, matrix_type, kernel_type, kernel_matrix, weights, dtype=np.float32 )
-print("weights shape",weights.shape)
-print("K shape",kernel_matrix.shape)
 
 n_eigenpairs = 6
 solver_kwargs = {
@@ -116,53 +99,58 @@
 }
 
 basis_change_matrix = ret_extra['basis_change_matrix']
-inv_basis_change_matrix = scipy.sparse.diags(np.reciprocal(basis_change_matrix.data.ravel()))
 
 evals_all, evecs_all = scipy.sparse.linalg.eigsh(kernel_matrix_sparse, **solver_kwargs)
 evals_large, evecs_large = scipy.sparse.linalg.eigsh(kernel_matrix_OP, **solver_kwargs)
 
-sort_scipy = np.argsort( evals_all )
-sort_scipy = sort_scipy[::-1]
-sorted_scipy_evals = evals_all[sort_scipy]
-sorted_scipy_evecs = evecs_all[:,sort_scipy]
-
-sort_gofmm = np.argsort( evals_large )
-sort_gofmm = sort_gofmm[::-1]
-sorted_gofmm_evals = evals_large[sort_gofmm]
-sorted_gofmm_evecs = evecs_large[:,sort_gofmm]
-
-sorted_gofmm_evecs = basis_change_matrix @ sorted_gofmm_evecs
-sorted_gofmm_evecs /= np.linalg.norm(sorted_gofmm_evecs, axis=0)[np.newaxis, :]
-
-sorted_scipy_evecs = basis_change_matrix @ sorted_scipy_evecs
-sorted_scipy_evecs /= np.linalg.norm(sorted_scipy_evecs, axis=0)[np.newaxis, :]
+sort_eigen_pairs( evals_all, evecs_all, basis_change_matrix )
+sort_eigen_pairs( evals_large, evecs_large, basis_change_matrix )
 
+"""Print eigen pairs and plot hand written digits, eigen vector comparisons"""
 print("eigenvalues of gofmm")
-print(sorted_gofmm_evals)
+print(evals_large)
 print("eigenvectors of gofmm sorted")
-print(sorted_gofmm_evecs)
+print(evecs_large)
 print("eigenvalues of scipy")
-print(sorted_scipy_evals)
+print(evals_all)
 print("eigenvectors of scipy")
-print(sorted_scipy_evecs)
+print(evecs_all)
 print("eigenvalues of datafold")
 print(dmap.eigenvalues_)
 print("eigenvectors of datafold")
 print(dmap.eigenvectors_)
 
 plot_pairwise_eigenvector(
-    eigenvectors=sorted_scipy_evecs[:, 1:],
+    eigenvectors=evecs_all[:, 1:],
     n=0,
     idx_start=1,
     fig_params=dict(figsize=(10, 10)),
     scatter_params=dict(c=y),
 )
 plt.savefig('hr_digits_scipy.png')
 plot_pairwise_eigenvector(
-    eigenvectors=sorted_gofmm_evecs[:, 1:],
+    eigenvectors=evecs_large[:, 1:],
     n=0,
     idx_start=1,
     fig_params=dict(figsize=(10, 10)),
     scatter_params=dict(c=y),
 )
 plt.savefig('hr_digits_gofmm.png')
+
+plot_pairwise_eigenvector(
+    eigenvectors=dmap.eigenvectors_[:, 1:],
+    n=0,
+    idx_start=1,
+    fig_params=dict(figsize=(10, 10)),
+    scatter_params=dict(c=y),
+)
+plt.savefig('hr_digits_dmap.png')
+
+plot_embedding(
+    X_dmap,
+    y,
+    images,
+    title="Diffusion map embedding of the digits (time %.2fs)" % (time.time() - t0),
+)
+
+
diff --git a/use_cases/mnist.py b/use_cases/mnist.py
@@ -20,8 +20,10 @@
 from tensorflow.keras import layers
 
 import full_matrix
-from plot_embedding import plot_embedding
+from utils import plot_embedding
+from utils import sort_eigen_pairs
 
+"""Input variables required for the instantiation of FullMatrix"""
 executable = "./test_gofmm"
 problem_size = 1024
 max_leaf_node_size = 512
@@ -34,6 +36,7 @@
 matrix_type = "dense"
 kernel_type = "gaussian"
 
+"""Loading the hand written digits from MNIST"""
 num_classes = 10
 input_shape = (28, 28, 1)
 
@@ -46,7 +49,7 @@
 X = images.reshape(problem_size, 784)
 
 
-#DATAFOLD stuff
+"""Instantiate the point cloud data and find the manifold using DiffusionMaps"""
 X_pcm = pfold.PCManifold(X)
 X_pcm.optimize_parameters(result_scaling=2)
 
@@ -63,51 +66,33 @@
 dmap = dmap.set_target_coords([1, 2])
 X_dmap = dmap.transform(X_pcm)
 
-# Mapping of diffusion maps
-plot_embedding(
-    X_dmap,
-    y,
-    images,
-    title="Diffusion map embedding of the digits (time %.2fs)" % (time.time() - t0),
-)
-
 dmap = DiffusionMaps(
     kernel=pfold.GaussianKernel(epsilon=X_pcm.kernel.epsilon),
     n_eigenpairs=6,
     dist_kwargs=dict(cut_off=X_pcm.cut_off),
 )
 dmap = dmap.fit(X_pcm)
-plot_pairwise_eigenvector(
-    eigenvectors=dmap.eigenvectors_[:, 1:],
-    n=0,
-    idx_start=1,
-    fig_params=dict(figsize=(10, 10)),
-    scatter_params=dict(c=y),
-)
-
-plt.savefig('mnist_digits_dmap.png')
 
+"""Compute the same kernel matrix with the same optimized datafold parameters, for the instantiation of FullMatrix"""
 pcm = pfold.PCManifold(X, 
                         kernel=pfold.DmapKernelFixed(internal_kernel=pfold.GaussianKernel(epsilon=24.44322087308319), is_stochastic=True, alpha=1, symmetrize_kernel=True),
                         dist_kwargs=dict(cut_off=21.219348907470703, kmin=0, backend= "guess_optimal"))
 
 kernel_output = pcm.compute_kernel_matrix()
 ( kernel_matrix, cdist_kwargs, ret_extra, ) = PCManifoldKernel.read_kernel_output(kernel_output=kernel_output)
 
-
+"""Convert the kernel matrix to dense matrix type"""
 kernel_matrix_sparse = kernel_matrix.copy()
 kernel_matrix_sparse = kernel_matrix_sparse.asfptype()
 kernel_matrix = kernel_matrix.todense()
 kernel_matrix = kernel_matrix.astype("float32")
 #kernel_matrix.tofile("KernelMatrix_32768.bin")
 weights = np.ones((problem_size, num_rhs))      
 
-#GOFMM stuff
+"""Instantiation of FullMatrix"""
 kernel_matrix_OP = full_matrix.FullMatrix( executable, problem_size, max_leaf_node_size,
                             num_of_neighbors, max_off_diagonal_ranks, num_rhs, user_tolerance, computation_budget,
                             distance_type, matrix_type, kernel_type, kernel_matrix, weights, dtype=np.float32 )
-print("weights shape",weights.shape)
-print("K shape",kernel_matrix.shape)
 
 n_eigenpairs = 6
 solver_kwargs = {
@@ -120,35 +105,22 @@
 }
 
 basis_change_matrix = ret_extra['basis_change_matrix']
-inv_basis_change_matrix = scipy.sparse.diags(np.reciprocal(basis_change_matrix.data.ravel()))
 
 evals_all, evecs_all = scipy.sparse.linalg.eigsh(kernel_matrix_sparse, **solver_kwargs)
 evals_large, evecs_large = scipy.sparse.linalg.eigsh(kernel_matrix_OP, **solver_kwargs)
 
-sort_scipy = np.argsort( evals_all )
-sort_scipy = sort_scipy[::-1]
-sorted_scipy_evals = evals_all[sort_scipy]
-sorted_scipy_evecs = evecs_all[:,sort_scipy]
-
-sort_gofmm = np.argsort( evals_large )
-sort_gofmm = sort_gofmm[::-1]
-sorted_gofmm_evals = evals_large[sort_gofmm]
-sorted_gofmm_evecs = evecs_large[:,sort_gofmm]
-
-sorted_gofmm_evecs = basis_change_matrix @ sorted_gofmm_evecs
-sorted_gofmm_evecs /= np.linalg.norm(sorted_gofmm_evecs, axis=0)[np.newaxis, :]
-
-sorted_scipy_evecs = basis_change_matrix @ sorted_scipy_evecs
-sorted_scipy_evecs /= np.linalg.norm(sorted_scipy_evecs, axis=0)[np.newaxis, :]
+sort_eigen_pairs( evals_all, evecs_all, basis_change_matrix )
+sort_eigen_pairs( evals_large, evecs_large, basis_change_matrix )
 
+"""Print eigen pairs and plot hand written digits, eigen vector comparisons"""
 print("eigenvalues of gofmm")
-print(sorted_gofmm_evals)
+print(evals_large)
 print("eigenvectors of gofmm sorted")
-print(sorted_gofmm_evecs)
+print(evecs_large)
 print("eigenvalues of scipy")
-print(sorted_scipy_evals)
+print(evals_all)
 print("eigenvectors of scipy")
-print(sorted_scipy_evecs)
+print(evecs_all)
 print("eigenvalues of datafold")
 print(dmap.eigenvalues_)
 print("eigenvectors of datafold")
@@ -170,3 +142,20 @@
     scatter_params=dict(c=y),
 )
 plt.savefig('mnist_digits_gofmm.png')
+
+plot_pairwise_eigenvector(
+    eigenvectors=dmap.eigenvectors_[:, 1:],
+    n=0,
+    idx_start=1,
+    fig_params=dict(figsize=(10, 10)),
+    scatter_params=dict(c=y),
+)
+
+plt.savefig('mnist_digits_dmap.png')
+
+plot_embedding(
+    X_dmap,
+    y,
+    images,
+    title="Diffusion map embedding of the digits (time %.2fs)" % (time.time() - t0),
+)
diff --git a/use_cases/utils.py b/use_cases/utils.py
@@ -3,7 +3,16 @@
 from matplotlib import image, offsetbox
 
 def plot_embedding(X, y, digits, title=None):
-    """Scale and visualize the embedding vectors"""
+    """Scale and visualize the embedding vectors.
+
+    X
+        Embedding vectors to scale and plot, one row per sample (e.g. X_dmap)
+    y
+        Target labels of shape ( problem_size, )
+    title
+        Title of the plot
+    
+    """
     x_min, x_max = np.min(X, 0), np.max(X, 0)
     X = (X - x_min) / (x_max - x_min)
 
@@ -37,3 +46,25 @@ def plot_embedding(X, y, digits, title=None):
     if title is not None:
         plt.title(title)
     plt.savefig("digits.png")
+
+def sort_eigen_pairs( evals, evecs, basis_change_matrix ):
+    """Sort eigenpairs in descending eigenvalue order, in place.
+
+    Both arrays are overwritten and nothing is returned. After reordering,
+    evecs is replaced by basis_change_matrix @ evecs with unit-norm columns.
+
+    evals
+        Eigen values to be sorted
+    evecs
+        Eigen vectors to be sorted, one eigen vector per column
+    basis_change_matrix
+        The changed basis obtained from datafold, left-multiplied onto evecs
+    """
+    sort_order = np.argsort( evals )
+    sort_order = sort_order[::-1]
+    evals[:] = evals[sort_order]
+    evecs[:,:] = evecs[:,sort_order]
+
+    evecs[:,:] = basis_change_matrix @ evecs
+    evecs[:,:] /= np.linalg.norm(evecs, axis=0)[np.newaxis, :]
+