Update docs, src, and readme

andyzhangstat · May 21, 2024 · 7fd5fc5 · 7fd5fc5
1 parent 9b288a9
commit 7fd5fc5
Show file tree

Hide file tree

Showing 3 changed files with 22 additions and 167 deletions.
diff --git a/README.md b/README.md
@@ -33,7 +33,7 @@ There are two major functions in this package:
 
 `sparse_logistic_svd_coord_2_way(dat, lambdas=np.logspace(-2, 2, num=10), etas=np.logspace(-2, 2, num=10), k=2, quiet=True,
                            max_iters=100, conv_crit=1e-5, randstart=False,
-                           normalize=False, start_A=None, start_B=None, start_mu=None)`: This function performs Twao-way Sparse Logistic Singular Value Decomposition (SLSVD) using Majorization-Minimization and Coordinate Descent algorithms. 
+                           normalize=False, start_A=None, start_B=None, start_mu=None)`: This function performs Two-way Sparse Logistic Singular Value Decomposition (SLSVD) using Majorization-Minimization and Coordinate Descent algorithms. 
 
 
 
@@ -44,6 +44,7 @@ There are two major functions in this package:
 - `random_seed` (integer): Random seed to ensure reproducibility.
 - `dat`: Input data matrix.
 - `lambdas`: Array of regularization parameters.
+- `etas`: Array of regularization parameters for the score matrix `A` (`lambdas` holds the corresponding penalties for the loading matrix `B`).
 - `k`: Number of components.
 - `quiet`: Boolean to suppress iteration printouts.
 - `max_iters`: Maximum number of iterations.
@@ -118,15 +119,15 @@ poetry run pytest --cov-branch --cov=SLSVD2 --cov-report html
 
 ## Usage
 
-Use this package to find the optimized score and loading matrices of sparse logistic Singular Value Decomposition. In the following example, we generate a simulated data set with defined size first. By the Majorization-Minimization and Coordinate Descent algorithms, we obtain the optimized score and loading matrices. Finally, we visualize both the simulated data and fitted loadings in one figure.
+Use this package to find the optimized score and loading matrices of the two-way sparse logistic Singular Value Decomposition. In the following example, we first generate a simulated data set of a given size. We then obtain the optimized score and loading matrices using the Majorization-Minimization and Coordinate Descent algorithms. Finally, we visualize both the simulated data and the fitted loadings in one figure.
 
 Example usage:
 
 ```python
 >>> from slsvd2.data_generation import generate_data_2_way
 >>> import numpy as np
 >>> import matplotlib.pyplot as plt
->>> bin_mat, loadings, scores, diagonal=generate_data(n=200, d=100, rank=2, random_seed=123)
+>>> bin_mat, loadings, scores, diagonal=generate_data_2_way(n=200, d=100, rank=2, random_seed=123)
 
 # Check shapes
 >>> print("Binary Matrix Shape:", bin_mat.shape)
@@ -151,8 +152,8 @@ Loadings Shape: (100, 2)
 Scores Shape: (200, 2)
 
 Dot Product of Scores:
-array([[195.4146256 ,   2.67535881],
-       [  2.67535881, 200.14653178]])
+array([[1., 0.],
+       [0., 1.]])
 
 Dot Product of Loadings:
 array([[1., 0.],
@@ -162,19 +163,20 @@ array([[1., 0.],
 
 
 ```python
->>> plt.figure(figsize=(8, 12))
->>> cmap = plt.cm.get_cmap('viridis', 2)
-
+>>> plt.figure(figsize=(6, 9)) 
+>>> colors = ['cyan', 'magenta']
+>>> cmap = plt.matplotlib.colors.ListedColormap(colors, name='custom_cmap', N=2)
 >>> plt.imshow(bin_mat, cmap=cmap, interpolation='nearest')
-
 >>> cbar = plt.colorbar(ticks=[0.25, 0.75])
 >>> cbar.ax.set_yticklabels(['0', '1'])
-
->>> plt.title('Heatmap of Binary Matrix')
+>>> plt.title('Heatmap of Simulated Binary Matrix')
 >>> plt.xlabel('Feature')
 >>> plt.ylabel('Sample')
 
+>>> plt.tight_layout()
+
 >>> plt.show()
+
 ```
 
 
@@ -187,7 +189,7 @@ array([[1., 0.],
 >>> import numpy as np
 
 >>> # Perform Sparse Logistic SVD
->>> mu, A, B, zeros, BICs = sparse_logistic_svd_coord(bin_mat, lambdas=np.logspace(-2, 1, num=10), k=2)
+>>> mu, A, B, S, zeros, BICs = sparse_logistic_svd_coord_2_way(bin_mat, lambdas=np.logspace(-2, 1, num=10), etas=np.logspace(-2, 1, num=10), k=2)
 
 >>> # Calculate mean of mu
 >>> print("Mean of mu:", np.mean(mu))
@@ -203,15 +205,15 @@ array([[1., 0.],
 
 
 ```
-Mean of mu: 0.052624279581212116
+Mean of mu: 0.07933574417007386
 
 Dot Product of Scores:
-array([[7672.61634966,  277.23466856],
-       [ 277.23466856, 3986.24113586]])
+array([[1.        , 0.02601576],
+       [0.02601576, 1.        ]])
 
 Dot Product of Loadings:
-array([[1.        , 0.00111067],
-       [0.00111067, 1.        ]])
+array([[1.        , 0.03334437],
+       [0.03334437, 1.        ]])
 
 ```
 
@@ -221,8 +223,7 @@ array([[1.        , 0.00111067],
 ## Documentation
 
 
-Online documentation is available [readthedocs](https://slsvd.readthedocs.io/en/latest/?badge=latest).
-
+Online documentation is available on [Read the Docs](https://slsvd2.readthedocs.io/en/latest/?badge=latest).
 The package is published on [TestPyPI](https://test.pypi.org/project/slsvd2/) and [PyPI](https://pypi.org/project/slsvd2/). 
 
 ## Contributors

diff --git a/docs/example.ipynb b/docs/example.ipynb
@@ -235,9 +235,8 @@
     }
    ],
    "source": [
-    "plt.figure(figsize=(6, 9))  # Adjust the width and height as needed\n",
+    "plt.figure(figsize=(6, 9)) \n",
     "\n",
-    "# Create a custom colormap using cyan and magenta\n",
     "colors = ['cyan', 'magenta']\n",
     "cmap = plt.matplotlib.colors.ListedColormap(colors, name='custom_cmap', N=2)\n",
     "\n",
@@ -250,11 +249,8 @@
     "plt.xlabel('Feature')\n",
     "plt.ylabel('Sample')\n",
     "\n",
-    "# Use tight layout to reduce white space\n",
     "plt.tight_layout()\n",
     "\n",
-    "#plt.savefig('heatmap.png', format='png', dpi=300)\n",
-    "# Show the plot\n",
     "plt.show()\n"
    ]
   },
@@ -389,7 +385,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "mu, A, B,S, zeros, BICs=sparse_logistic_svd_coord_2_way(dat=bin_mat, lambdas=thelam, etas=theeta, k=2) "
+    "mu, A, B, S, zeros, BICs=sparse_logistic_svd_coord_2_way(dat=bin_mat, lambdas=thelam, etas=theeta, k=2) "
    ]
   },
   {

diff --git a/src/slsvd2/slsvd2.py b/src/slsvd2/slsvd2.py
@@ -25,148 +25,6 @@ def inv_logit_mat(x, min_val=0, max_val=1):
 
 
 
-# def sparse_logistic_svd_coord_2_way(dat, lambdas=np.logspace(-2, 2, num=10), etas=np.logspace(-2, 2, num=10), k=2, quiet=True,
-#                                     max_iters=100, conv_crit=1e-5, randstart=False,
-#                                     normalize=False, start_A=None, start_B=None, start_mu=None):
-#     """
-#     Sparse Logistic SVD biclustering with Coordinate Descent.
-
-#     Parameters:
-#     dat : ndarray
-#         Input data matrix.
-#     lambdas : array_like, optional
-#         Array of regularization parameters.
-#     etas : array_like, optional
-#         Array of regularization parameters.
-#     k : int, optional
-#         Number of components.
-#     quiet : bool, optional
-#         If True, suppresses iteration printouts.
-#     max_iters : int, optional
-#         Maximum number of iterations.
-#     conv_crit : float, optional
-#         Convergence criterion.
-#     randstart : bool, optional
-#         If True, uses random initialization.
-#     normalize : bool, optional
-#         If True, normalizes the components.
-#     start_A : ndarray, optional
-#         Initial value for matrix A.
-#     start_B : ndarray, optional
-#         Initial value for matrix B.
-#     start_mu : ndarray, optional
-#         Initial value for mean vector.
-
-#     Returns:
-#     tuple
-#         Tuple containing mu, A, B, S, zeros_mat, BICs.
-#         - mu: The mean vector.
-#         - A: The matrix A.
-#         - B: The matrix B.
-#         - S: The k x k diagonal matrix S.
-#         - zeros_mat: Matrix holding, for each penalty setting and component, the number of nonzero entries in A and B.
-#         - BICs: Matrix containing the Bayesian Information Criterion for each penalty setting and component.
-#     """
-
-#     q = 2 * dat - 1
-#     q[np.isnan(q)] = 0
-
-#     n, d = dat.shape
-
-#     if not randstart:
-#         mu = np.mean(q)
-#         udv = svd((q - np.mean(q)).T, full_matrices=False)
-#         B = udv[0][:, :k]
-#         A = udv[2][:k, :].T
-#         S = np.diag(udv[1][:k])
-#     else:
-#         mu = np.random.randn(d)
-#         A = np.random.uniform(-1, 1, size=(n, k))
-#         B = np.random.uniform(-1, 1, size=(d, k))
-#         S = np.diag(np.ones(k))
-
-#     if start_B is not None:
-#         B = start_B / np.sqrt(np.sum(start_B**2, axis=0))
-
-#     if start_A is not None:
-#         A = start_A / np.sqrt(np.sum(start_A**2, axis=0))
-
-#     if start_mu is not None:
-#         mu = start_mu
-
-#     BICs = np.zeros((len(lambdas) * len(etas), k))
-#     zeros_mat = np.zeros((len(lambdas) * len(etas), k))
-#     iters = np.zeros((len(lambdas) * len(etas), k))
-
-#     theta = mu + (A @ S @ B.T)
-
-#     X = theta + 4 * q * (1 - inv_logit_mat(q * theta))
-#     Xcross = X - (A @ S @ B.T)
-#     mu = np.mean(Xcross)
-
-#     for m in range(k):
-#         last_A = A.copy()
-#         last_B = B.copy()
-
-#         theta = mu + (A @ S @ B.T)
-#         X = theta + 4 * q * (1 - inv_logit_mat(q * theta))
-#         Xm = X - (mu + A[:, np.arange(k) != m] @ np.diag(S[np.arange(k) != m, np.arange(k) != m]) @ B[:, np.arange(k) != m].T)
-
-#         idx = 0
-#         for lambda_val in lambdas:
-#             for eta_val in etas:
-#                 for i in range(max_iters):
-#                     if np.sum(B[:, m]**2) == 0:
-#                         A[:, m] = Xm @ B[:, m]
-#                         break
-#                     if np.sum(A[:, m]**2) == 0:
-#                         B[:, m] = Xm.T @ A[:, m]
-#                         break
-
-#                     A_lse = Xm @ B[:, m]
-#                     A[:, m] = np.sign(A_lse) * np.maximum(0, np.abs(A_lse) - eta_val)
-#                     S[m,m] = np.sqrt(np.sum(A[:, m]**2))
-#                     A[:, m] = A[:, m] / np.sqrt(np.sum(A[:, m]**2))
-
-#                     B_lse = Xm.T @ A[:, m]
-#                     B[:, m] = np.sign(B_lse) * np.maximum(0, np.abs(B_lse) - lambda_val)
-#                     S[m,m] = np.sqrt(np.sum(B[:, m]**2))
-#                     B[:, m] = B[:, m] / np.sqrt(np.sum(B[:, m]**2))
-
-#                     loglike = np.sum(np.log(inv_logit_mat(q * (mu + (A @ S @ B.T))))[~np.isnan(dat)])
-#                     penalty = 0.25 * lambda_val * np.sum(np.abs(B[:, m])) + 0.25 * eta_val * np.sum(np.abs(A[:, m]))
-#                     cur_loss = (-loglike + penalty) / np.sum(~np.isnan(dat))
-
-#                     if not quiet:
-#                         print(m, "  ", np.round(-loglike, 4), "   ", np.round(penalty, 4),
-#                               "     ", np.round(-loglike + penalty, 4))
-
-#                     if i > 4:
-#                         if (last_loss - cur_loss) / last_loss < conv_crit:
-#                             break
-
-#                     last_loss = cur_loss
-
-#                 BICs[idx, m] = -2 * loglike + np.log(n * d) * (1 + np.count_nonzero(B[:, m]) + np.count_nonzero(A[:, m]))
-#                 zeros_mat[idx, m] = np.count_nonzero(B[:, m]) + np.count_nonzero(A[:, m])
-#                 iters[idx, m] = i
-#                 idx += 1
-
-#         best_idx = np.argmin(BICs[:, m])
-#         best_lambda_idx, best_eta_idx = divmod(best_idx, len(etas))
-#         B[:, m] = B[:, best_lambda_idx * len(etas) + best_eta_idx]
-#         A[:, m] = A[:, best_lambda_idx * len(etas) + best_eta_idx]
-
-#     if normalize:
-#         A = A / np.sqrt(np.sum(A**2, axis=0))
-#         B = B / np.sqrt(np.sum(B**2, axis=0))
-
-#     return mu, A, B, S, zeros_mat, BICs
-
-
-
-
-
-
 def sparse_logistic_svd_coord_2_way(dat, lambdas=np.logspace(-2, 2, num=10), etas=np.logspace(-2, 2, num=10), k=2, quiet=True,
                                     max_iters=100, conv_crit=1e-5, randstart=False,
                                     normalize=False, start_A=None, start_B=None, start_mu=None):