Speed up tests (#56)

* not parametrizing on gpu_verbose; always testing on use_gpu=True; if TorchBackend works for ot, let's just use that backend; testing on fewer iterations wherever we can; minor refactorings for test speedup; pytest-xdist for parallel execution; correct patching of a test function; not using parallel pytest for coverage generation; removed custom runner now that tests are relatively fast; reduced iterations; decomposed tests to test dissimilarity_metric separately
raphael-group · Oct 6, 2024 · 67a751e · 67a751e
1 parent 1c229d9
commit 67a751e
Show file tree

Hide file tree

Showing 31 changed files with 5,050 additions and 24,282 deletions.
diff --git a/.github/workflows/test_pinned_deps.yml b/.github/workflows/test_pinned_deps.yml
@@ -36,15 +36,21 @@ jobs:
       - name: Install package
         # Note: editable install for the coverage step to pick up source
         # correctly. (coverage run --source=src/paste3 -m pytest)
-        run: pip install -e .
+        run: pip install -e . --no-deps
 
       - name: Pre-commit checks
         run: pre-commit run --all-files
 
-      - name: Check Versions of Dependencies
+      - name: List dependencies
         run: pip freeze
 
+      - name: Pytest
+        if: matrix.os != 'ubuntu-latest'
+        run: pytest -n auto
+
       - name: Pytest with coverage
+        if: matrix.os == 'ubuntu-latest'
+        # Note: Use of pytest -n .. (pytest-xdist) does not work with coverage
         run: coverage run --source=src/paste3 -m pytest
 
       - name: Upload coverage to Coveralls

diff --git a/.github/workflows/test_pinned_deps_gpu.yml b/.github/workflows/test_pinned_deps_gpu.yml
diff --git a/.github/workflows/test_unpinned_deps.yml b/.github/workflows/test_unpinned_deps.yml
@@ -33,8 +33,8 @@ jobs:
       - name: Install Developer Dependencies
         run: pip install .[dev]
 
-      - name: Check Versions of Dependencies
+      - name: List dependencies
         run: pip freeze
 
       - name: Pytest
-        run: pytest
+        run: pytest -n auto
diff --git a/pyproject.toml b/pyproject.toml
@@ -43,6 +43,7 @@ addopts = "-sv"
 dev = [
     "build",
     "pytest",
+    "pytest-xdist",
     "coverage",
     "coveralls",
     "ruff",

diff --git a/requirements.txt b/requirements.txt
@@ -1,24 +1,85 @@
-anndata==0.10.8
-scanpy==1.10.2
-POT==0.9.4
-numpy==1.26.4
-scipy==1.14.1
-scikit-learn==1.5.1
-IPython==8.27.0
-pytest==8.3.2
-seaborn==0.13.2
-matplotlib==3.9.2
-pandas==2.2.2
-statsmodels==0.14.2
-networkx==3.3
-sphinx==7.4.7
-myst-parser==4.0.0
-sphinx-autodoc-typehints==2.2.3
-nbsphinx==0.9.5
-sphinx-gallery==0.17.1
-sphinx-rtd-theme==2.0.0
+anndata==0.10.9
+array_api_compat==1.9
+asttokens==2.4.1
+build==1.2.2
+certifi==2024.8.30
+cfgv==3.4.0
+charset-normalizer==3.3.2
+contourpy==1.3.0
 coverage==7.6.1
 coveralls==4.0.1
-ruff==0.6.6
+cycler==0.12.1
+decorator==5.1.1
+distlib==0.3.8
+docopt==0.6.2
+execnet==2.1.1
+executing==2.1.0
+filelock==3.16.1
+fonttools==4.54.1
+fsspec==2024.9.0
+h5py==3.12.1
+identify==2.6.1
+idna==3.10
+iniconfig==2.0.0
+ipython==8.28.0
+jedi==0.19.1
+Jinja2==3.1.4
+joblib==1.4.2
+kiwisolver==1.4.7
+legacy-api-wrap==1.4
+llvmlite==0.43.0
+MarkupSafe==2.1.5
+matplotlib==3.9.2
+matplotlib-inline==0.1.7
+mpmath==1.3.0
+natsort==8.4.0
+networkx==3.3
+nodeenv==1.9.1
+numba==0.60.0
+numpy==1.26.4
+packaging==24.1
+pandas==2.2.3
+parso==0.8.4
+patsy==0.5.6
+pexpect==4.9.0
+pillow==10.4.0
+platformdirs==4.3.6
+pluggy==1.5.0
+POT==0.9.4
 pre-commit==3.8.0
+prompt_toolkit==3.0.48
+ptyprocess==0.7.0
+pure_eval==0.2.3
+Pygments==2.18.0
+pynndescent==0.5.13
+pyparsing==3.1.4
+pyproject_hooks==1.2.0
+pytest==8.3.3
+pytest-xdist==3.6.1
+python-dateutil==2.9.0.post0
+pytz==2024.2
+PyYAML==6.0.2
+requests==2.32.3
+ruff==0.6.8
+scanpy==1.10.3
+scikit-learn==1.5.2
+scipy==1.14.1
+seaborn==0.13.2
+session_info==1.0.0
+setuptools==75.1.0
+six==1.16.0
+stack-data==0.6.3
+statsmodels==0.14.4
+stdlib-list==0.10.0
+sympy==1.13.3
+threadpoolctl==3.5.0
 torch==2.4.1
+tqdm==4.66.5
+traitlets==5.14.3
+typing_extensions==4.12.2
+tzdata==2024.2
+umap-learn==0.5.6
+urllib3==2.2.3
+virtualenv==20.26.6
+wcwidth==0.2.13
+wheel==0.44.0
diff --git a/src/paste3/helper.py b/src/paste3/helper.py
@@ -50,12 +50,25 @@ def generalized_kl_divergence(X, Y):
     return np.asarray(D)
 
 
-def glmpca_distance(X, Y, latent_dim=50, filter=True, verbose=True):
+def glmpca_distance(
+    X,
+    Y,
+    latent_dim=50,
+    filter=True,
+    verbose=True,
+    maxIter=1000,
+    eps=1e-4,
+    optimizeTheta=True,
+):
     """
     param: X - np array with dim (n_samples by n_features)
     param: Y - np array with dim (m_samples by n_features)
     param: latent_dim - number of latent dimensions in glm-pca
     param: filter - whether to first select genes with highest UMI counts
+    param: verbose - whether to print glmpca progress
+    param: maxIter - maximum number of iterations for glmpca
+    param: eps - convergence threshold for glmpca
+    param: optimizeTheta - whether to optimize overdispersion in glmpca
     """
     assert X.shape[1] == Y.shape[1], "X and Y do not have the same number of features."
 
@@ -66,7 +79,13 @@ def glmpca_distance(X, Y, latent_dim=50, filter=True, verbose=True):
         joint_matrix = joint_matrix[:, top_indices]
 
     print("Starting GLM-PCA...")
-    res = glmpca(joint_matrix.T, latent_dim, penalty=1, verbose=verbose)
+    res = glmpca(
+        joint_matrix.T,
+        latent_dim,
+        penalty=1,
+        verbose=verbose,
+        ctl={"maxIter": maxIter, "eps": eps, "optimizeTheta": optimizeTheta},
+    )
     # res = glmpca(joint_matrix.T, latent_dim, fam='nb', penalty=1, verbose=True)
     reduced_joint_matrix = res["factors"]
     # print("GLM-PCA finished with joint matrix shape " + str(reduced_joint_matrix.shape))
@@ -214,3 +233,29 @@ def kl_divergence_backend(X, Y):
     X_log_X = nx.reshape(X_log_X, (1, X_log_X.shape[0]))
     D = X_log_X.T - nx.dot(X, log_Y.T)
     return nx.to_numpy(D)
+
+
+def dissimilarity_metric(which, sliceA, sliceB, A, B, **kwargs):
+    match which:
+        case "euc" | "euclidean":
+            return scipy.spatial.distance.cdist(A, B)
+        case "gkl":
+            s_A = A + 0.01
+            s_B = B + 0.01
+            M = generalized_kl_divergence(s_A, s_B)
+            M /= M[M > 0].max()
+            M *= 10
+            return M
+        case "kl":
+            s_A = A + 0.01
+            s_B = B + 0.01
+            M = kl_divergence(s_A, s_B)
+            return M
+        case "selection_kl":
+            return high_umi_gene_distance(A, B, 2000)
+        case "pca":
+            return pca_distance(sliceA, sliceB, 2000, 20)
+        case "glmpca":
+            return glmpca_distance(A, B, **kwargs)
+        case _:
+            raise RuntimeError(f"Error: Invalid dissimilarity metric {which}")
diff --git a/src/paste3/model_selection.py b/src/paste3/model_selection.py
@@ -144,95 +144,6 @@ def calculate_convex_hull_edge_inconsistency(sliceA, sliceB, pi):
     return measure_A, measure_B
 
 
-"""
-Main function.
-"""
-
-
-def select_overlap_fraction(sliceA, sliceB, alpha=0.1):
-    """
-    Estimates the overlap percentage of two ST slices.
-
-    param: sliceA - AnnData object
-    param: sliceB - AnnData object
-    param: alpha - Alignment tuning parameter. Note: 0 ≤ alpha ≤ 1
-
-    return: estimation of the overlap percentage between sliceA and sliceB
-    """
-    print("PASTE2 model selection procedure.")
-    overlap_to_check = [
-        0.99,
-        0.95,
-        0.9,
-        0.85,
-        0.8,
-        0.75,
-        0.7,
-        0.65,
-        0.6,
-        0.55,
-        0.5,
-        0.45,
-        0.4,
-        0.35,
-        0.3,
-        0.25,
-        0.2,
-        0.15,
-        0.1,
-        0.05,
-    ]
-    # subset for common genes
-    common_genes = intersect(sliceA.var.index, sliceB.var.index)
-    sliceA = sliceA[:, common_genes]
-    sliceB = sliceB[:, common_genes]
-    # Get transport cost matrix
-    A_X, B_X = (
-        to_dense_array(extract_data_matrix(sliceA, None)),
-        to_dense_array(extract_data_matrix(sliceB, None)),
-    )
-    M = glmpca_distance(A_X, B_X, latent_dim=50, filter=True, verbose=True)
-    # Get an alignment for each overlap percentage
-    m_to_pi = {}
-    for m in overlap_to_check:
-        print("Running PASTE2 with s = " + str(m) + "...")
-        pi, log = partial_pairwise_align_given_cost_matrix(
-            sliceA,
-            sliceB,
-            s=m,
-            M=M,
-            alpha=alpha,
-            armijo=False,
-            norm=True,
-            return_obj=True,
-            verbose=False,
-        )
-        m_to_pi[m] = pi
-    # Model selection based on edge inconsistency score
-    m_to_edge_inconsistency_A = []
-    m_to_edge_inconsistency_B = []
-    for m in overlap_to_check:
-        pi = m_to_pi[m]
-        sliceA_measure, sliceB_measure = calculate_convex_hull_edge_inconsistency(
-            sliceA, sliceB, pi
-        )
-        m_to_edge_inconsistency_A.append(sliceA_measure)
-        m_to_edge_inconsistency_B.append(sliceB_measure)
-
-    half_estimate_A = overlap_to_check[
-        m_to_edge_inconsistency_A.index(max(m_to_edge_inconsistency_A))
-    ]
-    half_estimate_B = overlap_to_check[
-        m_to_edge_inconsistency_B.index(max(m_to_edge_inconsistency_B))
-    ]
-
-    print(
-        "Estimation of overlap percentage is "
-        + str(min(2 * min(half_estimate_A, half_estimate_B), 1))
-    )
-    return min(2 * min(half_estimate_A, half_estimate_B), 1)
-
-
 def plot_edge_curve(m_list, source_list, target_list):
     fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))
     ax1.plot(m_list, source_list)
@@ -252,7 +163,7 @@ def plot_edge_curve(m_list, source_list, target_list):
     plt.show()
 
 
-def select_overlap_fraction_plotting(sliceA, sliceB, alpha=0.1, show_plot=True):
+def select_overlap_fraction(sliceA, sliceB, alpha=0.1, show_plot=True, numItermax=1000):
     overlap_to_check = [
         0.99,
         0.95,
@@ -284,7 +195,7 @@ def select_overlap_fraction_plotting(sliceA, sliceB, alpha=0.1, show_plot=True):
         to_dense_array(extract_data_matrix(sliceA, None)),
         to_dense_array(extract_data_matrix(sliceB, None)),
     )
-    M = glmpca_distance(A_X, B_X, latent_dim=50, filter=True)
+    M = glmpca_distance(A_X, B_X, latent_dim=50, filter=True, maxIter=numItermax)
 
     m_to_pi = {}
     for m in overlap_to_check:
@@ -299,6 +210,7 @@ def select_overlap_fraction_plotting(sliceA, sliceB, alpha=0.1, show_plot=True):
             norm=True,
             return_obj=True,
             verbose=False,
+            numItermax=numItermax,
         )
         m_to_pi[m] = pi