Skip to content

Commit

Permalink
Speed up tests (#56)
Browse files Browse the repository at this point in the history
* not parametrizing on gpu_verbose; always testing on use_gpu=True; if TorchBackend works for ot, let's just use that backend; testing on fewer iterations wherever we can; minor refactorings for test speedup; pytest-xdist for parallel execution; correct patching of a test function; not using parallel pytest for coverage generation; removed custom runner now that tests are relatively fast; reduced iterations; decomposed tests to test dissimilarity_metric separately
  • Loading branch information
vineetbansal authored Oct 6, 2024
1 parent 1c229d9 commit 67a751e
Show file tree
Hide file tree
Showing 31 changed files with 5,050 additions and 24,282 deletions.
10 changes: 8 additions & 2 deletions .github/workflows/test_pinned_deps.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,21 @@ jobs:
- name: Install package
# Note: editable install for the coverage step to pick up source
# correctly. (coverage run --source=src/paste3 -m pytest)
run: pip install -e .
run: pip install -e . --no-deps

- name: Pre-commit checks
run: pre-commit run --all-files

- name: Check Versions of Dependencies
- name: List dependencies
run: pip freeze

- name: Pytest
if: matrix.os != 'ubuntu-latest'
run: pytest -n auto

- name: Pytest with coverage
if: matrix.os == 'ubuntu-latest'
# Note: Use of pytest -n .. (pytest-xdist) does not work with coverage
run: coverage run --source=src/paste3 -m pytest

- name: Upload coverage to Coveralls
Expand Down
44 changes: 0 additions & 44 deletions .github/workflows/test_pinned_deps_gpu.yml

This file was deleted.

4 changes: 2 additions & 2 deletions .github/workflows/test_unpinned_deps.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ jobs:
- name: Install Developer Dependencies
run: pip install .[dev]

- name: Check Versions of Dependencies
- name: List dependencies
run: pip freeze

- name: Pytest
run: pytest
run: pytest -n auto
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ addopts = "-sv"
dev = [
"build",
"pytest",
"pytest-xdist",
"coverage",
"coveralls",
"ruff",
Expand Down
101 changes: 81 additions & 20 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,24 +1,85 @@
anndata==0.10.8
scanpy==1.10.2
POT==0.9.4
numpy==1.26.4
scipy==1.14.1
scikit-learn==1.5.1
IPython==8.27.0
pytest==8.3.2
seaborn==0.13.2
matplotlib==3.9.2
pandas==2.2.2
statsmodels==0.14.2
networkx==3.3
sphinx==7.4.7
myst-parser==4.0.0
sphinx-autodoc-typehints==2.2.3
nbsphinx==0.9.5
sphinx-gallery==0.17.1
sphinx-rtd-theme==2.0.0
anndata==0.10.9
array_api_compat==1.9
asttokens==2.4.1
build==1.2.2
certifi==2024.8.30
cfgv==3.4.0
charset-normalizer==3.3.2
contourpy==1.3.0
coverage==7.6.1
coveralls==4.0.1
ruff==0.6.6
cycler==0.12.1
decorator==5.1.1
distlib==0.3.8
docopt==0.6.2
execnet==2.1.1
executing==2.1.0
filelock==3.16.1
fonttools==4.54.1
fsspec==2024.9.0
h5py==3.12.1
identify==2.6.1
idna==3.10
iniconfig==2.0.0
ipython==8.28.0
jedi==0.19.1
Jinja2==3.1.4
joblib==1.4.2
kiwisolver==1.4.7
legacy-api-wrap==1.4
llvmlite==0.43.0
MarkupSafe==2.1.5
matplotlib==3.9.2
matplotlib-inline==0.1.7
mpmath==1.3.0
natsort==8.4.0
networkx==3.3
nodeenv==1.9.1
numba==0.60.0
numpy==1.26.4
packaging==24.1
pandas==2.2.3
parso==0.8.4
patsy==0.5.6
pexpect==4.9.0
pillow==10.4.0
platformdirs==4.3.6
pluggy==1.5.0
POT==0.9.4
pre-commit==3.8.0
prompt_toolkit==3.0.48
ptyprocess==0.7.0
pure_eval==0.2.3
Pygments==2.18.0
pynndescent==0.5.13
pyparsing==3.1.4
pyproject_hooks==1.2.0
pytest==8.3.3
pytest-xdist==3.6.1
python-dateutil==2.9.0.post0
pytz==2024.2
PyYAML==6.0.2
requests==2.32.3
ruff==0.6.8
scanpy==1.10.3
scikit-learn==1.5.2
scipy==1.14.1
seaborn==0.13.2
session_info==1.0.0
setuptools==75.1.0
six==1.16.0
stack-data==0.6.3
statsmodels==0.14.4
stdlib-list==0.10.0
sympy==1.13.3
threadpoolctl==3.5.0
torch==2.4.1
tqdm==4.66.5
traitlets==5.14.3
typing_extensions==4.12.2
tzdata==2024.2
umap-learn==0.5.6
urllib3==2.2.3
virtualenv==20.26.6
wcwidth==0.2.13
wheel==0.44.0
49 changes: 47 additions & 2 deletions src/paste3/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,25 @@ def generalized_kl_divergence(X, Y):
return np.asarray(D)


def glmpca_distance(X, Y, latent_dim=50, filter=True, verbose=True):
def glmpca_distance(
X,
Y,
latent_dim=50,
filter=True,
verbose=True,
maxIter=1000,
eps=1e-4,
optimizeTheta=True,
):
"""
param: X - np array with dim (n_samples by n_features)
param: Y - np array with dim (m_samples by n_features)
param: latent_dim - number of latent dimensions in glm-pca
param: filter - whether to first select genes with highest UMI counts
param: verbose - whether to print glmpca progress
param maxIter - maximum number of iterations for glmpca
param eps - convergence threshold for glmpca
param optimizeTheta - whether to optimize overdispersion in glmpca
"""
assert X.shape[1] == Y.shape[1], "X and Y do not have the same number of features."

Expand All @@ -66,7 +79,13 @@ def glmpca_distance(X, Y, latent_dim=50, filter=True, verbose=True):
joint_matrix = joint_matrix[:, top_indices]

print("Starting GLM-PCA...")
res = glmpca(joint_matrix.T, latent_dim, penalty=1, verbose=verbose)
res = glmpca(
joint_matrix.T,
latent_dim,
penalty=1,
verbose=verbose,
ctl={"maxIter": maxIter, "eps": eps, "optimizeTheta": optimizeTheta},
)
# res = glmpca(joint_matrix.T, latent_dim, fam='nb', penalty=1, verbose=True)
reduced_joint_matrix = res["factors"]
# print("GLM-PCA finished with joint matrix shape " + str(reduced_joint_matrix.shape))
Expand Down Expand Up @@ -214,3 +233,29 @@ def kl_divergence_backend(X, Y):
X_log_X = nx.reshape(X_log_X, (1, X_log_X.shape[0]))
D = X_log_X.T - nx.dot(X, log_Y.T)
return nx.to_numpy(D)


def dissimilarity_metric(which, sliceA, sliceB, A, B, **kwargs):
match which:
case "euc" | "euclidean":
return scipy.spatial.distance.cdist(A, B)
case "gkl":
s_A = A + 0.01
s_B = B + 0.01
M = generalized_kl_divergence(s_A, s_B)
M /= M[M > 0].max()
M *= 10
return M
case "kl":
s_A = A + 0.01
s_B = B + 0.01
M = kl_divergence(s_A, s_B)
return M
case "selection_kl":
return high_umi_gene_distance(A, B, 2000)
case "pca":
return pca_distance(sliceA, sliceB, 2000, 20)
case "glmpca":
return glmpca_distance(A, B, **kwargs)
case _:
raise RuntimeError(f"Error: Invalid dissimilarity metric {which}")
94 changes: 3 additions & 91 deletions src/paste3/model_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,95 +144,6 @@ def calculate_convex_hull_edge_inconsistency(sliceA, sliceB, pi):
return measure_A, measure_B


"""
Main function.
"""


def select_overlap_fraction(sliceA, sliceB, alpha=0.1):
    """
    Estimate which fraction of two ST slices overlaps.

    A GLM-PCA cost matrix is computed once on the genes shared by both
    slices. A partial alignment is then run for every candidate overlap
    value, and each alignment is scored with the convex-hull edge
    inconsistency measure. The estimate is twice the smaller of the two
    candidates at which each slice's score peaks, capped at 1.

    param: sliceA - AnnData object
    param: sliceB - AnnData object
    param: alpha - Alignment tuning parameter. Note: 0 ≤ alpha ≤ 1
    return: estimated overlap percentage between sliceA and sliceB
    """
    print("PASTE2 model selection procedure.")
    overlap_to_check = [
        0.99, 0.95, 0.9, 0.85, 0.8,
        0.75, 0.7, 0.65, 0.6, 0.55,
        0.5, 0.45, 0.4, 0.35, 0.3,
        0.25, 0.2, 0.15, 0.1, 0.05,
    ]

    # Restrict both slices to the genes they have in common.
    shared_genes = intersect(sliceA.var.index, sliceB.var.index)
    sliceA = sliceA[:, shared_genes]
    sliceB = sliceB[:, shared_genes]

    # Transport cost matrix, computed once and reused for every candidate.
    dense_A = to_dense_array(extract_data_matrix(sliceA, None))
    dense_B = to_dense_array(extract_data_matrix(sliceB, None))
    M = glmpca_distance(dense_A, dense_B, latent_dim=50, filter=True, verbose=True)

    # One partial alignment per candidate overlap value.
    alignment_for = {}
    for fraction in overlap_to_check:
        print("Running PASTE2 with s = " + str(fraction) + "...")
        # The second returned value (requested via return_obj=True) is unused.
        plan, _ = partial_pairwise_align_given_cost_matrix(
            sliceA,
            sliceB,
            s=fraction,
            M=M,
            alpha=alpha,
            armijo=False,
            norm=True,
            return_obj=True,
            verbose=False,
        )
        alignment_for[fraction] = plan

    # Edge inconsistency of each alignment, in candidate order.
    measures = [
        calculate_convex_hull_edge_inconsistency(
            sliceA, sliceB, alignment_for[fraction]
        )
        for fraction in overlap_to_check
    ]
    scores_A = [measure_A for measure_A, _ in measures]
    scores_B = [measure_B for _, measure_B in measures]

    def _candidate_at_peak(scores):
        # Candidate whose score is largest (first one on ties).
        return overlap_to_check[scores.index(max(scores))]

    half_estimate_A = _candidate_at_peak(scores_A)
    half_estimate_B = _candidate_at_peak(scores_B)

    estimate = min(2 * min(half_estimate_A, half_estimate_B), 1)
    print("Estimation of overlap percentage is " + str(estimate))
    return estimate


def plot_edge_curve(m_list, source_list, target_list):
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))
ax1.plot(m_list, source_list)
Expand All @@ -252,7 +163,7 @@ def plot_edge_curve(m_list, source_list, target_list):
plt.show()


def select_overlap_fraction_plotting(sliceA, sliceB, alpha=0.1, show_plot=True):
def select_overlap_fraction(sliceA, sliceB, alpha=0.1, show_plot=True, numItermax=1000):
overlap_to_check = [
0.99,
0.95,
Expand Down Expand Up @@ -284,7 +195,7 @@ def select_overlap_fraction_plotting(sliceA, sliceB, alpha=0.1, show_plot=True):
to_dense_array(extract_data_matrix(sliceA, None)),
to_dense_array(extract_data_matrix(sliceB, None)),
)
M = glmpca_distance(A_X, B_X, latent_dim=50, filter=True)
M = glmpca_distance(A_X, B_X, latent_dim=50, filter=True, maxIter=numItermax)

m_to_pi = {}
for m in overlap_to_check:
Expand All @@ -299,6 +210,7 @@ def select_overlap_fraction_plotting(sliceA, sliceB, alpha=0.1, show_plot=True):
norm=True,
return_obj=True,
verbose=False,
numItermax=numItermax,
)
m_to_pi[m] = pi

Expand Down
Loading

0 comments on commit 67a751e

Please sign in to comment.