Merge pull request #27 from robinsonlabuzh/join-counts

adapted python join counts
robinsonlabuzh · Feb 4, 2025 · 2fcfd19 · 2fcfd19
2 parents 2d36efd + 42f4f3f
commit 2fcfd19
Show file tree

Hide file tree

Showing 7 changed files with 498 additions and 429 deletions.
diff --git a/code/00-overview-latSOD.qmd b/code/00-overview-latSOD.qmd
@@ -592,15 +592,20 @@ There also exist methods to detect spatial autocorrelation of categorical variab
 ## `R`
 
 ```{r, eval = TRUE}
+library(BiocNeighbors)
+library(BiocSingular)
+
+set.seed(123)
 # Run PCA on the sample
-sfe <- runPCA(sfe_tissue, ncomponents = 50)
+sfe <- runPCA(sfe_tissue, exprs_values = "logcounts", ncomponents = 50, BSPARAM = IrlbaParam())
 # Cluster based on the first 10 PCs using leiden
 colData(sfe_tissue)$cluster <- clusterRows(reducedDim(sfe, "PCA")[,1:10],
                                     BLUSPARAM = KNNGraphParam(
                                       k = 20,
+                                      BNPARAM=AnnoyParam(ntrees=50),
                                       cluster.fun = "leiden",
                                       cluster.args = list(
-                                          resolution_parameter = 0.3,
+                                          resolution = 0.3,
                                           objective_function = "modularity")))
 ```
 
@@ -609,12 +614,15 @@ We can visually inspect the spatial arrangement of the clusters. We see that the
 ## `Python`
 
 ```{python}
+from sklearn_ann.kneighbors.annoy import AnnoyTransformer
+
+np.random.seed(123)
 #compute a PCA on the data
-sc.pp.pca(adata, n_comps = 50)
+sc.pp.pca(adata, n_comps = 50, zero_center = True, svd_solver = "arpack")
 #compute the neighbours
-sc.pp.neighbors(adata, n_neighbors = 20, knn = True, n_pcs = 10)
+sc.pp.neighbors(adata, use_rep = "X_pca", knn = True, n_pcs = 10, transformer=AnnoyTransformer(n_neighbors=20, n_trees=50))
 #compute leiden clustering
-sc.tl.leiden(adata, resolution = 0.3)
+sc.tl.leiden(adata, resolution = 0.3, flavor = "igraph", objective_function = "modularity")
 ```
 
 :::
@@ -635,6 +643,7 @@ plotSpatialFeature(sfe_tissue,
 ```{python}
 sq.pl.spatial_scatter(adata, color="leiden", palette=ListedColormap(cmap_discrete.colors[:len(np.unique(adata.obs["leiden"]))]), library_id="spatial", title="Clusters", shape=None, size=55)
 plt.gca().set_axis_off()
+plt.show()
 ```
 
 :::
@@ -656,14 +665,19 @@ joincount.multi(colData(sfe_tissue)$cluster,
 
 ```{python}
 sq.gr.interaction_matrix(adata, "leiden", normalized = False, connectivity_key="spatial", weights = False)
-df_join_counts_multivariate = pd.DataFrame(adata.uns["leiden_interactions"], columns=np.unique(adata.obs["leiden"]), index=np.unique(adata.obs["leiden"]))
-df_join_counts_multivariate/2
+df_interactions = pd.DataFrame(adata.uns["leiden_interactions"], columns=np.unique(adata.obs["leiden"]), index=np.unique(adata.obs["leiden"]))
+# add lower triangular matrix (w/o diagonal) to the dataframe and divide by 2
+array_join_counts = (df_interactions + np.tril(df_interactions, k = -1).T)/2
+#only print the upper triangular matrix
+np.triu(array_join_counts)
 ```
 
 ::: 
 
 For example, we can see here that the calculated number of contacts between clusters 1 and 3 is far below the expected value. On the other hand, each cluster is in contact with itself more often than expected at random.
 
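+Note that the `Python` function `sq.gr.interaction_matrix` counts the interaction for each pair twice, while the `R` function `joincount.multi` counts each interaction only once, as the interactions in `R` are undirected. Therefore, in `Python` we add the lower triangular matrix (without the diagonal) to the upper triangle and divide the resulting interaction matrix by 2.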
+
 # Summary and Considerations
 
 -   Lattice data refers to spatial data collected at fixed locations arranged in regular or irregular grids, contrasting with stochastic point pattern analysis.

diff --git a/code/06-HTS-multivar-latSOD.qmd b/code/06-HTS-multivar-latSOD.qmd
@@ -126,7 +126,7 @@ plotsize = 3
 # predefine genes
 features = ["Slc5a12", "Calr"]
 figsize = (len(features)*5, 7)
-pointsize = 30
+pointsize = 45
 ```
 
 :::

diff --git a/code/theory/05-theory-multivar-lattice.qmd b/code/theory/05-theory-multivar-lattice.qmd
@@ -423,32 +423,43 @@ First, we need to get categorical marks for each data point. We do so by running
 ### `R`
 
 ```{r}
+#| warning: false
+#| fig-width: 15
+#| fig-height: 10
+library(BiocNeighbors)
+library(BiocSingular)
+
+set.seed(123)
 # Run PCA on the sample
-sfe <- runPCA(sfe, ncomponents = 50)
+sfe <- runPCA(sfe, exprs_values = "logcounts", ncomponents = 50, BSPARAM = IrlbaParam())
 # Cluster based on the first 10 PCs using leiden
 colData(sfe)$cluster <- clusterRows(reducedDim(sfe, "PCA")[,1:10],
                                     BLUSPARAM = KNNGraphParam(
-                                        k = 20,
-                                        cluster.fun = "leiden",
-                                        cluster.args = list(
-                                            resolution_parameter = 0.3,
-                                            objective_function = "modularity")))
+                                      k = 20,
+                                      BNPARAM=AnnoyParam(ntrees=50),
+                                      cluster.fun = "leiden",
+                                      cluster.args = list(
+                                          resolution = 0.3,
+                                          objective_function = "modularity")))
 
 plotSpatialFeature(sfe,
   "cluster",
-  colGeometryName = colGeometryName, size = plotsize - 1
+  colGeometryName = colGeometryName, size = plotsize
 )
 ```
 
 ### `Python`
 
 ```{python}
+from sklearn_ann.kneighbors.annoy import AnnoyTransformer
+
+np.random.seed(123)
 #compute a PCA on the data
-sc.pp.pca(adata, n_comps = 50)
+sc.pp.pca(adata, n_comps = 50, zero_center = True, svd_solver = "arpack")
 #compute the neighbours
-sc.pp.neighbors(adata, n_neighbors = 20, knn = True, n_pcs = 10)
+sc.pp.neighbors(adata, use_rep = "X_pca", knn = True, n_pcs = 10, transformer=AnnoyTransformer(n_neighbors=20, n_trees=50))
 #compute leiden clustering
-sc.tl.leiden(adata, resolution = 0.3)
+sc.tl.leiden(adata, resolution = 0.3, flavor = "igraph", objective_function = "modularity")
 
 fig, ax = plt.subplots(1, 1, figsize=figsize, layout = "tight")
 sq.pl.spatial_scatter(
@@ -481,13 +492,16 @@ joincount.multi(colData(sfe)$cluster,
 
 ```{python, eval = TRUE}
 sq.gr.interaction_matrix(adata, "leiden", normalized = False, connectivity_key="spatial", weights = False)
-df_join_counts_multivariate = pd.DataFrame(adata.uns["leiden_interactions"], columns=np.unique(adata.obs["leiden"]), index=np.unique(adata.obs["leiden"]))
-df_join_counts_multivariate/2
+df_interactions = pd.DataFrame(adata.uns["leiden_interactions"], columns=np.unique(adata.obs["leiden"]), index=np.unique(adata.obs["leiden"]))
+# add lower triangular matrix (w/o diagonal) to the dataframe and divide by 2
+array_join_counts = (df_interactions + np.tril(df_interactions, k = -1).T)/2
+#only print the upper triangular matrix
+np.triu(array_join_counts)
 ```
 
 ::: 
 
-The `Python` function `sq.gr.interaction_matrix` counts the interaction for each pair twice, while the `R` function `joincount.multi` counts each interaction only once. Therefore, we divide the resulting interaction matrix in `Python` by 2. Since highly variable gene selection and clustering might have different implementations, the results are not perfectly corresponding, c.f. @richImpactPackageSelection2024.
+The `Python` function `sq.gr.interaction_matrix` counts the interaction for each pair twice, while the `R` function `joincount.multi` counts each interaction only once. Therefore, in `Python` we add the lower triangular matrix (without the diagonal) to the upper triangle and divide the resulting interaction matrix by 2. Since there are differences in the implementation of the principal component calculation (namely in the SVD of the sparse logcounts matrix), the results do not correspond perfectly, cf. @richImpactPackageSelection2024.
 
 The rows show different combinations of clusters that are in physical contact. E.g. $1:1$ means the cluster $1$ with itself. The column `Joincount` is the observed statistic whereas the column `Expected` is the expected value of the statistic for this combination. Like this, we can compare whether contacts among cluster combinations occur more frequently than expected at random [@cliff1981spatial].
 

diff --git a/docs/00-overview-latSOD.html b/docs/00-overview-latSOD.html
diff --git a/docs/04-imaging-multivar-latSOD.html b/docs/04-imaging-multivar-latSOD.html
diff --git a/docs/06-HTS-multivar-latSOD.html b/docs/06-HTS-multivar-latSOD.html
diff --git a/requirements.txt b/requirements.txt
@@ -4,6 +4,7 @@ aiohttp==3.11.10
 aioitertools==0.12.0
 aiosignal==1.3.1
 anndata==0.11.1
+annoy==1.17.3
 array_api_compat==1.9.1
 asciitree==0.3.3
 async-timeout==5.0.1
@@ -96,6 +97,7 @@ seaborn==0.13.2
 session-info==1.0.0
 shapely==2.0.6
 six==1.17.0
+sklearn-ann==0.1.2
 slicerator==1.1.0
 sortedcontainers==2.4.0
 soupsieve==2.6