From f56ef1d530440407802ab897635f867dea0cd88f Mon Sep 17 00:00:00 2001 From: theHumanBorch Date: Mon, 19 Feb 2024 07:32:05 -0600 Subject: [PATCH 1/2] Update pkgdown page --- NEWS.md | 2 +- R/positionalProperty.R | 2 +- README.md | 2 +- _pkgdown.yml | 3 + inst/pkgdown.yml | 3 +- vignettes/articles/Installation.Rmd | 4 +- vignettes/articles/Running_Escape.Rmd | 492 ++++++++++++++++++++++++++ 7 files changed, 502 insertions(+), 6 deletions(-) create mode 100644 vignettes/articles/Running_Escape.Rmd diff --git a/NEWS.md b/NEWS.md index 207ec27b..3f587a87 100644 --- a/NEWS.md +++ b/NEWS.md @@ -27,7 +27,7 @@ * **.theCall()** now allows for a custom header/variable and checks the colnames. * Replaced data arguments to be more descriptive: *df* is now *input.data*, *dir* is now *input*, and *sc* is now *sc.data* * Deep clean on the documentation for each function for increased consistency and explainability -* ```Startracdiversity()``` metric re-implemented to remove startrac-class object intermediary +* ```StartracDiversity()``` metric re-implemented to remove startrac-class object intermediary * Implemented powerTCR locally to reduce dependencies and continue support * Universalized underlying function language and intermediate variables * License change to MIT diff --git a/R/positionalProperty.R b/R/positionalProperty.R index d03f0cbe..0ea65bf2 100644 --- a/R/positionalProperty.R +++ b/R/positionalProperty.R @@ -67,7 +67,7 @@ positionalProperty <- function(input.data, input.data <- .groupList(input.data, group.by) } - #Selecting Diversit Function + #Selecting Property Function propertyFunc <- switch(method, "Atchley" = .af.ref, "Kidera" = .kf.ref, diff --git a/README.md b/README.md index 72efe219..b440a09b 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ devtools::install_github("ncborcherding/scRepertoire") ``` ### Installing from Bioconductor -The current version of scRepertoire is also available in the development version of Bioconductor. Important to note, the version is listed as 1.99.0 on [Bioconductor](https://bioconductor.org/packages/3.19/bioc/html/scRepertoire.html) per their version guidelines. +The current version of scRepertoire is also available in the development version of Bioconductor. Important to note, the version is listed as 1.99.1 on [Bioconductor](https://bioconductor.org/packages/3.19/bioc/html/scRepertoire.html) per their version guidelines. ```R BiocManager::install(version='devel') diff --git a/_pkgdown.yml b/_pkgdown.yml index 89a2a379..782a0aff 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -51,6 +51,8 @@ navbar: href: articles/Trex.html - text: Combining Deep Learning and BCRs with Ibex href: articles/Ibex.html + - text: Single-cell Gene Set Enrichment Analysis + href: articles/Running_Escape.html - text: News/Changelog href: news/index.html - text: Reference @@ -89,6 +91,7 @@ reference: - contents: - percentAA - positionalEntropy + - positionalProperty - percentGenes - percentKmer - percentVJ diff --git a/inst/pkgdown.yml b/inst/pkgdown.yml index f9da6cbb..1be664de 100644 --- a/inst/pkgdown.yml +++ b/inst/pkgdown.yml @@ -15,9 +15,10 @@ articles: Loading: Loading.html Processing: Processing.html Repertoire_Summary: Repertoire_Summary.html + Running_Escape: Running_Escape.html SC_Visualizations: SC_Visualizations.html Trex: Trex.html -last_built: 2024-01-22T10:54Z +last_built: 2024-02-19T12:03Z urls: reference: https://www.borch.dev/uploads/scRepertoire/reference article: https://www.borch.dev/uploads/scRepertoire/articles diff --git a/vignettes/articles/Installation.Rmd b/vignettes/articles/Installation.Rmd index cbbf9904..2d2c8a2c 100644 --- a/vignettes/articles/Installation.Rmd +++ b/vignettes/articles/Installation.Rmd @@ -55,9 +55,9 @@ remotes::install_github(repo = "ncborcherding/scRepertoire", ref = "dev") ## Bioconductor -The current version of scRepertoire is also available in the development version of Bioconductor. Important to note, the version is listed as 1.99.0 on [Bioconductor](https://bioconductor.org/packages/3.19/bioc/html/scRepertoire.html) per their version guidelines. +The current version of scRepertoire is also available in the development version of Bioconductor. Important to note, the version is listed as 1.99.1 on [Bioconductor](https://bioconductor.org/packages/3.19/bioc/html/scRepertoire.html) per their version guidelines. -```{r} +``` BiocManager::install(version='devel') BiocManager::install("scRepertoire") diff --git a/vignettes/articles/Running_Escape.Rmd b/vignettes/articles/Running_Escape.Rmd new file mode 100644 index 00000000..2d8b58e9 --- /dev/null +++ b/vignettes/articles/Running_Escape.Rmd @@ -0,0 +1,492 @@ +--- +title: "Single-cell Gene Set Enrichment Analysis" +date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' +output: rmarkdown::html_vignette +theme: united +df_print: kable +vignette: > + %\VignetteIndexEntry{Summarizing Repertoires} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + + + +```{r setup, include=FALSE} +all_times <- list() # store the time for each chunk +knitr::knit_hooks$set(time_it = local({ + now <- NULL + function(before, options) { + if (before) { + now <<- Sys.time() + } else { + res <- difftime(Sys.time(), now, units = "secs") + all_times[[options$label]] <<- res + } + } +})) +knitr::opts_chunk$set( + tidy = TRUE, + tidy.opts = list(width.cutoff = 95), + message = FALSE, + warning = FALSE, + time_it = TRUE +) + +suppressMessages(library(scRepertoire)) +suppressMessages(library(Seurat)) +suppressMessages(library(escape)) +suppressMessages(library(RColorBrewer)) +data("contig_list") +combined.TCR <- combineTCR(contig_list, + samples = c("P17B", "P17L", "P18B", "P18L", + "P19B","P19L", "P20B", "P20L")) + +scRep_example <- readRDS("scRep_example_full.rds") + +scRep_example <- combineExpression(combined.TCR, + scRep_example, + cloneCall="gene", + group.by = "sample") + +#Adding patient information +scRep_example$Patient <- substr(scRep_example$orig.ident, 1,3) + +#Adding type information +scRep_example$Type <- substr(scRep_example$orig.ident, 4,4) + +#Defining colors +colorblind_vector <- hcl.colors(n=7, palette = "inferno", fixup = TRUE) +``` + +# What is escape? + +*escape* is a R package designed for **E**asy **s**ingle-**c**ell **a**nalysis **p**latform for **e**nrichment. As the tortuous acronym implies, *escape* was designed as a user-friendly package for gene set enrichment analysis that leverages the heterogeneity of single-cell data. + +More information is available at the [GitHub Repo](https://github.com/ncborcherding/escape). + +## Citation +If using *escape*, please cite the [article](https://www.nature.com/articles/s42003-020-01625-6): Borcherding, N., Vishwakarma, A., Voigt, A.P. et al. Mapping the immune environment in clear cell renal carcinoma by single-cell genomics. Commun Biol 4, 122 (2021). https://doi.org/10.1038/s42003-020-01625-6. + +## Installing escape + +*escape* is available via github using: + +```{r eval=FALSE, tidy=FALSE} +devtools::install_github("ncborcherding/escape") +remotes::install_github("ncborcherding/escape") +``` + +For now, the newest version of *escape* is available in the Bioconductor dev version (3.19). + +```{r eval = FALSE, tidy=FALSE} +BiocManager::install(version='devel') +BiocManager::install("escape") +``` + +## Loading Data + +For the purposes of the vignette, we will use the scRepertoire full example data. More information is available [here](https://www.borch.dev/uploads/screpertoire/articles/loading#example-data-in-screpertoire). We will subset the data for patient 17 and 18 from the data set in order to speed up the calculations. + +```{r tidy = FALSE} +scRep_example <- subset(scRep_example, Patient %in% c("P17", "P18")) +``` + +# Getting Gene Sets + +## Option 1: Molecular Signature Database + +The first step in the process of performing gene set enrichment analysis is identifying the gene sets we would like to use. The function `getGeneSets()` allows users to isolate a whole or multiple libraries from a list of GSEABase GeneSetCollection objects. + +We can do this for gene set collections from the built-in [Molecular Signature Database](https://www.gsea-msigdb.org/gsea/msigdb/search.jsp) by setting the parameter **library** equal to library/libraries of interest. For multiple libraries, just set **library = c("Library1", "Library2", etc)**. + +Additional parameters include: + +* **subcategory** Can be used to select subcategories in the larger database. +* **gene.sets** Can select individual pathways/gene sets to be isolated. + +If the sequencing of the single-cell data is performed on a species other than "Homo sapiens", make sure to use the **species** parameter in `getGeneSets()` in order to get the correct gene nomenclature. + +```{r} +GS.hallmark <- getGeneSets(library = "H") +``` + +## Option 2: Built-In gene sets + +```{r, eval = FALSE, tidy=FALSE} +data("escape.gene.sets", package="escape") +gene.sets <- escape.gene.sets +``` + +## Option 3: Define personal gene sets + +```{r, eval=FALSE, tidy=FALSE} +gene.sets <- list(Bcells = c("MS4A1","CD79B","CD79A","IGH1","IGH2") + Myeloid = c("SPI1","FCER1G","CSF1R"), + Tcells = c("CD3E", "CD3D", "CD3G", "CD7","CD8A")) +``` + +# Performing Enrichment Calculation + +Several popular methods exist for Gene Set Enrichment Analysis (GSEA). These methods can vary in the underlying assumptions. *escape* incorporates several methods that are particularly advantageous for single-cell RNA values: + +### **ssGSEA** + +This method calculates the enrichment score using a rank-normalized approach and generating an empirical cumulative distribution function for each individual cell. The enrichment score is defined for a gene set (*G*) using the number of genes in the gene set (*NG*) and total number of genes (*N*). + +$$ +ES(G,S) \sum_{i = 1}^{n} [P_W^G(G,S,i)-P_{NG}(G,S,i)] +$$ +Please see the following [citation](https://pubmed.ncbi.nlm.nih.gov/19847166/) for more information. + +### **GSVA** + +GSVA varies slightly by estimating a Poisson-based kernel cumulative density function. But like ssGSEA, the ultimate enrichment score reported is based on the maximum value of the random walk statistic. GSVA appears to have better overall consistency and runs faster than ssGSEA. + +$$ +ES_{jk}^{max} = V_{jk} [max_{l=1,...,p}|v_{jk}(l)] +$$ +Please see the following [citation](https://pubmed.ncbi.nlm.nih.gov/23323831/) for more information. + +### **AUCell** + +In contrast to ssGSEA and GSVA, AUCell takes the gene rankings for each cell and step-wise plots the position of each gene in the gene set along the y-axis. The output score is the area under the curve for this approach. + +Please see the following [citation](https://pubmed.ncbi.nlm.nih.gov/28991892/) for more information. + +### UCell + +UCell calculates a Mann-Whitney U statistic based on the gene rank list. **Importantly**, UCell has a cut-off for ranked genes ($r_{max}$) at 1500 - this is per design as drop-out in single-cell can alter enrichment results. This also substantially speeds the calculations up. + +The enrichment score output is then calculated using the complement of the U statistic scaled by the gene set size and cut-off. + +$$ +U_j^` = 1-\frac{U_j}{n \bullet r_{max}} +$$ + + +Please see the following [citation](https://pubmed.ncbi.nlm.nih.gov/34285779/) for more information. + + +## escape.matrix + +escape has 2 major functions - the first being ```escape.matrix()```, which serves as the backbone of enrichment calculations. Using count-level data supplied from a single-cell object or matrix, ```escape.matrix()``` will produce an enrichment score for the individual cells with the gene sets selected and output the values as a matrix. + +**method** + +* AUCell +* GSVA +* ssGSEA +* UCell + +**groups** + +* The number of cells to calculate at once. + +**min.size** + +* The minimum size of detectable genes in a gene set. Gene sets less than the **min.size** will be removed before the calculation. + +**normalize** + +* Use the number of genes from the gene sets in each cell to normalize the enrichment scores. The default value is **FALSE**. + +**make.positive** + +* During normalization, whether to shift the enrichment values to a positive range (**TRUE**) or not (**FALSE**). The default value is **FALSE**. + +*Cautionary note:* **make.positive** was added to allow for differential analysis downstream of enrichment as some methods may produce negative values. It preserves log-fold change, but ultimately modifies the enrichment values and should be used with caution. + + +```{r tidy = FALSE, results='hide'} +enrichment.scores <- escape.matrix(scRep_example, + gene.sets = GS.hallmark, + groups = 5000, + min.size = 5) +``` + +We can visualize the raw results with a ggplot: + +```{r} +ggplot(data = as.data.frame(enrichment.scores), + mapping = aes(x = `HALLMARK-DNA-REPAIR`, `HALLMARK-G2M-CHECKPOINT`)) + + geom_point() + + theme_classic() + + theme() +``` + +## runEscape + +Alternatively, we can use ```runEscape()``` to calculate the enrichment score and directly attach the output to a single-cell object. The additional parameter for ```runEscape` is **new.assay.name**, in order to save the enrichment scores as a custom assay in the single-cell object. + + +```{r tidy = FALSE, results='hide'} +scRep_example <- runEscape(scRep_example, + method = "ssGSEA", + gene.sets = GS.hallmark, + groups = 5000, + min.size = 0, + new.assay.name = "escape.ssGSEA") +``` + +We can quickly examine the attached enrichment scores using the visualization/workflow we prefer - here we will use just `FeaturePlot()` from the Seurat R package. + +```{r tidy = FALSE} +#Define color palette +colorblind_vector <- hcl.colors(n=7, palette = "inferno", fixup = TRUE) + +FeaturePlot(scRep_example, "HALLMARK-APOPTOSIS") + + scale_color_gradientn(colors = colorblind_vector) + + theme(plot.title = element_blank()) +``` + +## performNormalization + +Although we glossed over the normalization that can be used in ```escape.matrix()``` and ```runEscape()```, it is worth mentioning here as normalization can affect all downstream analyses. + +There can be inherent bias in enrichment values due to drop out in single-cell expression data. Cells with larger numbers of features and counts will likely have higher enrichment values. ```performNormalization()``` will normalize the enrichment values by calculating the number of genes expressed in each gene set and cell. This is similar to the normalization in classic GSEA and it will be stored in a new assay. + + +```{r eval = F} +scRep_example <- performNormalization(scRep_example, + assay = "escape.ssGSEA", + gene.sets = GS.hallmark) +``` + +```performNormalization()``` has an additional parameter **make.positive**. Across the individual gene sets, if negative normalized enrichment scores are seen, the minimum value is added to all values. For example if the normalized enrichment scores (after the above accounting for drop out) ranges from -50 to 50, **make.positive** will adjust the range to 0 to 100 (by adding 50). This allows for compatible log2-fold change downstream, but can alter the enrichment score interpretation. + +**** + +# Visualizations + +There are a number of ways to look at the enrichment values downstream of ```runEscape()``` with the myriad plotting and visualizations functions/packages for single-cell data. *escape* include several additional plotting functions to assist in the analysis. + +## heatmapEnrichment + +We can examine the enrichment values across our gene sets by using ```heatmapEnrichment()```. This visualization will return the mean of the **group.by** variable. As a default - all visualizations of single-cell objects will use the cluster assignment or active identity as a default for visualizations. We will use a subset of the Hallmark Gene Sets to display for the vignette format using the **gene.set.use** set to first 12 gene sets. Alternatively, we can plot all gene sets using **gene.set.use** = "all". + +```{r} +heatmapEnrichment(scRep_example, + group.by = "ident", + gene.set.use = rownames(scRep_example@assays$escape.ssGSEA@data)[1:12], + assay = "escape.ssGSEA") +``` +Most of the visualizations in *escape* have a defined set of parameters. + +**group.by** + +* The grouping variable for the comparison. + +**facet.by** + +* Using a variable to facet the graph into separate visualizations. + +**scale** + +* **TRUE** - z-transform the enrichment values. +* **FALSE** - leave raw values (**DEFAULT**). + +In addition, ```heatmapEnrichment()``` allows for the reclustering of rows and columns using Euclidean distance of the enrichment scores and the Ward2 methods for clustering using **cluster.rows** and **cluster.columns**. + +```{r} +heatmapEnrichment(scRep_example, + group.by = "ident", + assay = "escape.ssGSEA", + gene.set.use = rownames(scRep_example@assays$escape.ssGSEA@data)[1:12], + scale = TRUE, + cluster.rows = TRUE, + cluster.columns = TRUE) +``` + +Each visualization has an additional argument called **palette that supplies the coloring scheme to be used - available color palettes can be viewed with ```hcl.pals()```. + +```{r} +hcl.pals() +``` + +```{r} +heatmapEnrichment(scRep_example, + group.by = "ident", + gene.set.use = rownames(scRep_example@assays$escape.ssGSEA@data)[1:12], + assay = "escape.ssGSEA", + palette = "Spectral") +``` + +Alternatively, we can add an additional layer to the ggplot object that is returned by the visualizations using something like ```scale_fill_gradientn()``` for continuous values or ```scale_fill_manual()``` for the categorical variables. + +```{r} +heatmapEnrichment(scRep_example, + group.by = "ident", + gene.set.use = rownames(scRep_example@assays$escape.ssGSEA@data)[1:12], + assay = "escape.ssGSEA") + + scale_fill_gradientn(colors = rev(brewer.pal(11, "RdYlBu"))) +``` + +## geyserEnrichment + +We can also focus on individual gene sets - one approach is to use ```geyserEnrichment()```. Here individual cells are plotted along the Y-axis with graphical summary where the central dot refers to the median enrichment value and the thicker/thinner lines demonstrate the interval summaries referring to the 66% and 95%. + +```{r} +geyserEnrichment(scRep_example, + assay = "escape.ssGSEA", + gene.set = "HALLMARK-INTERFERON-GAMMA-RESPONSE") +``` + +To show the additional parameters that appear in visualizations of individual enrichment gene sets - we can reorder the groups by the mean of the gene set using **order.by** = "mean". + +```{r} +geyserEnrichment(scRep_example, + assay = "escape.ssGSEA", + gene.set = "HALLMARK-INTERFERON-GAMMA-RESPONSE", + order.by = "mean") +``` + +What if we had 2 separate samples or groups within the data? Another parameter we can use is **facet.by** to allow for direct visualization of an additional variable. + +```{r} +geyserEnrichment(scRep_example, + assay = "escape.ssGSEA", + gene.set = "HALLMARK-INTERFERON-GAMMA-RESPONSE", + facet.by = "Type") +``` +Lastly, we can select the way the color is applied to the plot using the **color.by** parameter. Here we can set it to the gene set of interest *"HALLMARK-INTERFERON-GAMMA-RESPONSE"*. + +```{r} +geyserEnrichment(scRep_example, + assay = "escape.ssGSEA", + gene.set = "HALLMARK-INTERFERON-GAMMA-RESPONSE", + color.by = "HALLMARK-INTERFERON-GAMMA-RESPONSE") +``` + +## ridgeEnrichment + +Similar to the ```geyserEnrichment()``` the ```ridgeEnrichment()``` can display the distribution of enrichment values across the selected gene set. The central line is at the median value for the respective grouping. + +```{r} +ridgeEnrichment(scRep_example, + assay = "escape.ssGSEA", + gene.set = "HALLMARK-IL2-STAT5-SIGNALING") +``` +We can get the relative position of individual cells along the x-axis using the **add.rug** parameter. + +```{r} +ridgeEnrichment(scRep_example, + assay = "escape.ssGSEA", + gene.set = "HALLMARK-IL2-STAT5-SIGNALING", + add.rug = TRUE, + scale = TRUE) +``` + +## splitEnrichment + +Another distribution visualization is a violin plot, which we separate and directly compare using a binary classification. Like ```ridgeEnrichment()```, this allows for greater use of categorical variables. For ```splitEnrichment()```, the output will be two halves of a violin plot based on the **split.by** parameter with a central boxplot with the relative distribution across all samples. Here, we split the samples by *Type* where **B** refers to the peripheral blood and **L** refers to lung. + +```{r} +splitEnrichment(scRep_example, + assay = "escape.ssGSEA", + gene.set = "HALLMARK-IL2-STAT5-SIGNALING", + split.by = "Type") +``` + +## densityEnrichment + +```densityEnrichment()``` is a method to visualize the mean rank position of the gene set features along the total feature space by group. This is similar to traditional GSEA analysis, but is not calculating the walk-based enrichment score. As this function calculates mean ranks on the fly, it may take some time depending on the size of the single-cell object. + +**gene.set.use** + +* The selected gene set to visualize + +**gene.sets** + +* The gene set library from either of the 3 options in the first section of the vignette. + +```{r} +densityEnrichment(scRep_example, + gene.set.use = "HALLMARK-IL6-JAK-STAT3-SIGNALING", + gene.sets = GS.hallmark) +``` + +## scatterEnrichment + +It may be advantageous to look at the distribution of multiple gene sets - here we can use ```scatterEnrichment()``` for a 2 gene set comparison. The color values are based on the density of points determined by the number of neighbors, similar to the [Nebulosa R package](https://www.bioconductor.org/packages/release/bioc/html/Nebulosa.html). We just need to define which gene set to plot on the **x.axis** and which to plot on the **y.axis**. + + +```{r} +scatterEnrichment(scRep_example, + assay = "escape.ssGSEA", + x.axis = "HALLMARK-INTERFERON-GAMMA-RESPONSE", + y.axis = "HALLMARK-IL6-JAK-STAT3-SIGNALING") +``` + +The scatter plot can also be converted into a hexbin, another method for summarizing the individual cell distributions along the x and y axis, by setting **style** = "hex". + +```{r} +scatterEnrichment(scRep_example, + assay = "escape.ssGSEA", + x.axis = "HALLMARK-INTERFERON-GAMMA-RESPONSE", + y.axis = "HALLMARK-IL6-JAK-STAT3-SIGNALING", + style = "hex") +``` + +**** + +# Statistical Analysis + +## Principal Component Analysis (PCA) + +escape has its own PCA function ```performPCA()``` which will work on a single-cell object or a matrix of enrichment values. This is specifically useful for downstream visualizations as it stores the eigenvalues and rotations. If we want to look at the relative contribution to overall variance of each component or a Biplot-like overlay of the individual features, use ```performPCA()```. + +Alternatively, other PCA-based functions like Seurat's ```RunPCA()``` or scater's ```runPCA()` can be used. These functions are likely faster and would be ideal if we have a larger number of cells and/or gene sets. + +```{r} +scRep_example <- performPCA(scRep_example, + assay = "escape.ssGSEA", + n.dim = 1:10) +``` + +*escape* has a built in method for plotting PCA ```pcaEnrichment()``` that functions similarly to the ```scatterEnrichment()``` function where **x.axis** and **y.axis** are the components to plot. + +```{r} +pcaEnrichment(scRep_example, + dimRed = "escape.PCA", + x.axis = "PC1", + y.axis = "PC2") +``` + +```pcaEnrichment()``` can plot additional information on the principal component analysis. + +**add.percent.contribution** will add the relative percent contribution of the x and y.axis to total variability observed in the PCA. + +**display.factors** will overlay the magnitude and direction that the features/gene sets contribute to the selected components. The number of gene sets is determined by **number.of.factors**. This can assist in understanding the underlying differences in enrichment across different cells. + +```{r} +pcaEnrichment(scRep_example, + dimRed = "escape.PCA", + x.axis = "PC1", + y.axis = "PC2", + add.percent.contribution = TRUE, + display.factors = TRUE, + number.of.factors = 10) +``` + +## Differential Enrichment + +Differential enrichment analysis can be performed similar to differential gene expression analysis. For the purposes of finding the differential enrichment values, we can first normalize the enrichment values for the ssGSEA calculations. + +```{r} +scRep_example <- performNormalization(scRep_example, + assay = "escape.ssGSEA", + gene.sets = GS.hallmark, + make.positive = FALSE) + +all.markers <- FindAllMarkers(scRep_example, + assay = "escape.ssGSEA_normalized", + min.pct = 0, + logfc.threshold = 0) + +head(all.markers) +``` \ No newline at end of file From 427f525bb1b8042eec7b769523597bdd10281642 Mon Sep 17 00:00:00 2001 From: theHumanBorch Date: Sat, 24 Feb 2024 17:39:07 +0100 Subject: [PATCH 2/2] Updating vignette articles --- inst/pkgdown.yml | 2 +- vignettes/articles/Attaching_SC.Rmd | 2 - vignettes/articles/Clonal_Cluster.Rmd | 4 +- vignettes/articles/Ibex.Rmd | 2 +- vignettes/articles/Running_Escape.Rmd | 64 ++++++++++++++---------- vignettes/articles/SC_Visualizations.Rmd | 1 - vignettes/articles/Trex.Rmd | 2 +- 7 files changed, 41 insertions(+), 36 deletions(-) diff --git a/inst/pkgdown.yml b/inst/pkgdown.yml index 1be664de..16a3ebd6 100644 --- a/inst/pkgdown.yml +++ b/inst/pkgdown.yml @@ -18,7 +18,7 @@ articles: Running_Escape: Running_Escape.html SC_Visualizations: SC_Visualizations.html Trex: Trex.html -last_built: 2024-02-19T12:03Z +last_built: 2024-02-24T16:17Z urls: reference: https://www.borch.dev/uploads/scRepertoire/reference article: https://www.borch.dev/uploads/scRepertoire/articles diff --git a/vignettes/articles/Attaching_SC.Rmd b/vignettes/articles/Attaching_SC.Rmd index d928852e..b840c3ed 100644 --- a/vignettes/articles/Attaching_SC.Rmd +++ b/vignettes/articles/Attaching_SC.Rmd @@ -45,8 +45,6 @@ data("contig_list") combined.TCR <- combineTCR(contig_list, samples = c("P17B", "P17L", "P18B", "P18L", "P19B","P19L", "P20B", "P20L")) - -googledrive::drive_download("https://drive.google.com/file/d/1_YuRraDyg8UgF3oasjF0-jgPnwox-B24/view?usp=share_link", overwrite = TRUE) ``` The data in the scRepertoire package is derived from a [study](https://pubmed.ncbi.nlm.nih.gov/33622974/) of acute respiratory stress disorder in the context of bacterial and COVID-19 infections. The internal single cell data (`scRep_example()`) built in to scRepertoire is randomly sampled 500 cells from the fully integrated Seurat object to minimize the package size. However, for the purpose of the vignette we will use the full single-cell object with 30,000 cells. We will use both Seurat and Single-Cell Experiment (SCE) with scater to perform further visualizations in tandem. diff --git a/vignettes/articles/Clonal_Cluster.Rmd b/vignettes/articles/Clonal_Cluster.Rmd index 319577a8..93c2fa3f 100644 --- a/vignettes/articles/Clonal_Cluster.Rmd +++ b/vignettes/articles/Clonal_Cluster.Rmd @@ -42,9 +42,7 @@ suppressMessages(library(Seurat)) data("contig_list") combined.TCR <- combineTCR(contig_list, samples = c("P17B", "P17L", "P18B", "P18L", - "P19B","P19L", "P20B", "P20L")) - -googledrive::drive_download("https://drive.google.com/file/d/1_YuRraDyg8UgF3oasjF0-jgPnwox-B24/view?usp=share_link", overwrite = TRUE) + "P19B","P19L", "P20B", "P20L")) scRep_example <- readRDS("scRep_example_full.rds") diff --git a/vignettes/articles/Ibex.Rmd b/vignettes/articles/Ibex.Rmd index b64d8811..04608087 100644 --- a/vignettes/articles/Ibex.Rmd +++ b/vignettes/articles/Ibex.Rmd @@ -37,7 +37,7 @@ knitr::opts_chunk$set( time_it = TRUE ) suppressMessages(library(reticulate)) -use_condaenv(condaenv = "r-reticulate", required = TRUE) +#use_condaenv(condaenv = "r-reticulate", required = TRUE) suppressMessages(library(Ibex)) suppressMessages(library(Seurat)) suppressMessages(library(ggplot2)) diff --git a/vignettes/articles/Running_Escape.Rmd b/vignettes/articles/Running_Escape.Rmd index 2d8b58e9..368da6c4 100644 --- a/vignettes/articles/Running_Escape.Rmd +++ b/vignettes/articles/Running_Escape.Rmd @@ -5,7 +5,7 @@ output: rmarkdown::html_vignette theme: united df_print: kable vignette: > - %\VignetteIndexEntry{Summarizing Repertoires} + %\VignetteIndexEntry{Single-cell Gene Set Enrichment Analysis} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- @@ -211,7 +211,7 @@ enrichment.scores <- escape.matrix(scRep_example, We can visualize the raw results with a ggplot: -```{r} +```{r tidy=FALSE} ggplot(data = as.data.frame(enrichment.scores), mapping = aes(x = `HALLMARK-DNA-REPAIR`, `HALLMARK-G2M-CHECKPOINT`)) + geom_point() + @@ -251,12 +251,21 @@ Although we glossed over the normalization that can be used in ```escape.matrix( There can be inherent bias in enrichment values due to drop out in single-cell expression data. Cells with larger numbers of features and counts will likely have higher enrichment values. ```performNormalization()``` will normalize the enrichment values by calculating the number of genes expressed in each gene set and cell. This is similar to the normalization in classic GSEA and it will be stored in a new assay. -```{r eval = F} +```{r eval = F, tidy=FALSE} scRep_example <- performNormalization(scRep_example, assay = "escape.ssGSEA", gene.sets = GS.hallmark) ``` +An alternative for scaling by expressed gene sets would be to use a scaling factor previously calculated during normal single-cell data processing and quality control. This can be done using the **scale.factor** argument and providing a vector. This approach may be particularly advantageous for large data sets. + +```{r eval = FALSE, tidy = FALSE} +scRep_example <- performNormalization(scRep_example, + assay = "escape.ssGSEA", + gene.sets = GS.hallmark, + scale.factor = scRep_example$nFeature_RNA) +``` + ```performNormalization()``` has an additional parameter **make.positive**. Across the individual gene sets, if negative normalized enrichment scores are seen, the minimum value is added to all values. For example if the normalized enrichment scores (after the above accounting for drop out) ranges from -50 to 50, **make.positive** will adjust the range to 0 to 100 (by adding 50). This allows for compatible log2-fold change downstream, but can alter the enrichment score interpretation. **** @@ -269,7 +278,7 @@ There are a number of ways to look at the enrichment values downstream of ```run We can examine the enrichment values across our gene sets by using ```heatmapEnrichment()```. This visualization will return the mean of the **group.by** variable. As a default - all visualizations of single-cell objects will use the cluster assignment or active identity as a default for visualizations. We will use a subset of the Hallmark Gene Sets to display for the vignette format using the **gene.set.use** set to first 12 gene sets. Alternatively, we can plot all gene sets using **gene.set.use** = "all". -```{r} +```{r tidy=FALSE} heatmapEnrichment(scRep_example, group.by = "ident", gene.set.use = rownames(scRep_example@assays$escape.ssGSEA@data)[1:12], @@ -292,7 +301,7 @@ Most of the visualizations in *escape* have a defined set of parameters. In addition, ```heatmapEnrichment()``` allows for the reclustering of rows and columns using Euclidean distance of the enrichment scores and the Ward2 methods for clustering using **cluster.rows** and **cluster.columns**. -```{r} +```{r tidy=FALSE} heatmapEnrichment(scRep_example, group.by = "ident", assay = "escape.ssGSEA", @@ -308,7 +317,7 @@ Each visualization has an additional argument called **palette that supplies the hcl.pals() ``` -```{r} +```{r tidy=FALSE} heatmapEnrichment(scRep_example, group.by = "ident", gene.set.use = rownames(scRep_example@assays$escape.ssGSEA@data)[1:12], @@ -318,7 +327,7 @@ heatmapEnrichment(scRep_example, Alternatively, we can add an additional layer to the ggplot object that is returned by the visualizations using something like ```scale_fill_gradientn()``` for continuous values or ```scale_fill_manual()``` for the categorical variables. -```{r} +```{r tidy=FALSE} heatmapEnrichment(scRep_example, group.by = "ident", gene.set.use = rownames(scRep_example@assays$escape.ssGSEA@data)[1:12], @@ -330,7 +339,7 @@ heatmapEnrichment(scRep_example, We can also focus on individual gene sets - one approach is to use ```geyserEnrichment()```. Here individual cells are plotted along the Y-axis with graphical summary where the central dot refers to the median enrichment value and the thicker/thinner lines demonstrate the interval summaries referring to the 66% and 95%. -```{r} +```{r tidy=FALSE} geyserEnrichment(scRep_example, assay = "escape.ssGSEA", gene.set = "HALLMARK-INTERFERON-GAMMA-RESPONSE") @@ -338,7 +347,7 @@ geyserEnrichment(scRep_example, To show the additional parameters that appear in visualizations of individual enrichment gene sets - we can reorder the groups by the mean of the gene set using **order.by** = "mean". -```{r} +```{r tidy=FALSE} geyserEnrichment(scRep_example, assay = "escape.ssGSEA", gene.set = "HALLMARK-INTERFERON-GAMMA-RESPONSE", @@ -347,7 +356,7 @@ geyserEnrichment(scRep_example, What if we had 2 separate samples or groups within the data? Another parameter we can use is **facet.by** to allow for direct visualization of an additional variable. -```{r} +```{r tidy=FALSE} geyserEnrichment(scRep_example, assay = "escape.ssGSEA", gene.set = "HALLMARK-INTERFERON-GAMMA-RESPONSE", @@ -355,7 +364,7 @@ geyserEnrichment(scRep_example, ``` Lastly, we can select the way the color is applied to the plot using the **color.by** parameter. Here we can set it to the gene set of interest *"HALLMARK-INTERFERON-GAMMA-RESPONSE"*. -```{r} +```{r tidy=FALSE} geyserEnrichment(scRep_example, assay = "escape.ssGSEA", gene.set = "HALLMARK-INTERFERON-GAMMA-RESPONSE", @@ -366,14 +375,14 @@ geyserEnrichment(scRep_example, Similar to the ```geyserEnrichment()``` the ```ridgeEnrichment()``` can display the distribution of enrichment values across the selected gene set. The central line is at the median value for the respective grouping. -```{r} +```{r tidy=FALSE} ridgeEnrichment(scRep_example, assay = "escape.ssGSEA", gene.set = "HALLMARK-IL2-STAT5-SIGNALING") ``` We can get the relative position of individual cells along the x-axis using the **add.rug** parameter. -```{r} +```{r tidy=FALSE} ridgeEnrichment(scRep_example, assay = "escape.ssGSEA", gene.set = "HALLMARK-IL2-STAT5-SIGNALING", @@ -385,7 +394,7 @@ ridgeEnrichment(scRep_example, Another distribution visualization is a violin plot, which we separate and directly compare using a binary classification. Like ```ridgeEnrichment()```, this allows for greater use of categorical variables. For ```splitEnrichment()```, the output will be two halves of a violin plot based on the **split.by** parameter with a central boxplot with the relative distribution across all samples. Here, we split the samples by *Type* where **B** refers to the peripheral blood and **L** refers to lung. -```{r} +```{r tidy=FALSE} splitEnrichment(scRep_example, assay = "escape.ssGSEA", gene.set = "HALLMARK-IL2-STAT5-SIGNALING", @@ -404,9 +413,10 @@ splitEnrichment(scRep_example, * The gene set library from either of the 3 options in the first section of the vignette. -```{r} +```{r tidy=FALSE} densityEnrichment(scRep_example, gene.set.use = "HALLMARK-IL6-JAK-STAT3-SIGNALING", + group.by = "orig.ident", gene.sets = GS.hallmark) ``` @@ -415,7 +425,7 @@ densityEnrichment(scRep_example, It may be advantageous to look at the distribution of multiple gene sets - here we can use ```scatterEnrichment()``` for a 2 gene set comparison. The color values are based on the density of points determined by the number of neighbors, similar to the [Nebulosa R package](https://www.bioconductor.org/packages/release/bioc/html/Nebulosa.html). We just need to define which gene set to plot on the **x.axis** and which to plot on the **y.axis**. -```{r} +```{r tidy=FALSE} scatterEnrichment(scRep_example, assay = "escape.ssGSEA", x.axis = "HALLMARK-INTERFERON-GAMMA-RESPONSE", @@ -424,7 +434,7 @@ scatterEnrichment(scRep_example, The scatter plot can also be converted into a hexbin, another method for summarizing the individual cell distributions along the x and y axis, by setting **style** = "hex". -```{r} +```{r tidy=FALSE} scatterEnrichment(scRep_example, assay = "escape.ssGSEA", x.axis = "HALLMARK-INTERFERON-GAMMA-RESPONSE", @@ -442,15 +452,15 @@ escape has its own PCA function ```performPCA()``` which will work on a single-c Alternatively, other PCA-based functions like Seurat's ```RunPCA()``` or scater's ```runPCA()` can be used. These functions are likely faster and would be ideal if we have a larger number of cells and/or gene sets. -```{r} +```{r tidy=FALSE} scRep_example <- performPCA(scRep_example, - assay = "escape.ssGSEA", - n.dim = 1:10) + assay = "escape.ssGSEA", + n.dim = 1:10) ``` *escape* has a built in method for plotting PCA ```pcaEnrichment()``` that functions similarly to the ```scatterEnrichment()``` function where **x.axis** and **y.axis** are the components to plot. -```{r} +```{r tidy=FALSE} pcaEnrichment(scRep_example, dimRed = "escape.PCA", x.axis = "PC1", @@ -463,7 +473,7 @@ pcaEnrichment(scRep_example, **display.factors** will overlay the magnitude and direction that the features/gene sets contribute to the selected components. The number of gene sets is determined by **number.of.factors**. This can assist in understanding the underlying differences in enrichment across different cells. -```{r} +```{r tidy=FALSE} pcaEnrichment(scRep_example, dimRed = "escape.PCA", x.axis = "PC1", @@ -475,13 +485,13 @@ pcaEnrichment(scRep_example, ## Differential Enrichment -Differential enrichment analysis can be performed similar to differential gene expression analysis. For the purposes of finding the differential enrichment values, we can first normalize the enrichment values for the ssGSEA calculations. +Differential enrichment analysis can be performed similar to differential gene expression analysis. For the purposes of finding the differential enrichment values, we can first normalize the enrichment values for the ssGSEA calculations. Here we are using the **scale.factor** parameter of *nFeature_RNA*, which is the total feature space for each cell. -```{r} +```{r tidy=FALSE} scRep_example <- performNormalization(scRep_example, - assay = "escape.ssGSEA", - gene.sets = GS.hallmark, - make.positive = FALSE) + assay = "escape.ssGSEA", + gene.sets = GS.hallmark, + scale.factor = scRep_example$nFeature_RNA) all.markers <- FindAllMarkers(scRep_example, assay = "escape.ssGSEA_normalized", diff --git a/vignettes/articles/SC_Visualizations.Rmd b/vignettes/articles/SC_Visualizations.Rmd index b7468716..07fac896 100644 --- a/vignettes/articles/SC_Visualizations.Rmd +++ b/vignettes/articles/SC_Visualizations.Rmd @@ -43,7 +43,6 @@ data("contig_list") combined.TCR <- combineTCR(contig_list, samples = c("P17B", "P17L", "P18B", "P18L", "P19B","P19L", "P20B", "P20L")) -googledrive::drive_download("https://drive.google.com/file/d/1_YuRraDyg8UgF3oasjF0-jgPnwox-B24/view?usp=share_link", overwrite = TRUE) scRep_example <- readRDS("scRep_example_full.rds") diff --git a/vignettes/articles/Trex.Rmd b/vignettes/articles/Trex.Rmd index 6a166e43..1843c26f 100644 --- a/vignettes/articles/Trex.Rmd +++ b/vignettes/articles/Trex.Rmd @@ -43,7 +43,7 @@ suppressMessages(library(ggplot2)) suppressMessages(library(viridis)) suppressMessages(library(dplyr)) suppressMessages(library(reticulate)) -use_condaenv(condaenv = "r-reticulate", required = TRUE) +#use_condaenv(condaenv = "r-reticulate", required = TRUE) ``` # Getting Started