Skip to content

Commit

Permalink
Merge pull request #12 from AlexsLemonade/jashapiro/2-gene-symbols
Browse files Browse the repository at this point in the history
Create functions for converting ensembl ids to gene symbols
  • Loading branch information
jashapiro authored Nov 21, 2024
2 parents 9c5472c + 85d043e commit 758edb4
Show file tree
Hide file tree
Showing 14 changed files with 266 additions and 17 deletions.
2 changes: 2 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@
^.*\.Rproj$
^\.Rproj\.user$
^\.github$
^\.lintr$
^\.pre-commit-config.yaml$
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ Imports:
methods,
pdfCluster,
purrr,
S4Vectors,
SingleCellExperiment,
tibble,
tidyr
Expand Down
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,12 @@ export(calculate_clusters)
export(calculate_purity)
export(calculate_silhouette)
export(calculate_stability)
export(ensembl_to_symbol)
export(extract_pc_matrix)
export(sce_to_symbols)
export(sweep_clusters)
import(SingleCellExperiment)
import(methods)
importFrom(S4Vectors,`metadata<-`)
importFrom(S4Vectors,metadata)
importFrom(stats,setNames)
10 changes: 5 additions & 5 deletions R/calculate-clusters.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
#' Specifically, the clustering algorithm defaults to "louvain" and the weighting scheme to "jaccard"
#' to align with common practice in scRNA-seq analysis.
#'
#' @import methods
#'
#' @param x An object containing PCs that clustering can be performed in. This can be either a SingleCellExperiment
#' object, a Seurat object, or a matrix where columns are PCs and rows are cells.
#' If a matrix is provided, it must have row names of cell ids (e.g., barcodes).
Expand All @@ -33,6 +31,8 @@
#' Louvain and Leiden clustering will also include `resolution`,
#' and Leiden clustering will further include `objective_function`.
#'
#' @import methods
#'
#' @export
#'
#' @examples
Expand Down Expand Up @@ -160,16 +160,16 @@ calculate_clusters <- function(
#' this function will use "PCA" for SingleCellExperiment objects, and
#' "pca" for Seurat objects.
#'
#' @import SingleCellExperiment
#' @import methods
#'
#' @param sc_object Either a SingleCellExperiment or Seurat object
#' @param pc_name Optionally, the name of the PC matrix in the object. If this is
#' not provided, the name "PCA" is used for SingleCellExperiment objects, and
#' "pca" for Seurat objects.
#'
#' @return PC matrix with row names
#'
#' @import methods
#' @import SingleCellExperiment
#'
#' @export
#'
#' @examples
Expand Down
127 changes: 127 additions & 0 deletions R/convert-gene-ids.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#' Convert Ensembl gene ids to gene symbols based on an ScPCA SingleCellExperiment object
#'
#' The SingleCellExperiment objects produced as part of ScPCA are indexed by
#' Ensembl gene ids, as those are more stable than gene symbols. However,
#' for many applications gene symbols are useful. This function provides a
#' simple and consistent conversion of Ensembl gene ids to gene symbols based on
#' the `gene_symbol` column that is present in the row data of ScPCA
#' SingleCellExperiment objects.
#'
#' For this function, the SingleCellExperiment object must contain a `gene_ids`
#' column containing Ensembl gene ids and a `gene_symbol` column containing gene
#' symbols. If any gene ids are not found or if the gene symbol is not defined,
#' a warning is issued and the input gene id is returned in its place. If
#' `leave_na` is set to `TRUE`, those positions are instead left as `NA` and no
#' warning is issued.
#'
#'
#' @param ensembl_ids A character vector of Ensembl gene ids to translate to
#'   gene symbols.
#' @param sce A SingleCellExperiment object containing gene ids and gene symbols
#'   to use for translation.
#' @param leave_na A single `TRUE` or `FALSE` value indicating whether to leave
#'   NA values in the output. Default is `FALSE`.
#'
#' @return A vector of gene symbols corresponding to the input Ensembl ids, in
#'   the same order as `ensembl_ids`.
#' @export
#'
#' @import SingleCellExperiment
#'
#' @examples
#' \dontrun{
#' # convert a set of Ensembl ids to gene symbols
#' # using a SingleCellExperiment reference
#' ensembl_ids <- c("ENSG00000141510", "ENSG00000134323")
#' gene_symbols <- ensembl_to_symbol(ensembl_ids, sce)
#' gene_symbols
#' ### [1] "TP53" "MYCN"
#' }
ensembl_to_symbol <- function(ensembl_ids, sce, leave_na = FALSE) {
  stopifnot(
    "`sce` must be a SingleCellExperiment object." = is(sce, "SingleCellExperiment"),
    "`ensembl_ids` must be a character vector." = is.character(ensembl_ids),
    "`sce` must contain both a `gene_ids` and `gene_symbol` column in the row data." =
      all(c("gene_ids", "gene_symbol") %in% names(rowData(sce))),
    # `is.logical()` alone accepts `NA` and vectors of any length, which would
    # only fail later (and unhelpfully) in the `if ()` condition below
    "`leave_na` must be TRUE or FALSE." =
      is.logical(leave_na) && length(leave_na) == 1 && !is.na(leave_na)
  )

  # look up the row data once, then translate by position;
  # ids absent from `gene_ids` give NA matches and therefore NA symbols
  row_data <- rowData(sce)
  id_match <- match(ensembl_ids, row_data$gene_ids)
  gene_symbols <- row_data$gene_symbol[id_match]

  # covers both ids that were not found and ids whose symbol is NA
  missing_symbols <- is.na(gene_symbols)
  if (!leave_na && any(missing_symbols)) {
    warning("Not all `ensembl_ids` values have corresponding gene symbols, using input ids for missing values.")
    gene_symbols[missing_symbols] <- ensembl_ids[missing_symbols]
  }

  return(gene_symbols)
}

#' Set the row names of an ScPCA SingleCellExperiment object to gene symbols
#'
#' The SingleCellExperiment objects produced as part of ScPCA are indexed by
#' Ensembl gene ids, as those are more stable than gene symbols. However,
#' for many applications gene symbols are useful. This function converts the
#' row names (indexes) of a SingleCellExperiment object to gene symbols based on the
#' `gene_symbol` column that is present in the row data of ScPCA SingleCellExperiment objects.
#' Rows without a gene symbol keep their Ensembl id (from the `gene_ids`
#' column) as the row name, and a warning is issued.
#'
#' Internal data structures such as the list of highly variable genes and the
#' rotation matrix for the PCA are also updated to use gene symbols, if present
#' (and not disabled by the `convert_hvg` and `convert_pca` arguments).
#' If the ids stored in either structure are not all found in the `gene_ids`
#' column, that structure is left unchanged and a warning is issued.
#'
#' Note that using this function may result in non-unique row names, as no
#' de-duplication is currently performed.
#'
#' @param sce A SingleCellExperiment object containing gene ids and gene symbols.
#' @param convert_hvg A single `TRUE` or `FALSE` value indicating whether to
#'   convert highly variable genes to gene symbols. Default is `TRUE`.
#' @param convert_pca A single `TRUE` or `FALSE` value indicating whether to
#'   convert the PCA rotation matrix row names to gene symbols. Default is `TRUE`.
#'
#' @return A SingleCellExperiment object with row names set as gene symbols.
#' @export
#'
#' @import SingleCellExperiment
#' @importFrom S4Vectors metadata `metadata<-`
#'
#' @examples
#' \dontrun{
#' # convert a SingleCellExperiment object to use gene symbols
#' symbol_sce <- sce_to_symbols(sce)
#' }
sce_to_symbols <- function(sce, convert_hvg = TRUE, convert_pca = TRUE) {
  stopifnot(
    "`sce` must be a SingleCellExperiment object." = is(sce, "SingleCellExperiment"),
    "`sce` must contain both a `gene_ids` and `gene_symbol` column in the row data." =
      all(c("gene_ids", "gene_symbol") %in% names(rowData(sce))),
    # validate the flags up front: NA, non-logical or multi-element values
    # would otherwise only fail (or be silently ignored) in the `if ()`
    # conditions further down
    "`convert_hvg` must be TRUE or FALSE." =
      is.logical(convert_hvg) && length(convert_hvg) == 1 && !is.na(convert_hvg),
    "`convert_pca` must be TRUE or FALSE." =
      is.logical(convert_pca) && length(convert_pca) == 1 && !is.na(convert_pca)
  )

  row_ids <- rowData(sce)$gene_symbol
  # set Ensembl ids as original ids for later translations
  names(row_ids) <- rowData(sce)$gene_ids

  # fall back to the Ensembl id wherever no symbol is defined
  missing_ids <- is.na(row_ids)
  if (any(missing_ids)) {
    warning("Not all rows have gene symbols, using Ensembl ids for missing values.")
    row_ids[missing_ids] <- names(row_ids)[missing_ids]
  }

  rownames(sce) <- row_ids

  if (convert_hvg && "highly_variable_genes" %in% names(metadata(sce))) {
    hvgs <- metadata(sce)$highly_variable_genes
    # only translate when every stored id can be looked up
    if (all(hvgs %in% names(row_ids))) {
      # note: the stored vector keeps the Ensembl ids as its names
      metadata(sce)$highly_variable_genes <- row_ids[hvgs]
    } else {
      warning("Highly variable gene names did not match `gene_ids` values, not updating highly variable genes.")
    }
  }

  if (convert_pca && "PCA" %in% reducedDimNames(sce) && !is.null(attr(reducedDim(sce, "PCA"), "rotation"))) {
    pca <- reducedDim(sce, "PCA")
    rotation_ids <- rownames(attr(pca, "rotation"))
    # only translate when every rotation row name can be looked up
    if (all(rotation_ids %in% names(row_ids))) {
      rownames(attr(pca, "rotation")) <- row_ids[rotation_ids]
      reducedDim(sce, "PCA") <- pca
    } else {
      warning("PCA rotation matrix names did not match `gene_ids` values, not updating.")
    }
  }

  return(sce)
}
6 changes: 3 additions & 3 deletions R/evaluate-clusters.R
Original file line number Diff line number Diff line change
Expand Up @@ -151,9 +151,9 @@ calculate_purity <- function(
#' When assessing stability, you should specify the same clustering parameters here as
#' were used to calculate the original clusters.
#'
#' Note that this function will also make use of bluster::clusterRows() with the
#' bluster::NNGraphParam() function on a principal components matrix. Note that defaults
#' for some arguments may differ from the bluster::NNGraphParam() defaults.
#' Note that this function will also make use of `bluster::clusterRows()` with the
#' `bluster::NNGraphParam()` function on a principal components matrix. Note that defaults
#' for some arguments may differ from the `bluster::NNGraphParam()` defaults.
#' Specifically, the clustering algorithm defaults to "louvain" and the weighting scheme
#' to "jaccard" to align with common practice in scRNA-seq analysis.
#'
Expand Down
6 changes: 3 additions & 3 deletions man/calculate_stability.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

45 changes: 45 additions & 0 deletions man/ensembl_to_symbol.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

39 changes: 39 additions & 0 deletions man/sce_to_symbols.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file added tests/testthat/data/scpca_sce.rds
Binary file not shown.
2 changes: 0 additions & 2 deletions tests/testthat/test-calculate-clusters.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
suppressPackageStartupMessages(library(SingleCellExperiment))

set.seed(2024)
sce <- splatter::simpleSimulate(nGenes = 1000, verbose = FALSE) |>
scater::logNormCounts() |>
Expand Down
37 changes: 37 additions & 0 deletions tests/testthat/test-convert-gene-ids.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Load the SingleCellExperiment fixture shared by every test in this file
sce <- test_path("data", "scpca_sce.rds") |>
  readRDS()

test_that("basic ensembl_id conversion works", {
  observed <- ensembl_to_symbol(c("ENSG00000141510", "ENSG00000134323"), sce)

  expect_equal(observed, c("TP53", "MYCN"))
})

test_that("ensembl_id conversion works with unexpected ids", {
  # the last id is not an Ensembl id, so it cannot be translated
  query_ids <- c("ENSG00000141510", "ENSG00000134323", "foobar")

  # default: a warning is raised and the input id is passed through
  expect_warning(with_fallback <- ensembl_to_symbol(query_ids, sce))
  expect_equal(with_fallback, c("TP53", "MYCN", "foobar"))

  # leave_na = TRUE: no warning, and the unmatched id becomes NA
  expect_no_warning(with_na <- ensembl_to_symbol(query_ids, sce, leave_na = TRUE))
  expect_equal(with_na, c("TP53", "MYCN", NA))
})

test_that("conversion of a full sce object works as expected", {
  expect_warning(symbol_sce <- sce_to_symbols(sce))

  # build the expected Ensembl id -> symbol lookup independently,
  # falling back to the Ensembl id where the symbol is NA
  row_info <- rowData(sce)
  lookup <- setNames(row_info$gene_symbol, row_info$gene_ids)
  no_symbol <- is.na(lookup)
  lookup[no_symbol] <- names(lookup)[no_symbol]

  expect_equal(rownames(symbol_sce), unname(lookup))

  # check that hvg and PCA were converted too.
  original_hvg <- metadata(sce)$highly_variable_genes
  expect_equal(metadata(symbol_sce)$highly_variable_genes, lookup[original_hvg])

  original_rotation <- attr(reducedDim(sce, "PCA"), "rotation")
  converted_rotation <- attr(reducedDim(symbol_sce, "PCA"), "rotation")
  expect_equal(rownames(converted_rotation), lookup[rownames(original_rotation)])
})
2 changes: 0 additions & 2 deletions tests/testthat/test-evaluate-clusters.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
suppressPackageStartupMessages(library(SingleCellExperiment))

set.seed(2024)
sce <- splatter::simpleSimulate(nGenes = 1000, verbose = FALSE) |>
scater::logNormCounts() |>
Expand Down
2 changes: 0 additions & 2 deletions tests/testthat/test-sweep-clusters.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
suppressPackageStartupMessages(library(SingleCellExperiment))

set.seed(2024)
sce <- splatter::simpleSimulate(nGenes = 1000, verbose = FALSE) |>
scater::logNormCounts() |>
Expand Down

0 comments on commit 758edb4

Please sign in to comment.