Skip to content

Commit

Permalink
Merge pull request #216 from stemangiola/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
stemangiola authored Oct 6, 2021
2 parents 820b0b8 + 14974e4 commit 065646e
Show file tree
Hide file tree
Showing 10 changed files with 475 additions and 28 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ Suggests:
functional,
survminer,
tidySummarizedExperiment,
markdown
markdown,
uwot
VignetteBuilder:
knitr
RdMacros:
Expand Down
126 changes: 125 additions & 1 deletion R/functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -1825,7 +1825,7 @@ we suggest to partition the dataset for sample clusters.

}

#' Get principal component information to a tibble using tSNE
#' Get tSNE
#'
#' @keywords internal
#' @noRd
Expand Down Expand Up @@ -1939,6 +1939,130 @@ get_reduced_dimensions_TSNE_bulk <-

}

#' Get UMAP
#'
#' @keywords internal
#'
#' @import dplyr
#' @import tidyr
#' @import tibble
#' @importFrom rlang :=
#' @importFrom stats setNames
#' @importFrom utils install.packages
#'
#' @param .data A tibble
#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`)
#' @param .dims A integer vector corresponding to principal components of interest (e.g., 1:6)
#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally genes)
#' @param .element A column symbol. The column that is used to calculate distance (i.e., normally samples)
#' @param top An integer. How many top genes to select
#' @param of_samples A boolean
#' @param log_transform A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)
#' @param calculate_for_pca_dimensions An integer of length one. The number of PCA dimensions to based the UMAP calculatio on. If NULL all variable features are considered
#' @param ... Further parameters passed to the function uwot
#'
#' @return A tibble with additional columns
#'
get_reduced_dimensions_UMAP_bulk <-
function(.data,
.element = NULL,
.feature = NULL,

.abundance = NULL,
.dims = 2,
top = 500,
of_samples = TRUE,
log_transform = TRUE,
scale = TRUE,
calculate_for_pca_dimensions = 20,
...) {

if(!is.null(calculate_for_pca_dimensions) & (
!is(calculate_for_pca_dimensions, "numeric") |
length(calculate_for_pca_dimensions) > 1
))
stop("tidybulk says: the argument calculate_for_pca_dimensions should be NULL or an integer of size 1")

# Comply with CRAN NOTES
. = NULL

# Get column names
.element = enquo(.element)
.feature = enquo(.feature)
.abundance = enquo(.abundance)

# Evaluate ...
arguments <- list(...)
# if (!"check_duplicates" %in% names(arguments))
# arguments = arguments %>% c(check_duplicates = FALSE)
if (!"dims" %in% names(arguments))
arguments = arguments %>% c(n_components = .dims)
if (!"init" %in% names(arguments))
arguments = arguments %>% c(init = "spca")

# Check if package is installed, otherwise install
if (find.package("uwot", quiet = TRUE) %>% length %>% equals(0)) {
message("tidybulk says: Installing uwot")
install.packages("uwot", repos = "https://cloud.r-project.org")
}

df_source =
.data %>%

# Filter NA symbol
filter(!!.feature %>% is.na %>% not()) %>%

# Prepare data frame
distinct(!!.feature,!!.element,!!.abundance) %>%

# Check if data rectangular
when(
check_if_data_rectangular(., !!.element,!!.feature,!!.abundance) ~
eliminate_sparse_transcripts(., !!.feature),
~ (.)
) %>%

# Check if log transform is needed
when(log_transform ~ dplyr::mutate(., !!.abundance := !!.abundance %>% log1p), ~ (.)) %>%

# Filter most variable genes
keep_variable_transcripts(!!.element,!!.feature,!!.abundance, top)

# Calculate based on PCA
if(!is.null(calculate_for_pca_dimensions))
df_UMAP =
df_source %>%
reduce_dimensions(
!!.element,!!.feature,!!.abundance,
method="PCA",
.dims = calculate_for_pca_dimensions,
action="get",
scale = scale
) %>%
suppressMessages() %>%
as_matrix(rownames = quo_name(.element))

# Calculate based on all features
else
df_UMAP =
df_source %>%
spread(!!.feature,!!.abundance) %>%
as_matrix(rownames = quo_name(.element))

do.call(uwot::tumap, c(list(df_UMAP), arguments)) %>%
as_tibble(.name_repair = "minimal") %>%
setNames(c("UMAP1", "UMAP2")) %>%

# add element name
dplyr::mutate(!!.element := df_UMAP %>% rownames) %>%
select(!!.element, everything()) %>%

# Attach attributes
reattach_internals(.data) %>%
memorise_methods_used("uwot")

}

#' Get rotated dimensions of two principal components or MDS dimension of choice, of an angle
#'
#' @keywords internal
Expand Down
96 changes: 91 additions & 5 deletions R/functions_SE.R
Original file line number Diff line number Diff line change
Expand Up @@ -361,17 +361,16 @@ get_reduced_dimensions_TSNE_bulk_SE <-

# Set perprexity to not be too high
if (!"perplexity" %in% names(arguments))
arguments = arguments %>% c(perplexity = ((
.data %>% distinct(!!.element) %>% nrow %>% sum(-1)
) / 3 / 2) %>% floor() %>% min(30))
arguments = arguments %>% c(perplexity = ((
.data %>% ncol() %>% sum(-1)
) / 3 / 2) %>% floor() %>% min(30))

# If not enough samples stop
if (arguments$perplexity <= 2)
stop("tidybulk says: You don't have enough samples to run tSNE")

# Calculate the most variable genes, from plotMDS Limma
tsne_obj =
do.call(Rtsne::Rtsne, c(list(t(.data)), arguments))
tsne_obj = do.call(Rtsne::Rtsne, c(list(t(.data)), arguments))



Expand All @@ -389,6 +388,93 @@ get_reduced_dimensions_TSNE_bulk_SE <-

}

#' Get UMAP
#'
#' @keywords internal
#'
#' @import dplyr
#' @import tidyr
#' @import tibble
#' @importFrom rlang :=
#' @importFrom stats setNames
#' @importFrom utils install.packages
#'
#' @param .data A tibble
#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`)
#' @param .dims A integer vector corresponding to principal components of interest (e.g., 1:6)
#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally genes)
#' @param .element A column symbol. The column that is used to calculate distance (i.e., normally samples)
#' @param top An integer. How many top genes to select
#' @param of_samples A boolean
#' @param log_transform A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)
#' @param calculate_for_pca_dimensions An integer of length one. The number of PCA dimensions to based the UMAP calculatio on. If NULL all variable features are considered
#' @param ... Further parameters passed to the function uwot
#'
#' @return A tibble with additional columns
#'
get_reduced_dimensions_UMAP_bulk_SE <-
function(.data,
.dims = 2,
top = 500,
of_samples = TRUE,
log_transform = TRUE,
scale = NULL, # This is only a dummy argument for making it compatibble with PCA
calculate_for_pca_dimensions = 20,
...) {
# Comply with CRAN NOTES
. = NULL

# To avoid dplyr complications

# Evaluate ...
arguments <- list(...)
# if (!"check_duplicates" %in% names(arguments))
# arguments = arguments %>% c(check_duplicates = FALSE)
if (!"dims" %in% names(arguments))
arguments = arguments %>% c(n_components = .dims)
if (!"init" %in% names(arguments))
arguments = arguments %>% c(init = "spca")


# Check if package is installed, otherwise install
if (find.package("uwot", quiet = TRUE) %>% length %>% equals(0)) {
message("tidybulk says: Installing uwot")
install.packages("uwot", repos = "https://cloud.r-project.org")
}


# Calculate based on PCA
if(!is.null(calculate_for_pca_dimensions))
df_UMAP =
.data %>%

t() %>%

# Calculate principal components
prcomp(scale = scale) %$%

# Parse the PCA results to a tibble
x %>%
.[,1:calculate_for_pca_dimensions]

# Calculate based on all features
else
df_UMAP = .data

umap_obj = do.call(uwot::tumap, c(list(df_UMAP), arguments))

list(
raw_result = umap_obj,
result = umap_obj %>%
as_tibble(.name_repair = "minimal") %>%
setNames(c("UMAP1", "UMAP2")) %>%

# add element name
dplyr::mutate(sample = !!.data %>% colnames) %>%
select(-sample)
)

}

counts_scaled_exist_SE = function(.data){

Expand Down
34 changes: 31 additions & 3 deletions R/methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -725,6 +725,23 @@ setMethod("cluster_elements", "tidybulk", .cluster_elements)
#' Underlying method for tSNE:
#' Rtsne::Rtsne(data, ...)
#'
#' Underlying method for UMAP:
#'
#' df_source =
#' .data %>%
#'
#' # Filter NA symbol
#' filter(!!.feature %>% is.na %>% not()) %>%
#'
#' # Prepare data frame
#' distinct(!!.feature,!!.element,!!.abundance) %>%
#'
#' # Filter most variable genes
#' keep_variable_transcripts(top) %>%
#' reduce_dimensions(method="PCA", .dims = calculate_for_pca_dimensions, action="get" ) %>%
#' as_matrix(rownames = quo_name(.element)) %>%
#' uwot::tumap(...)
#'
#'
#' @return A tbl object with additional columns for the reduced dimensions
#'
Expand Down Expand Up @@ -814,7 +831,7 @@ setGeneric("reduce_dimensions", function(.data,
) %>%

when(
method == "MDS" ~ get_reduced_dimensions_MDS_bulk(.,
tolower(method) == tolower("MDS") ~ get_reduced_dimensions_MDS_bulk(.,
.abundance = !!.abundance,
.dims = .dims,
.element = !!.element,
Expand All @@ -824,7 +841,7 @@ setGeneric("reduce_dimensions", function(.data,
log_transform = log_transform,
...
),
method == "PCA" ~ get_reduced_dimensions_PCA_bulk(.,
tolower(method) == tolower("PCA") ~ get_reduced_dimensions_PCA_bulk(.,
.abundance = !!.abundance,
.dims = .dims,
.element = !!.element,
Expand All @@ -835,7 +852,7 @@ setGeneric("reduce_dimensions", function(.data,
scale = scale,
...
),
method == "tSNE" ~ get_reduced_dimensions_TSNE_bulk(.,
tolower(method) == tolower("tSNE") ~ get_reduced_dimensions_TSNE_bulk(.,
.abundance = !!.abundance,
.dims = .dims,
.element = !!.element,
Expand All @@ -845,6 +862,17 @@ setGeneric("reduce_dimensions", function(.data,
log_transform = log_transform,
...
),
tolower(method) == tolower("UMAP") ~ get_reduced_dimensions_UMAP_bulk(.,
.abundance = !!.abundance,
.dims = .dims,
.element = !!.element,
.feature = !!.feature,
top = top,
of_samples = of_samples,
log_transform = log_transform,
scale = scale,
...
),
TRUE ~ stop("tidybulk says: method must be either \"MDS\" or \"PCA\" or \"tSNE\"")
)

Expand Down
18 changes: 10 additions & 8 deletions R/methods_SE.R
Original file line number Diff line number Diff line change
Expand Up @@ -350,10 +350,11 @@ setMethod("cluster_elements",
my_reduction_function =
method %>%
when(
(.) == "MDS" ~ get_reduced_dimensions_MDS_bulk_SE,
(.) == "PCA" ~ get_reduced_dimensions_PCA_bulk_SE,
(.) == "tSNE" ~ get_reduced_dimensions_TSNE_bulk_SE,
~ stop("tidybulk says: method must be either \"MDS\" or \"PCA\" or \"tSNE\"")
tolower(.) == tolower("MDS") ~ get_reduced_dimensions_MDS_bulk_SE,
tolower(.) == tolower("PCA") ~ get_reduced_dimensions_PCA_bulk_SE,
tolower(.) == tolower("tSNE") ~ get_reduced_dimensions_TSNE_bulk_SE,
tolower(.) == tolower("UMAP") ~ get_reduced_dimensions_UMAP_bulk_SE,
~ stop("tidybulk says: method must be either \"MDS\" or \"PCA\" or \"tSNE\", or \"UMAP\" ")
)

# Both dataframe and raw result object are returned
Expand All @@ -378,10 +379,11 @@ setMethod("cluster_elements",

# Add bibliography
when(
method == "MDS" ~ memorise_methods_used(., "limma"),
method == "PCA" ~ memorise_methods_used(., "stats"),
method == "tSNE" ~ memorise_methods_used(., "rtsne"),
~ stop("tidybulk says: the only supported methods are \"kmeans\" or \"SNN\" ")
tolower(method) == tolower("MDS") ~ memorise_methods_used(., "limma"),
tolower(method) == tolower("PCA") ~ memorise_methods_used(., "stats"),
tolower(method) == tolower("tSNE") ~ memorise_methods_used(., "rtsne"),
tolower(method) == tolower("UMAP") ~ memorise_methods_used(., "uwot"),
~ stop("tidybulk says: method must be either \"MDS\" or \"PCA\" or \"tSNE\", or \"UMAP\" ")
) %>%

# Attach edgeR for keep variable filtering
Expand Down
Loading

0 comments on commit 065646e

Please sign in to comment.