Merge pull request #216 from stemangiola/dev

Dev
stemangiola · Oct 6, 2021 · 065646e · 065646e
2 parents 820b0b8 + 14974e4
commit 065646e
Show file tree

Hide file tree

Showing 10 changed files with 475 additions and 28 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -70,7 +70,8 @@ Suggests:
     functional,
     survminer,
     tidySummarizedExperiment,
-    markdown
+    markdown,
+    uwot
 VignetteBuilder: 
     knitr
 RdMacros:

diff --git a/R/functions.R b/R/functions.R
@@ -1825,7 +1825,7 @@ we suggest to partition the dataset for sample clusters.
 
 	}
 
-#' Get principal component information to a tibble using tSNE
+#' Get tSNE
 #'
 #' @keywords internal
 #' @noRd
@@ -1939,6 +1939,130 @@ get_reduced_dimensions_TSNE_bulk <-
 
 	}
 
+#' Get UMAP
+#'
+#' @keywords internal
+#'
+#' @import dplyr
+#' @import tidyr
+#' @import tibble
+#' @importFrom rlang :=
+#' @importFrom stats setNames
+#' @importFrom utils install.packages
+#'
+#' @param .data A tibble
+#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`)
+#' @param .dims A integer vector corresponding to principal components of interest (e.g., 1:6)
+#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally genes)
+#' @param .element A column symbol. The column that is used to calculate distance (i.e., normally samples)
+#' @param top An integer. How many top genes to select
+#' @param of_samples A boolean
+#' @param log_transform A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)
+#' @param calculate_for_pca_dimensions An integer of length one. The number of PCA dimensions to based the UMAP calculatio on. If NULL all variable features are considered
+#' @param ... Further parameters passed to the function uwot
+#'
+#' @return A tibble with additional columns
+#'
+get_reduced_dimensions_UMAP_bulk <-
+  function(.data,
+           .element = NULL,
+           .feature = NULL,
+
+           .abundance = NULL,
+           .dims = 2,
+           top = 500,
+           of_samples = TRUE,
+           log_transform = TRUE,
+           scale = TRUE,
+           calculate_for_pca_dimensions = 20,
+           ...) {
+
+    if(!is.null(calculate_for_pca_dimensions) & (
+      !is(calculate_for_pca_dimensions, "numeric") |
+      length(calculate_for_pca_dimensions) > 1
+    ))
+      stop("tidybulk says: the argument calculate_for_pca_dimensions should be NULL or an integer of size 1")
+
+    # Comply with CRAN NOTES
+    . = NULL
+
+    # Get column names
+    .element = enquo(.element)
+    .feature = enquo(.feature)
+    .abundance = enquo(.abundance)
+
+    # Evaluate ...
+    arguments <- list(...)
+    # if (!"check_duplicates" %in% names(arguments))
+    #   arguments = arguments %>% c(check_duplicates = FALSE)
+    if (!"dims" %in% names(arguments))
+      arguments = arguments %>% c(n_components = .dims)
+    if (!"init" %in% names(arguments))
+      arguments = arguments %>% c(init = "spca")
+
+    # Check if package is installed, otherwise install
+    if (find.package("uwot", quiet = TRUE) %>% length %>% equals(0)) {
+      message("tidybulk says: Installing uwot")
+      install.packages("uwot", repos = "https://cloud.r-project.org")
+    }
+
+    df_source =
+      .data %>%
+
+      # Filter NA symbol
+      filter(!!.feature %>% is.na %>% not()) %>%
+
+      # Prepare data frame
+      distinct(!!.feature,!!.element,!!.abundance) %>%
+
+      # Check if data rectangular
+      when(
+        check_if_data_rectangular(., !!.element,!!.feature,!!.abundance)  ~
+          eliminate_sparse_transcripts(., !!.feature),
+        ~ (.)
+      ) %>%
+
+      # Check if log transform is needed
+      when(log_transform    ~ dplyr::mutate(., !!.abundance := !!.abundance %>% log1p), ~ (.)) %>%
+
+      # Filter most variable genes
+      keep_variable_transcripts(!!.element,!!.feature,!!.abundance, top)
+
+    # Calculate based on PCA
+    if(!is.null(calculate_for_pca_dimensions))
+      df_UMAP =
+      df_source %>%
+      reduce_dimensions(
+        !!.element,!!.feature,!!.abundance,
+        method="PCA",
+        .dims = calculate_for_pca_dimensions,
+        action="get",
+        scale = scale
+      ) %>%
+      suppressMessages() %>%
+      as_matrix(rownames = quo_name(.element))
+
+    # Calculate based on all features
+    else
+      df_UMAP =
+      df_source %>%
+      spread(!!.feature,!!.abundance) %>%
+      as_matrix(rownames = quo_name(.element))
+
+    do.call(uwot::tumap, c(list(df_UMAP), arguments)) %>%
+      as_tibble(.name_repair = "minimal") %>%
+      setNames(c("UMAP1", "UMAP2")) %>%
+
+      # add element name
+      dplyr::mutate(!!.element := df_UMAP %>% rownames) %>%
+      select(!!.element, everything()) %>%
+
+      # Attach attributes
+      reattach_internals(.data) %>%
+      memorise_methods_used("uwot")
+
+  }
+
 #' Get rotated dimensions of two principal components or MDS dimension of choice, of an angle
 #'
 #' @keywords internal

diff --git a/R/functions_SE.R b/R/functions_SE.R
@@ -361,17 +361,16 @@ get_reduced_dimensions_TSNE_bulk_SE <-
 
 		# Set perprexity to not be too high
 		if (!"perplexity" %in% names(arguments))
-			arguments = arguments %>% c(perplexity = ((
-				.data %>% distinct(!!.element) %>% nrow %>% sum(-1)
-			) / 3 / 2) %>% floor() %>% min(30))
+		  arguments = arguments %>% c(perplexity = ((
+		    .data %>% ncol() %>% sum(-1)
+		  ) / 3 / 2) %>% floor() %>% min(30))
 
 		# If not enough samples stop
 		if (arguments$perplexity <= 2)
 			stop("tidybulk says: You don't have enough samples to run tSNE")
 
 		# Calculate the most variable genes, from plotMDS Limma
-		tsne_obj =
-			do.call(Rtsne::Rtsne, c(list(t(.data)), arguments))
+		tsne_obj = do.call(Rtsne::Rtsne, c(list(t(.data)), arguments))
 
 
 
@@ -389,6 +388,93 @@ get_reduced_dimensions_TSNE_bulk_SE <-
 
 	}
 
+#' Get UMAP
+#'
+#' @keywords internal
+#'
+#' @import dplyr
+#' @import tidyr
+#' @import tibble
+#' @importFrom rlang :=
+#' @importFrom stats setNames
+#' @importFrom utils install.packages
+#'
+#' @param .data A tibble
+#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`)
+#' @param .dims A integer vector corresponding to principal components of interest (e.g., 1:6)
+#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally genes)
+#' @param .element A column symbol. The column that is used to calculate distance (i.e., normally samples)
+#' @param top An integer. How many top genes to select
+#' @param of_samples A boolean
+#' @param log_transform A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)
+#' @param calculate_for_pca_dimensions An integer of length one. The number of PCA dimensions to based the UMAP calculatio on. If NULL all variable features are considered
+#' @param ... Further parameters passed to the function uwot
+#'
+#' @return A tibble with additional columns
+#'
+get_reduced_dimensions_UMAP_bulk_SE <-
+  function(.data,
+           .dims = 2,
+           top = 500,
+           of_samples = TRUE,
+           log_transform = TRUE,
+           scale = NULL, # This is only a dummy argument for making it compatibble with PCA
+           calculate_for_pca_dimensions = 20,
+           ...) {
+    # Comply with CRAN NOTES
+    . = NULL
+
+    # To avoid dplyr complications
+
+    # Evaluate ...
+    arguments <- list(...)
+    # if (!"check_duplicates" %in% names(arguments))
+    #   arguments = arguments %>% c(check_duplicates = FALSE)
+    if (!"dims" %in% names(arguments))
+      arguments = arguments %>% c(n_components = .dims)
+    if (!"init" %in% names(arguments))
+      arguments = arguments %>% c(init = "spca")
+
+
+    # Check if package is installed, otherwise install
+    if (find.package("uwot", quiet = TRUE) %>% length %>% equals(0)) {
+      message("tidybulk says: Installing uwot")
+      install.packages("uwot", repos = "https://cloud.r-project.org")
+    }
+
+
+    # Calculate based on PCA
+    if(!is.null(calculate_for_pca_dimensions))
+      df_UMAP =
+      .data %>%
+
+      t() %>%
+
+      # Calculate principal components
+      prcomp(scale = scale) %$%
+
+      # Parse the PCA results to a tibble
+      x %>%
+      .[,1:calculate_for_pca_dimensions]
+
+    # Calculate based on all features
+    else
+      df_UMAP = .data
+
+    umap_obj = do.call(uwot::tumap, c(list(df_UMAP), arguments))
+
+    list(
+      raw_result = umap_obj,
+      result = umap_obj  %>%
+        as_tibble(.name_repair = "minimal") %>%
+        setNames(c("UMAP1", "UMAP2")) %>%
+
+        # add element name
+        dplyr::mutate(sample = !!.data %>% colnames) %>%
+        select(-sample)
+    )
+
+  }
 
 counts_scaled_exist_SE = function(.data){
 

diff --git a/R/methods.R b/R/methods.R
@@ -725,6 +725,23 @@ setMethod("cluster_elements", "tidybulk", .cluster_elements)
 #' Underlying method for tSNE:
 #' Rtsne::Rtsne(data, ...)
 #'
+#' Underlying method for UMAP:
+#'
+#'  df_source =
+#' .data %>%
+#'
+#'   # Filter NA symbol
+#'   filter(!!.feature %>% is.na %>% not()) %>%
+#'
+#'   # Prepare data frame
+#'   distinct(!!.feature,!!.element,!!.abundance) %>%
+#'
+#'   # Filter most variable genes
+#'   keep_variable_transcripts(top) %>%
+#'   reduce_dimensions(method="PCA",  .dims = calculate_for_pca_dimensions,  action="get" ) %>%
+#'   as_matrix(rownames = quo_name(.element)) %>%
+#'   uwot::tumap(...)
+#'
 #'
 #' @return A tbl object with additional columns for the reduced dimensions
 #'
@@ -814,7 +831,7 @@ setGeneric("reduce_dimensions", function(.data,
 		) %>%
 
 		when(
-			method == "MDS" ~ 	get_reduced_dimensions_MDS_bulk(.,
+			tolower(method) == tolower("MDS") ~ 	get_reduced_dimensions_MDS_bulk(.,
 				.abundance = !!.abundance,
 				.dims = .dims,
 				.element = !!.element,
@@ -824,7 +841,7 @@ setGeneric("reduce_dimensions", function(.data,
 				log_transform = log_transform,
 				...
 			),
-			method == "PCA" ~ 	get_reduced_dimensions_PCA_bulk(.,
+			tolower(method) == tolower("PCA") ~ 	get_reduced_dimensions_PCA_bulk(.,
 				.abundance = !!.abundance,
 				.dims = .dims,
 				.element = !!.element,
@@ -835,7 +852,7 @@ setGeneric("reduce_dimensions", function(.data,
 				scale = scale,
 				...
 			),
-			method == "tSNE" ~ 	get_reduced_dimensions_TSNE_bulk(.,
+			tolower(method) == tolower("tSNE") ~ 	get_reduced_dimensions_TSNE_bulk(.,
 				.abundance = !!.abundance,
 				.dims = .dims,
 				.element = !!.element,
@@ -845,6 +862,17 @@ setGeneric("reduce_dimensions", function(.data,
 				log_transform = log_transform,
 				...
 			),
+			tolower(method) == tolower("UMAP") ~ 	get_reduced_dimensions_UMAP_bulk(.,
+			                                                                       .abundance = !!.abundance,
+			                                                                       .dims = .dims,
+			                                                                       .element = !!.element,
+			                                                                       .feature = !!.feature,
+			                                                                       top = top,
+			                                                                       of_samples = of_samples,
+			                                                                       log_transform = log_transform,
+			                                                                       scale = scale,
+			                                                                       ...
+			),
 			TRUE ~ 	stop("tidybulk says: method must be either \"MDS\" or \"PCA\" or \"tSNE\"")
 		)
 

diff --git a/R/methods_SE.R b/R/methods_SE.R
@@ -350,10 +350,11 @@ setMethod("cluster_elements",
 	my_reduction_function  =
 		method %>%
 		when(
-			(.) == "MDS" ~ get_reduced_dimensions_MDS_bulk_SE,
-			(.) == "PCA" ~ get_reduced_dimensions_PCA_bulk_SE,
-			(.) == "tSNE" ~ get_reduced_dimensions_TSNE_bulk_SE,
-			~ stop("tidybulk says: method must be either \"MDS\" or \"PCA\" or \"tSNE\"")
+			tolower(.) == tolower("MDS") ~ get_reduced_dimensions_MDS_bulk_SE,
+			tolower(.) == tolower("PCA") ~ get_reduced_dimensions_PCA_bulk_SE,
+			tolower(.) == tolower("tSNE") ~ get_reduced_dimensions_TSNE_bulk_SE,
+			tolower(.) == tolower("UMAP") ~ get_reduced_dimensions_UMAP_bulk_SE,
+			~ stop("tidybulk says: method must be either \"MDS\" or \"PCA\" or \"tSNE\", or \"UMAP\" ")
 		)
 
 	# Both dataframe and raw result object are returned
@@ -378,10 +379,11 @@ setMethod("cluster_elements",
 
 		# Add bibliography
 		when(
-			method == "MDS" ~ memorise_methods_used(., "limma"),
-			method == "PCA" ~ memorise_methods_used(., "stats"),
-			method == "tSNE" ~ memorise_methods_used(., "rtsne"),
-			~ stop("tidybulk says: the only supported methods are \"kmeans\" or \"SNN\" ")
+		  tolower(method) == tolower("MDS") ~ memorise_methods_used(., "limma"),
+			tolower(method) == tolower("PCA") ~ memorise_methods_used(., "stats"),
+			tolower(method) == tolower("tSNE") ~ memorise_methods_used(., "rtsne"),
+			tolower(method) == tolower("UMAP") ~ memorise_methods_used(., "uwot"),
+			~ stop("tidybulk says: method must be either \"MDS\" or \"PCA\" or \"tSNE\", or \"UMAP\" ")
 		) %>%
 
 		# Attach edgeR for keep variable filtering