Initial upload day 1

saezlab · Nov 27, 2024 · 05c60c6 · 05c60c6
commit 05c60c6
Show file tree

Hide file tree

Showing 9 changed files with 4,770 additions and 0 deletions.
diff --git a/1_MedInfNetworks2024_Omics.Rmd b/1_MedInfNetworks2024_Omics.Rmd
diff --git a/1_MedInfNetworks2024_Omics.html b/1_MedInfNetworks2024_Omics.html
diff --git a/2_MedInfNetworks2024_Vis.Rmd b/2_MedInfNetworks2024_Vis.Rmd
@@ -0,0 +1,383 @@
+---
+title: "Data visualization with ggplot and pheatmap"
+output:
+  html_document:
+    toc: yes
+    toc_float: yes
+editor_options: 
+  markdown: 
+    wrap: 80
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+## Overview
+
+Get acquainted with data visualization with ggplot and pheatmap.
+
+You can find a useful cheatsheet for `ggplot2` and other common R packages in
+the RStudio website: <https://www.rstudio.com/resources/cheatsheets/>
+
+## 1) Loading packages and data
+
+Please install the following packages from CRAN:
+
+-   ggplot2
+
+-   pheatmap
+
+-   dplyr
+
+-   tidyr
+
+-   stringr
+
+-   uwot
+### Loading necessary packages
+
+```{r packages, message=F}
+# plotting
+library(ggplot2)
+library(pheatmap)
+
+# manipulation of data frames
+library(dplyr)
+library(tidyr)
+
+# manipulation of strings
+library(stringr)
+
+# for UMAP transformation
+library(uwot)
+```
+
+### Load the data
+
+Load the expression and annotation data you saved during your previous
+exercise.If you don't have them in your current working directory you can find
+them in the data folder of the course repository.
+
+```{r load, results=F, message=F}
+
+data <- readRDS("data/lapatinib_expression.rds")
+meta <- readRDS("data/lapatinib_phenotype.rds")
+```
+
+## 4) Generating barplot
+
+Let's create a barplot to see how many samples we have per tissue. First we will
+count how many samples per tissue we have in our data set and generate a list of
+colors for each one.
+
+```{r barplot_tissue, results=F, message=F}
+# Remove special characters from metadata column names
+colnames(meta) <- str_replace_all(colnames(meta), ":", ".")
+
+# Create summary table
+tissue.sum <- data.frame(table(meta$tissue.ch1))
+
+# Create ggplot item and map the axes
+ggp <- ggplot(tissue.sum, aes(x = Var1, y = Freq))
+
+# add a bar plot
+ggp + geom_col()
+
+# Change x and y labels
+ggp + geom_col() +
+  xlab("Tissue type") +
+  ylab("Frequency")
+
+# Add colors
+
+# Recreate ggplot item
+ggp <- ggplot(tissue.sum, aes(x = Var1, y = Freq, fill = Var1))
+
+ggp + geom_col() +
+  xlab("Tissue type") +
+  ylab("Frequency")
+
+# Change data order -- working with factors
+
+# Look at current factor
+tissue.sum$Var1
+
+# What happens if we sort the table and check the factor again
+tissue.sum[order(tissue.sum$Freq, decreasing = FALSE), ]$Var1
+
+# Create correct levels order
+new.levels <- as.character(tissue.sum[order(tissue.sum$Freq, decreasing = FALSE), ]$Var1)
+
+tissue.sum$Var1 <- factor(tissue.sum$Var1, new.levels)
+
+# Recreate ggplot item
+ggp <- ggplot(tissue.sum, aes(x = Var1, y = Freq, fill = Var1))
+
+ggp + geom_col() +
+  xlab("Tissue type") +
+  ylab("Frequency")
+
+########
+# TASK #
+########
+
+## Replot with the tissue types in decreasing order (largest first)
+```
+
+#Summarising data with group_by
+
+The pipeline `%>%` operator passes the object/variable/output from its left side
+as a first argument to the function on its right side
+
+```{r groups, results=F, message=F}
+
+# Select specific columns
+meta.drugcount <- meta %>% subset(select = c(tissue.ch1, drug_conc)) %>%
+  # Group by columns of interest
+  group_by(tissue.ch1, drug_conc) %>%
+  # Count number of occurences of each drug concentration by evaluating the length of the group
+  mutate(count = length(drug_conc)) %>%
+  # Remove duplicate rows
+  unique()
+
+########
+# TASK #
+########
+
+## Create meta.drugtime using the above technique and the drug_tpoint column
+```
+
+## 5) Basic statistics
+
+Let us first compute some summary statistics and plot them across samples in
+order to get a preliminary idea of our expression data.
+
+```{r descriptive_stats, results=F, message=F}
+
+# Calculate key stats of expression data
+c_mean <- colMeans(data)
+c_median <- apply(data, 2, median)
+c_std <- apply(data, 2, sd)
+
+# Create numeric values of samples (equivalent to sample names)
+rng <- 1:length(c_mean)
+
+# Put into dataframe
+sumstats <- data.frame(
+  c_mean = c_mean,
+  c_median = c_median,
+  c_std = c_std,
+  rng = rng
+)
+
+# Create geom_point plot using x as a range of sample "numbers" and y as log2(expr), adding both median and mean values
+statplot <- ggplot(sumstats, aes(x = rng, y = c_median)) +
+  # Add points for median (setting legend label to median)
+  geom_point(aes(color = "median")) +
+  # Add points for mean
+  geom_point(aes(x = rng, y = c_mean, color = "mean")) +
+  # Set y range from lower to upper std deviations
+  ylim(range(c(c_mean - c_std, c_mean + c_std))) +
+  # Change y axis label
+  ylab("log2(expr)") +
+  # Change x axis label
+  xlab("Samples")
+
+statplot
+
+# Adding error bars
+statplot +
+  geom_errorbar(aes(ymin = c_mean - c_std, ymax = c_mean + c_std), width = .2)
+```
+
+## 6) Boxplots
+
+Let's see now how this data looks using a box plot, coloring by tissue:
+
+```{r distribution2, results=F, message=F}
+
+# Box plots usually calculate their own summary statistics, so they need to be given the entire sample matrix
+# However we first need to put that into long format
+long.data <- data %>% pivot_longer(everything())
+
+# Select first four samples only (otherwise boxplot would be unreadable)
+smps <- meta$title[1:4]
+
+short <- long.data[long.data$name %in% smps, ]
+
+# Create boxplot
+ggplot(short, aes(x = name, y = value)) +
+  geom_boxplot()
+
+# Merge summary stats with meta data
+merged.meta <- merge(long.data, meta, by.x = "name", by.y = "title", all.x = TRUE)
+
+# Create boxplot
+ggplot(merged.meta, aes(x = tissue.ch1, y = value)) +
+  geom_boxplot()
+
+########
+# TASK #
+########
+
+## Create box plot showing expression statistics per drug concentration
+
+# Did it work? are you sure?
+```
+
+## 7) Histogram + density
+
+Now we select a sample and see how the expression data distribution looks like.
+To do this we can plot the histogram or the density plot (or both).
+
+```{r distrib, results=F, message=F}
+
+# Set sample that we want to look at
+sample_name <- "A498_lapatinib_10000nM_24h"
+
+# Select that sample from our data
+data.smp <- long.data[long.data$name == sample_name, ]
+
+# Create histogram of expression values with density
+ggplot(data.smp, aes(x = value)) +
+  geom_histogram(aes(y = ..density..), color = "black", fill = "grey", bins = 50) +
+  geom_density(alpha = .2, fill = "#FF6666")
+
+# Now lets do this for 4 samples and show it in a facet_wrap
+
+# Select that sample from our data (using smps variable we created before)
+data.smp <- long.data[long.data$name %in% smps, ]
+
+# Create histogram of expression values with density
+ggplot(data.smp, aes(x = value)) +
+  geom_histogram(aes(y = ..density..), color = "black", fill = "grey", bins = 50) +
+  geom_density(alpha = .2, fill = "#FF6666") +
+  facet_wrap(~name)
+```
+
+### Finished with our quality/summary stats, lets clean our environment
+
+```{r clean, results=F, message=F}
+
+rm(list = setdiff(ls(), c("data", "meta")))
+```
+
+## 8) Heatmap + hierarchical clustering
+
+Now we will plot the heatmap of our expression profiles along with the
+hierarchical clustering. On top of that, we will show the tissue of origin with
+colors (which can nicely be done in a single function in base R):
+
+```{r hmap, results=F, message=F}
+
+## Create an annotation df
+
+# Create patient name column (sapply: apply over list, '[': get element, 1: first element)
+meta$patient <- sapply(strsplit(meta$title, "_"), `[`, 1)
+
+anno <- meta %>% subset(select = c(title, patient, tissue.ch1))
+
+row.names(anno) <- anno$title
+
+anno <- anno[-1]
+
+# Create short version for quick plotting
+data.short <- data[1:500, ]
+
+# Plot data
+pheatmap(as.matrix(data.short), annotation_col = anno, show_rownames = FALSE, show_colnames = FALSE)
+```
+
+## 9) Scatter plot and correlation
+
+Melanoma samples seem to cluster together, let's see how well they correlate:
+
+```{r scatter_melanoma, results=F, message=F}
+# Find names of melanoma samples
+usecols <- meta[meta$tissue.ch1 == "Melanoma", ]$title
+
+## Plot correlation of first two samples
+# Select data for first two samples
+data.corr <- data %>% subset(select = c(usecols[1], usecols[2]))
+
+# Plot
+ggplot(data.corr, aes_string(x = usecols[1], y = usecols[2])) +
+  geom_point()
+
+# Find correlation of all samples
+cor(data[, usecols])
+
+# Find mean of correlations
+cor(data[, usecols]) %>% mean()
+```
+
+## 10) PCA on gene expression profiles of treated cell lines
+
+Next, a Principal Component Analysis (PCA) is performed on the data to assess
+the main sources of variability across the gene expression profiles. In order to
+do this, we first define a simple function that shows the most relevant
+information from the PCA (i.e. the first 2 principal components and how much
+variance each one explains).
+
+```{r, func_pca, results=F, message=F}
+
+# Perform PCA using prcomp function
+# Note: the matrix must be transposed, so samples are rows
+pca.n <- prcomp(t(data))
+
+# Generating function created by colleague that plots PCA data
+plotPCA <- function(pca, pchs = 21, colour = NULL, PCs = c("PC1", "PC2")) {
+  tmp <- summary(pca)
+  # importance: variance explained
+  PCx <- format(round(tmp$importance[2, PCs[1]] * 100, digits = 2), nsmall = 2)
+  PCy <- format(round(tmp$importance[2, PCs[2]] * 100, digits = 2), nsmall = 2)
+  x <- data.frame(pca$x)
+
+  # Merge PCs back to metadata for annotation (may need to change by.y for other metadatas)
+  merged <- x %>%
+    subset(select = PCs) %>%
+    merge(., meta, by.x = "row.names", by.y = "title")
+
+  # Plot
+  ggplot(merged, aes_string(x = PCs[1], y = PCs[2], colour = colour)) +
+    geom_point() +
+    xlab(paste(PCs[1], "(", PCx, "% var. expl.)", sep = "")) +
+    ylab(paste(PCs[2], "(", PCy, "% var. expl.)", sep = ""))
+}
+```
+
+Now let's plot it!
+
+```{r plot_pca, results=F, message=F}
+
+# Run previously generated function
+plotPCA(pca.n, PCs = c("PC1", "PC2"), colour = "tissue.ch1")
+
+########
+# TASK #
+########
+
+# Try plotting the PCA using a different annotation column such as patient
+```
+
+## 11) UMAP
+
+Uniform Manifold Approximation and Projection is a dimensionality reduction algorithm that has gained a lot
+of importance in the last years. Unlike PCA it is a method for non-linear dimensionality reduction. UMAP tries to preserve the global similarity relationship between the data points while projecting them in a lower dimensional space for visualization purposes (here in 2D).
+
+
+```{r umap, results=F, message=F}
+
+umap_data <- umap(t(data))
+
+# Bind umap values with metadata
+merged_umap <- cbind(umap_data, meta)
+
+# Add colnames for umap values
+colnames(merged_umap)[1:2] <- c("UMAP1", "UMAP2")
+
+# Plot
+ggplot(merged_umap, aes(x = UMAP1, y = UMAP2, colour = tissue.ch1)) +
+  geom_point()
+```