update DE Vis Advanced section to use Stringtie/DESeq2 results

griffithlab · Apr 25, 2024 · 2248ec4 · 2248ec4
1 parent 351ebbf
commit 2248ec4
Show file tree

Hide file tree

Showing 2 changed files with 141 additions and 203 deletions.
diff --git a/_posts/0003-04-01-DE_Visualization_Ballgown.md b/_posts/0003-04-01-DE_Visualization_Ballgown.md
@@ -31,7 +31,6 @@ A separate R tutorial file has been provided below. Run the R commands detailed
 First you'll need to load the libraries needed for this analysis and define a path for the output PDF to be written.
 
 ```R
-###R code###
 #load libraries
 library(ballgown)
 library(genefilter)
@@ -45,7 +44,7 @@ outfile="~/workspace/rnaseq/de/ballgown/ref_only/Tutorial_Part2_ballgown_output.
 Next we'll load our data into R.
 
 ```R
-###R code###
+
 # Generate phenotype data
 ids = c("UHR_Rep1","UHR_Rep2","UHR_Rep3","HBR_Rep1","HBR_Rep2","HBR_Rep3")
 type = c("UHR","UHR","UHR","HBR","HBR","HBR")
@@ -65,14 +64,39 @@ ls()
 
 # Print a summary of the ballgown object
 bg
-```
 
-Now we'll start to generate our figures with the following R code.
+# Load gene names for lookup later in the tutorial
+bg_table = texpr(bg, 'all')
+bg_gene_names = unique(bg_table[, 9:10])
 
-```R
-###R code###
-# Open a PDF file where we will save some plots. We will save all figures and then view the PDF at the end
-pdf(file=outfile)
+# Pull the gene and transcript expression data frame from the ballgown object
+gene_expression = as.data.frame(gexpr(bg))
+transcript_expression = as.data.frame(texpr(bg))
+
+#View expression values for the transcripts of a particular gene symbol of chromosome 22.  e.g. 'TST'
+#First determine the transcript_ids in the data.frame that match 'TST', aka. ENSG00000128311, then display only those rows of the data.frame
+i=bg_table[,"gene_id"]=="ENSG00000128311"
+bg_table[i,]
+
+# Display the transcript ID for a single row of data
+ballgown::transcriptNames(bg)[2763]
+
+# Display the gene name for a single row of data
+ballgown::geneNames(bg)[2763]
+
+#What if we want to view values for a list of genes of interest all at once?
+#genes_of_interest = c("TST", "MMP11", "LGALS2", "ISX")
+genes_of_interest = c("ENSG00000128311","ENSG00000099953","ENSG00000100079","ENSG00000175329")
+i = bg_table[,"gene_id"] %in% genes_of_interest
+bg_table[i,]
+
+# Load the transcript to gene index from the ballgown object
+transcript_gene_table = indexes(bg)$t2g
+head(transcript_gene_table)
+
+#Each row of data represents a transcript. Many of these transcripts represent the same gene. Determine the numbers of transcripts and unique genes
+length(row.names(transcript_gene_table)) #Transcript count
+length(unique(transcript_gene_table[,"g_id"])) #Unique Gene count
 
 # Extract FPKM values from the 'bg' object
 fpkm = texpr(bg,meas="FPKM")
@@ -86,39 +110,58 @@ fpkm = log2(fpkm+1)
 # View the last several rows of the transformed FPKM table
 tail(fpkm)
 
-# Create boxplots to display summary statistics for the FPKM values for each sample
-boxplot(fpkm,col=as.numeric(as.factor(pheno_data$type))+1,las=2,ylab='log2(FPKM+1)')
-
-# col=as.numeric(as.factor(pheno_data$type))+1 - set color based on pheno_data$type which is UHR vs. HBR
-# las=2 - labels are perpendicular to axis 
-# ylab='log2(FPKM+1)' - set ylab to indicate that values are log2 transformed
+```
 
-# Display the transcript ID for a single row of data
-ballgown::transcriptNames(bg)[2763]
+Now we'll start to generate figures with the following R code.
 
-# Display the gene name for a single row of data
-ballgown::geneNames(bg)[2763]
+```R
 
-# Create a BoxPlot comparing the expression of a single gene for all replicates of both conditions
-boxplot(fpkm[2763,] ~ pheno_data$type, border=c(2,3), main=paste(ballgown::geneNames(bg)[2763],' : ', ballgown::transcriptNames(bg)[2763]),pch=19, xlab="Type", ylab='log2(FPKM+1)')
+# Open a PDF file where we will save some plots. We will save all figures and then view the PDF at the end
+pdf(file=outfile)
 
-# border=c(2,3) - set border color for each of the boxplots
-# main=paste(ballgown::geneNames(bg)[2763],' : ', ballgown::transcriptNames(bg)[2763]) - set title to gene : transcript
-# xlab="Type" - set x label to Type
-# ylab='log2(FPKM+1)' - set ylab to indicate that values are log2 transformed
+#### Plot #1 - the number of transcripts per gene.
+#Many genes will have only 1 transcript, some genes will have several transcripts
+#Use the 'table()' command to count the number of times each gene symbol occurs (i.e. the # of transcripts that have each gene symbol)
+#Then use the 'hist' command to create a histogram of these counts
+#How many genes have 1 transcript?  More than one transcript?  What is the maximum number of transcripts for a single gene?
+counts=table(transcript_gene_table[,"g_id"])
+c_one = length(which(counts == 1))
+c_more_than_one = length(which(counts > 1))
+c_max = max(counts)
+hist(counts, breaks=50, col="bisque4", xlab="Transcripts per gene", main="Distribution of transcript count per gene")
+legend_text = c(paste("Genes with one transcript =", c_one), paste("Genes with more than one transcript =", c_more_than_one), paste("Max transcripts for single gene = ", c_max))
+legend("topright", legend_text, lty=NULL)
+
+#### Plot #2 - the distribution of transcript sizes as a histogram
+#In this analysis we supplied StringTie with transcript models so the lengths will be those of known transcripts
+#However, if we had used a de novo transcript discovery mode, this step would give us some idea of how well transcripts were being assembled
+#If we had a low coverage library, or other problems, we might get short 'transcripts' that are actually only pieces of real transcripts
+
+hist(bg_table$length, breaks=50, xlab="Transcript length (bp)", main="Distribution of transcript lengths", col="steelblue")
+
+#### Plot #3 - distribution of gene expression levels for each sample
+# Create boxplots to display summary statistics for the FPKM values for each sample
+# set color based on pheno_data$type which is UHR vs. HBR
+# set labels perpendicular to axis (las=2)
+# set ylab to indicate that values are log2 transformed
+boxplot(fpkm,col=as.numeric(as.factor(pheno_data$type))+1,las=2,ylab='log2(FPKM+1)')
 
+#### Plot 4 - BoxPlot comparing the expression of a single gene for all replicates of both conditions
+# set border color for each of the boxplots
+# set title (main) to gene : transcript
+# set x label to Type
+# set ylab to indicate that values are log2 transformed
+boxplot(fpkm[2763,] ~ pheno_data$type, border=c(2,3), main=paste(ballgown::geneNames(bg)[2763],' : ', ballgown::transcriptNames(bg)[2763]),pch=19, xlab="Type", ylab='log2(FPKM+1)')
 
 # Add the FPKM values for each sample onto the plot
+# set plot symbol to solid circle, default is empty circle
 points(fpkm[2763,] ~ jitter(c(2,2,2,1,1,1)), col=c(2,2,2,1,1,1)+1, pch=16)
-# pch=16 - set plot symbol to solid circle, default is empty circle
 
 
-# Create a plot of transcript structures observed in each replicate and color transcripts by expression level
+#### Plot 5 - Plot of transcript structures observed in each replicate and color transcripts by expression level
 plotTranscripts(ballgown::geneIDs(bg)[2763], bg, main=c('TST in all HBR samples'), sample=c('HBR_Rep1', 'HBR_Rep2', 'HBR_Rep3'), labelTranscripts=TRUE)
 plotTranscripts(ballgown::geneIDs(bg)[2763], bg, main=c('TST in all UHR samples'), sample=c('UHR_Rep1', 'UHR_Rep2', 'UHR_Rep3'), labelTranscripts=TRUE)
 
-#plotMeans('TST',bg,groupvar="type",legend=FALSE)
-
 # Close the PDF device where we have been saving our plots
 dev.off()
 
@@ -130,5 +173,3 @@ Remember that you can view the output graphs of this step on your instance by na
 
 http://**YOUR_PUBLIC_IPv4_ADDRESS**/rnaseq/de/ballgown/ref_only/Tutorial_Part2_ballgown_output.pdf
 
-The above code can be found in a single R script located [here](https://github.com/griffithlab/rnabio.org/blob/master/assets/scripts/Tutorial_Part2_ballgown.R).
-