diffExp.Rmd

---
title: "Ant1-KO Differential Expression"
author: "Jeremy Leipzig"
date: "2/16/2016"
output:
  html_document:
    toc: true
---

```{r libs, echo=FALSE, message=FALSE, cache=FALSE,  fig.path='{{SLINK}}/'}
library(Rsamtools)
library(Biostrings)
library(stringr)
library(tidyr)
library(reshape2)
library(DESeq)
library(ggplot2)
source("/nas/is1/leipzig/ganguly/src/R/concat.R")
library(org.Mm.eg.db)
library(biomaRt)
library(DT)
library(xlsx)
library(dplyr)
library(knitr)
library(printr)
library(VennDiagram)
library(UpSetR)
```


```{r setup, echo=FALSE, message=FALSE, cache=FALSE,  fig.path='{{SLINK}}/'}
# Capture the session details up front; the report prints the DESeq version
# from this object inline.
sessInfo <- sessionInfo()

# Labels for the aligner and the reference genome.
aligner <- "STAR"
reference <- "Ensembl/GRCm38"

# Restore previously saved workspace objects.
# NOTE(review): presumably this supplies `samples`, STARLOGS, ERCC_COUNTS and
# the other upper-case names used below, none of which are defined in this
# file - confirm.
load("diffExp.state.RData")

# Read one sample's count table and return it as a one-column data frame.
#
# sampleName: sample identifier; it is used both to build the file path
#             (via concat(), from "counts/", the name and ".tsv") and as the
#             name of the count column.
# Returns a data frame whose row names are the values of the first file
# column (txid) and whose only column holds the second file column.
fetchCounts <- function(sampleName) {
  countFile <- concat("counts/", sampleName, ".tsv")
  sampleTable <- read.table(countFile, col.names = c("txid", sampleName))
  rownames(sampleTable) <- sampleTable$txid
  # Drop the id column by position; list-style indexing keeps a data frame.
  sampleTable[-1]
}

# Merge the per-sample tables into a single data frame keyed on the row names
# (feature ids). all = TRUE keeps ids that are missing from some samples; those
# rows carry NA and are later removed by the minCount filter.
counts <- Reduce(function(a, b) {
  ans <- merge(a, b, by = "row.names", all = TRUE)
  row.names(ans) <- ans[, "Row.names"]
  ans[, !names(ans) %in% "Row.names"]
}, lapply(samples, fetchCounts))

# Remove the summary rows that are not features.
counts <- counts[!(rownames(counts) %in% c("alignment_not_unique", "ambiguous")), ]

# Display labels, four replicates per group, in the order
# MUSCLE_KO  MUSCLE_WT HEART_KO HEART_WT  (ANT1 = KO, B6ME = WT).
# NOTE(review): labels are assigned by position, so this assumes `samples` is
# ordered muscle KO, muscle WT, heart KO, heart WT - confirm against the
# pipeline that writes diffExp.state.RData.
sampleLabels <- c(
  concat("MUS_ANT_", 1:4),
  concat("MUS_B6ME_", 1:4),
  concat("HRT_ANT_", 1:4),
  concat("HRT_B6ME_", 1:4)
)
# Fail loudly on a column/label count mismatch instead of mislabelling samples.
stopifnot(ncol(counts) == length(sampleLabels))
names(counts) <- sampleLabels
conds <- c(rep("MT", 4), rep("MN", 4), rep("HT", 4), rep("HN", 4))

# One row per sample, carrying the condition code.
pdata <- data.frame(condition = conds)
rownames(pdata) <- sampleLabels

# Keep features whose count summed over all samples exceeds minCount.
minCount <- 100
countsAboveThreshold <- subset(counts, rowSums(counts) > minCount)
#subset(counts,rowSums(counts[-1])>minCount)

# as.matrix() carries the row names over from the data frame.
countsMatrix <- as.matrix(countsAboveThreshold)

# STAR mapping statistics, reordered to the same group order as the labels.
starstatsraw <- read.table(STARLOGS, sep = "\t", header = TRUE,
                           colClasses = c("character", "integer", "character", "character"))
row.names(starstatsraw) <- starstatsraw$Sample
starstats <- starstatsraw[c(MUSCLE_KO, MUSCLE_WT, HEART_KO, HEART_WT), ]
starstats$Sample <- concat(str_sub(starstats$Sample, 1, 16), "...")
starstats$cond <- conds

cds <- newCountDataSet(countsMatrix, pdata$condition)

ercc.counts <- read.table(ERCC_COUNTS, col.names = c("sample", "count"), stringsAsFactors = FALSE)

# The ERCC totals are applied to the samples by position, so both the number
# of samples and their order must match exactly. A bare `==` would recycle a
# shorter vector and could pass by accident.
stopifnot(
  length(samples) == nrow(ercc.counts),
  all(samples == ercc.counts$sample)
)
sizeFactors(cds) <- ercc.counts$count

write.table(counts(cds, normalized = FALSE), file = RAW_COUNTS, sep = "\t", quote = FALSE)
write.table(counts(cds, normalized = TRUE), file = NORM_COUNTS, sep = "\t", quote = FALSE)

#not going to estimate if using ercc
#cds <- estimateSizeFactors( cds )

#fits: parametric or local
#methods: pooled per-condition
#sharingMode="fit-only"
#gene-est-only maximum
sharingMode <- "maximum"
method <- "per-condition"
fits <- "local"
# Pass the fit type through `fits` so the setting lives in one place.
cds <- estimateDispersions(cds, fitType = fits, method = method, sharingMode = sharingMode)
save(cds, file = CDS_FILE, compress = TRUE)

# Adjusted p-value threshold used for the significance tables.
myPval <- .01
```

```{r conductTest, echo=FALSE, message=FALSE, cache=FALSE,  fig.path='{{SLINK}}/'}
# In preparation for the join: remember the existing table goes on the right.
# Muscle contrast, condition "MN" against "MT". The id column is renamed so it
# can be joined to the annotation by ensembl_gene_id.
# Infinite log2 fold changes cause display problems, so -Inf / Inf are
# replaced with finite stand-ins (log2 of 1e-6 and of 1e6).
testres <- nbinomTest(cds, "MN", "MT") %>%
  rename(ensembl_gene_id = id) %>%
  mutate(log2FoldChange = ifelse(log2FoldChange == -Inf, log2(0.000001), log2FoldChange)) %>%
  mutate(log2FoldChange = ifelse(log2FoldChange == Inf, log2(1000000), log2FoldChange))

# Output target and tissue label consumed by the shared annotation chunk.
resfile <- MUSCLE_RES
tissue <- "muscle"
```

```{r annosetup, echo=FALSE, message=FALSE, cache=FALSE,  fig.path='{{SLINK}}/'}
# biomaRt vignette:
#   http://www.bioconductor.org/packages/release/bioc/vignettes/biomaRt/inst/doc/biomaRt.pdf
# Earlier connection, kept for reference:
#   mart <- useMart(biomart = "ensembl", dataset = "mmusculus_gene_ensembl")
# Background on the archive host: https://support.bioconductor.org/p/74304/
mart <- useMart(
  biomart = "ENSEMBL_MART_ENSEMBL",
  dataset = "mmusculus_gene_ensembl",
  host = "jul2015.archive.ensembl.org"
)

# Gene name and Entrez id for every tested Ensembl gene id.
anno_entrez <- getBM(
  attributes = c("ensembl_gene_id", "external_gene_name", "entrezgene"),
  filters = "ensembl_gene_id",
  values = testres$ensembl_gene_id,
  mart = mart
)

# Human mart from the same archive, used for the homolog lookup.
human <- useMart(
  biomart = "ENSEMBL_MART_ENSEMBL",
  dataset = "hsapiens_gene_ensembl",
  host = "jul2015.archive.ensembl.org"
)

# Cardiomyopathy table: load() returns the stored object's name, get() fetches it.
cmyo_genes <- get(load("cardiomyopathy.RData")) %>%
  rename(entrez.human = c2.geneId, cardiomyopathy_score = c0.score)

# Gene list from mseqdr.txt, with an HTML link built per gene symbol.
mitogenes <- read.table("mseqdr.txt", stringsAsFactors = FALSE) %>%
  rename(UPGENE = V1) %>%
  mutate(MSeqDR = paste0("<a href=\"https://mseqdr.org/MITO_backend/genes/", UPGENE, "\">", UPGENE, "</a>"))
```

```{r annotate, echo=FALSE, message=FALSE, cache=FALSE,  fig.path='{{SLINK}}/'}
# Linked query: mouse Entrez ids to the Entrez ids returned by the human mart.
homologs <- getLDS(
  attributes = c("entrezgene"),
  filters = "entrezgene",
  values = anno_entrez$entrezgene,
  mart = mart,
  attributesL = c("entrezgene"),
  martL = human
) %>%
  rename(entrezgene = EntrezGene.ID, entrez.human = EntrezGene.ID.1)

# Join the annotation to the cardiomyopathy table through the homologs, attach
# the test results (right join keeps every tested gene), add the MSeqDR link by
# upper-cased gene name, and keep the first row per Ensembl gene id.
# The result stays grouped by ensembl_gene_id.
res <- anno_entrez %>%
  join(homologs, by = "entrezgene", type = "left", match = "first") %>%
  join(cmyo_genes, by = "entrez.human", type = "left", match = "first") %>%
  join(testres, by = "ensembl_gene_id", type = "right") %>%
  mutate(UPGENE = toupper(external_gene_name)) %>%
  left_join(mitogenes, by = "UPGENE") %>%
  select(-UPGENE) %>%
  group_by(ensembl_gene_id) %>%
  filter(row_number() == 1) %>%
  rename(gene = external_gene_name)

# Persist the annotated results as CSV and as an RData file next to it.
write.csv(res, file = resfile)
save(res, file = paste0(resfile, ".RData"), compress = TRUE)

# GeneGo convenience export: named genes with a defined log2 fold change.
genego <- res %>%
  filter(!is.na(gene) & !is.na(log2FoldChange)) %>%
  ungroup() %>%
  select(gene, log2FoldChange, padj)
write.xlsx(data.frame(genego), file = paste(tissue, "genego.xls", sep = "."), row.names = FALSE)
```

# Introduction
## Samples
Below are the results of the STAR mapping. Condition codes: M for Muscle, H for Heart, T for Treatment (ANT1), N for Normal (B6ME).
```{r star, echo=FALSE, message=FALSE, cache=FALSE,  fig.path='{{SLINK}}/'}
# Interactive table of the per-sample STAR mapping statistics.
datatable(starstats, caption = "STAR mapping results")
```

# Alignment
After trimming, transcript counts were generated by a STAR alignment to genomic sequences.

The STAR command used for the samples is as follows:
```
{STAR} --genomeDir {STARREFDIR} --outFileNamePrefix {wildcards.sample}_ --readFilesIn {input} --runThreadN 24 
--genomeLoad NoSharedMemory --outSAMattributes All --outSAMstrandField intronMotif --sjdbGTFfile {GTFFILE}
```

## Normalization
ERCC spike-ins were downloaded from [here](http://tools.invitrogen.com/downloads/ERCC92.fa) and used as a reference. Total counts from all 92 sequences are below. These are used for the *sizeFactors* in DESeq such that they become divisors for each sample count.
```{r ercc, echo=FALSE, message=FALSE, cache=FALSE,  fig.path='{{SLINK}}/'}
# Interactive table of the per-sample ERCC totals used as size factors.
datatable(ercc.counts, caption = "Total ERCC read counts")
```


# DESeq
A reanalysis of the non-normalized count data was performed using the
R-based RNA-Seq analysis package DESeq [Anders](http://dx.doi.org/10.1038/npre.2010.4282.2) in order to
detect differential expression between normal and treatment.

This report uses DESeq version `r sessInfo$otherPkgs$DESeq$Version`.

DESeq offers different statistical models and parameters to choose from depending on the data to be analyzed.

Empirical dispersion can be computed using a *pooled*, *per-condition*, or *blind* (no replicates) method.

The following sharing modes, which determine how much information from other genes is used to inform each gene's dispersion estimate, are available:

* `fit-only` is appropriate for only a few replicates
* `maximum` is more conservative for three or four replicates, 
* `gene-est-only` is more aggressive, and is ideal for many replicates

For this report, we have used the `r method` method and `r sharingMode` sharing mode.

Only genes with a combined count greater than `r minCount` across all samples (both conditions and tissues) were retained for testing.
`r nrow(countsAboveThreshold)` genes fit this criterion.

# Muscle results
## Significantly DE genes in muscle
Significantly differentially expressed genes (padj<`r myPval`). Fold change is ANT1/B6ME.

The `cmyo` column is the cardiomyopathy involvement score from http://www.disgenet.org.

The `MSeqDR` column will provide a link if the gene in question is one of 1363 MSeqDR mitochondrial genes.

```{r dePval, echo=FALSE, message=FALSE, cache=FALSE,  fig.path='{{SLINK}}/'}
      
# Columns of `res` consumed here:
#   ensembl_gene_id, gene, cardiomyopathy_score, MSeqDR,
#   baseMean, baseMeanA, baseMeanB, foldChange, log2FoldChange, pval, padj
# (`res` also carries entrez.human and entrezgene.)
# Example row:
#   ENSMUSG00000000001  Gnai3  baseMean 9.237222e-06  foldChange 0.3740867
#   log2FoldChange -1.4185553  pval 1.365943e-07  padj 5.695712e-06

# Shorten the column names for display, keep rows below the adjusted p-value
# threshold, and order by raw p-value.
resSig <- res %>%
  select(
    gene,
    ensid = ensembl_gene_id,
    baseMean,
    bNorm = baseMeanA,
    bRB = baseMeanB,
    fc = foldChange,
    log2fc = log2FoldChange,
    pval,
    padj,
    cmyo = cardiomyopathy_score,
    MSeqDR
  ) %>%
  filter(padj < myPval) %>%
  arrange(pval)

# escape = FALSE so the MSeqDR column renders as HTML links.
datatable(
  resSig,
  escape = FALSE,
  caption = paste("Significant (P<", myPval, ") differential expression in ANT1 after the correction for multiple testing.", nrow(resSig), "genes.")
)
```

## Upregulated genes in muscle
```{r upreg, echo=FALSE, message=FALSE, cache=FALSE,  fig.path='{{SLINK}}/'}
# Significant genes with fold change above 1, largest fold change first,
# capped at 50 rows.
upCandidates <- subset(resSig, fc > 1)
upRegs <- head(upCandidates[order(-upCandidates$fc), ], n = 50)

if (nrow(upRegs) == 0) {
  print("No significantly upregulated genes")
} else {
  datatable(upRegs, escape = FALSE, caption = "Strongly significantly upregulated genes in ANT1 (top 50)")
}
```


## Downregulated genes in muscle
```{r downreg, echo=FALSE, message=FALSE, cache=FALSE,  fig.path='{{SLINK}}/'}
# Significant genes with fold change of at most 1, smallest fold change first,
# capped at 50 rows.
downCandidates <- subset(resSig, fc <= 1)
downRegs <- head(downCandidates[order(downCandidates$fc), ], n = 50)

if (nrow(downRegs) == 0) {
  print("No significantly downregulated genes")
} else {
  datatable(downRegs, escape = FALSE, caption = "Strongly significantly downregulated genes in ANT1 (top 50)")
}
```

## Log2 Fold change and significance muscle
Genes with high fold change may not be significantly differentially expressed simply due to high variance.

```{r fcVsBasemeanmuscle, echo=FALSE, message=FALSE, cache=FALSE,  fig.path='{{SLINK}}/'}
# Log2 fold change against log2 mean expression, one point per gene, coloured
# by whether the adjusted p-value is below the report-wide threshold myPval
# (previously a hard-coded .01 that could drift from the tables).
# The legend title is set explicitly so it shows the numeric threshold.
ggplot(res, aes(x = log2(baseMean), y = log2FoldChange)) +
  geom_point(aes(color = padj < myPval)) +
  labs(color = paste0("padj < ", myPval))
```

# Heart results
## Significantly DE genes in heart
```{r deheart, echo=FALSE, message=FALSE, cache=FALSE,  fig.path='{{SLINK}}/'}
# Heart contrast, condition "HN" against "HT"; same post-processing as the
# muscle test: rename the id column for the annotation join and replace
# -Inf / Inf log2 fold changes with finite stand-ins.
testres <- nbinomTest(cds, "HN", "HT") %>%
  rename(ensembl_gene_id = id) %>%
  mutate(log2FoldChange = ifelse(log2FoldChange == -Inf, log2(0.000001), log2FoldChange)) %>%
  mutate(log2FoldChange = ifelse(log2FoldChange == Inf, log2(1000000), log2FoldChange))

# Output target and tissue label for the re-used annotation chunk.
resfile <- HEART_RES
tissue <- "heart"
```

```{r heart_anno, ref.label='annotate', echo=FALSE, message=FALSE, cache=FALSE,  fig.path='{{SLINK}}/'}
```

```{r heart_depval, ref.label='dePval', echo=FALSE, message=FALSE, cache=FALSE,  fig.path='{{SLINK}}/'}
```

## Upregulated genes in heart
```{r upheart, ref.label='upreg', echo=FALSE, message=FALSE, cache=FALSE,  fig.path='{{SLINK}}/'}
```

## Downregulated genes in heart
```{r downheart, ref.label='downreg', echo=FALSE, message=FALSE, cache=FALSE,  fig.path='{{SLINK}}/'}
```

## Log2 Fold Change and Significance heart
Genes with high log2 fold change may not be significantly differentially expressed simply due to high variance.

```{r fcVsBasemeanheart, echo=FALSE, message=FALSE, cache=FALSE, warning=FALSE, fig.path='{{SLINK}}/'}
# Log2 fold change against log2 mean expression for the heart results.
# Written with ggplot() like the muscle plot (qplot() is deprecated in current
# ggplot2) and coloured by the report-wide threshold myPval rather than a
# hard-coded .01. At this point `res` holds the heart results produced by the
# re-run annotation chunk.
ggplot(res, aes(x = log2(baseMean), y = log2FoldChange)) +
  geom_point(aes(color = padj < myPval)) +
  labs(color = paste0("padj < ", myPval))
```

# Euclidean distances
```{r euclidheat, echo=FALSE, message=FALSE, cache=FALSE,  fig.path='{{SLINK}}/'}
# Pairwise distances between samples. Samples are the columns of the count
# matrix, so it is transposed before dist(), which compares rows.
# NOTE(review): counts(cds) is called without normalized = TRUE, so these are
# presumably raw counts - confirm that is intended.
dists <- dist(t(counts(cds)))

# Heatmap of the full sample-by-sample distance matrix.
heatmap(as.matrix(dists))
```

# Comparing Heart and Muscle
```{r hvm, echo=FALSE, message=FALSE, cache=FALSE,  fig.path='{{SLINK}}/'}
# Finite stand-in magnitude for infinite fold changes.
extremeval <- 1000

# Reload the saved per-tissue results, drop genes without a fold change and
# replace infinite log2 fold changes with +/- log2(extremeval).
# Note: load() restores the stored object under its saved name as a side effect.
muscle_results <- get(load("muscleResults.csv.RData")) %>%
  filter(!is.na(log2FoldChange)) %>%
  mutate(log2FoldChange = ifelse(log2FoldChange == -Inf, log2(1 / extremeval), log2FoldChange)) %>%
  mutate(log2FoldChange = ifelse(log2FoldChange == Inf, log2(extremeval), log2FoldChange))

heart_results <- get(load("heartResults.csv.RData")) %>%
  filter(!is.na(log2FoldChange)) %>%
  mutate(log2FoldChange = ifelse(log2FoldChange == -Inf, log2(1 / extremeval), log2FoldChange)) %>%
  mutate(log2FoldChange = ifelse(log2FoldChange == Inf, log2(extremeval), log2FoldChange))

# One row per gene present in both tissues: .x columns come from heart, .y
# from muscle. Significance is padj < 0.05 per tissue; direction is the sign
# of that tissue's own log2 fold change.
hvsm <- merge(heart_results, muscle_results, by = "ensembl_gene_id") %>%
  select(ensembl_gene_id, log2FoldChange.x, log2FoldChange.y, padj.x, padj.y) %>%
  mutate(heartsig = padj.x < 0.05) %>%
  mutate(musclesig = padj.y < 0.05) %>%
  rename(heart = log2FoldChange.x, muscle = log2FoldChange.y) %>%
  mutate(sig = ifelse(heartsig,
                      ifelse(musclesig, "both", "heartonly"),
                      ifelse(musclesig, "muscleonly", "neither"))) %>%
  mutate(
    heartdir = ifelse(heartsig, ifelse(heart > 0, "Up", "Down"), "Unchanged"),
    # Fixed: the muscle direction was previously derived from the heart fold
    # change, so genes were labelled with the wrong tissue's direction.
    muscledir = ifelse(musclesig, ifelse(muscle > 0, "Up", "Down"), "Unchanged")
  )

# 0/1 membership matrix for the UpSet plot: one row per gene, one column per
# tissue_direction combination.
binarymatrix <- hvsm %>%
  select(ensembl_gene_id, heartdir, muscledir) %>%
  rename(gene = ensembl_gene_id, Heart = heartdir, Muscle = muscledir) %>%
  gather(tissue, movestate, c(Heart, Muscle)) %>%
  dcast(gene ~ tissue + movestate, function(x) as.integer(1), fill = as.integer(0))
binarymatrix$gene <- as.factor(binarymatrix$gene)
```

### Contingency table
Significantly differentially expressed genes after adjustment for multiple comparisons (padj<0.05). Heart is on the left; muscle is on the top.

```{r ctkvm, echo=FALSE, message=FALSE, cache=FALSE,  fig.path='{{SLINK}}/'}
# Cross-tabulate direction of change: heart in rows, muscle in columns.
with(hvsm, table(heart = heartdir, muscle = muscledir))
```


### Proportion table: significantly differentially expressed genes after adjustment for multiple comparisons (padj<0.05), as percentages of all genes. Heart is on the left; muscle is on the top.

```{r prptkvm, echo=FALSE, message=FALSE, cache=FALSE,  fig.path='{{SLINK}}/'}
# Same cross-tabulation as a share of all genes, in percent, to two decimals.
round(100 * prop.table(with(hvsm, table(heart = heartdir, muscle = muscledir))), 2)
```

### UpSet Plot
All 9 intersections are shown in this innovative UpSet set visualization technique published by Lex, Gehlenborg, et al.
```{r upset, echo=FALSE, message=FALSE, cache=FALSE,  fig.path='{{SLINK}}/'}
 # The nine heart x muscle direction pairs, in the order they are passed to upset().
 intersections <- list(
   list("Heart_Up", "Muscle_Down"),
   list("Heart_Up", "Muscle_Unchanged"),
   list("Heart_Up", "Muscle_Up"),
   list("Heart_Down", "Muscle_Up"),
   list("Heart_Down", "Muscle_Unchanged"),
   list("Heart_Down", "Muscle_Down"),
   list("Heart_Unchanged", "Muscle_Up"),
   list("Heart_Unchanged", "Muscle_Unchanged"),
   list("Heart_Unchanged", "Muscle_Down")
 )

 # Set columns of binarymatrix, listed muscle first and heart last.
 sets <- c("Muscle_Down", "Muscle_Unchanged", "Muscle_Up",
           "Heart_Down", "Heart_Unchanged", "Heart_Up")

 # Build one active `intersects` query that colours a heart/muscle pair.
 colorQuery <- function(heartSet, muscleSet, color) {
   list(query = intersects, params = list(heartSet, muscleSet), color = color, active = TRUE)
 }

 # Colour scheme: both up red, both down blue, opposite directions black,
 # one tissue up orange, one tissue down purple, nothing changed grey.
 queries <- list(
   colorQuery("Heart_Up", "Muscle_Up", "red"),
   colorQuery("Heart_Up", "Muscle_Down", "black"),
   colorQuery("Heart_Up", "Muscle_Unchanged", "orange"),
   colorQuery("Heart_Down", "Muscle_Down", "blue"),
   colorQuery("Heart_Down", "Muscle_Up", "black"),
   colorQuery("Heart_Down", "Muscle_Unchanged", "purple"),
   colorQuery("Heart_Unchanged", "Muscle_Up", "orange"),
   colorQuery("Heart_Unchanged", "Muscle_Down", "purple"),
   colorQuery("Heart_Unchanged", "Muscle_Unchanged", "grey")
 )

 upset(
   binarymatrix,
   sets = sets,
   intersections = intersections,
   order.by = c("degree", "freq"),
   queries = queries
 )


```

# Session info
The R session information (including the OS info, R version and all
packages used):

```{r session, results='markup', echo=FALSE, message=FALSE, cache=FALSE,  fig.path='{{SLINK}}/'}
# Print the session details for reproducibility (chunk uses results='markup').
sessionInfo()
# Timestamp of when this report was rendered.
Sys.time()
```

```{r git, results='hide', echo=FALSE, message=FALSE, cache=FALSE,  fig.path='{{SLINK}}/'}
# Record the commit of the analysis code so the report can be traced to a
# revision. system(intern = TRUE) raises an R error when the command cannot be
# run at all, and returns an empty vector when git itself fails (for example
# outside a repository). Neither should abort an otherwise finished report, so
# fall back to a placeholder.
commit <- tryCatch(
  system("git rev-parse --verify HEAD", intern = TRUE),
  error = function(e) character(0)
)
if (length(commit) == 0) {
  commit <- "unknown"
}
```

Git Commit:
`r commit`