-
Notifications
You must be signed in to change notification settings - Fork 1
/
runDada.R
127 lines (95 loc) · 5.57 KB
/
runDada.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
library(dada2)
library(dplyr)
library(assertthat)
library(stringr)
library(readxl)
library(ggplot2)
library(ggbeeswarm)
library(ComplexHeatmap)
library(circlize)
library(phyloseq)
library(phangorn)
library(msa)
#https://benjjneb.github.io/dada2/tutorial.html
read.table("metadata/SRP141397.metadata",sep="\t",header = TRUE) %>%
dplyr::filter(str_detect(experiment_title,'16S')) %>% dplyr::mutate(full1=paste0(paste("raw",study_accession,experiment_accession,run_accession,sep="/"),"_1.fastq")) %>%
dplyr::mutate(full2=paste0(paste("raw",study_accession,experiment_accession,run_accession,sep="/"),"_2.fastq")) %>%
dplyr::mutate(pair1=paste0(paste("intermediates","fastq",run_accession,sep="/"),"_1.fastq.gz")) %>%
dplyr::mutate(pair2=paste0(paste("intermediates","fastq",run_accession,sep="/"),"_2.fastq.gz"))->
sra_metadata
sample.names <- sra_metadata$experiment_title
filtFs <- file.path("intermediates/filtered", paste0(sample.names, "_F_filt.fastq.gz"))
filtRs <- file.path("intermediates/filtered", paste0(sample.names, "_R_filt.fastq.gz"))
names(filtFs) <- sample.names
names(filtRs) <- sample.names
errF <- learnErrors(filtFs, multithread=TRUE)
#105662880 total bases in 440262 reads from 31 samples will be used for learning the error rates
errR <- learnErrors(filtRs, multithread=TRUE)
dadaFs <- dada(filtFs, err=errF, multithread=TRUE)
dadaRs <- dada(filtRs, err=errR, multithread=TRUE)
#merge overlapping pairs?
mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, verbose=TRUE)
#construct sequence table
seqtab.chim <- makeSequenceTable(mergers)
#remove chimeras
seqtab.nochim <- removeBimeraDenovo(seqtab.chim, method="consensus", multithread=TRUE, verbose=TRUE)
noChimeras<-TRUE
if(noChimeras){
seqtab<-seqtab.nochim
}else{
seqtab<-seqtab.chim
}
trseqtab<-t(seqtab)
#assign taxonomy to nochim
taxa <- assignTaxonomy(seqtab, "SILVA_DADA/silva_nr_v123_train_set.fa.gz", multithread=TRUE)
trseqtabdfnoseq<-as.data.frame(trseqtab)
trseqtabdf$seq<-row.names(trseqtabdfnoseq)
abundance<-as.data.frame(colSums(trseqtabdfnoseq))
#this has experimental metadata
tabl11<-read_excel("metadata/table1.xls",sheet = 11)
names(tabl11)<-c("SampleID","Group","Type","Case_Control","Delivery","numReads","nonhost_shotgun_reads")
tabl11$SampleID<-str_replace(tabl11$SampleID,'16S GeneBlock ','POS.control')
tabl11$Type<-factor(tabl11$Type,levels=c("Air Swab","Blank","H2O","Placenta Fetal Side Delivery Biopsy","Placenta Maternal Side Delivery Biopsy","Positive Control (Gene Block)","Positive Control (Shotgun))","Maternal Saliva Microbiome Enrollment","Vaginal Swab Microbiome Enrollment"))
tabl11$Case_Control<-factor(tabl11$Case_Control,levels=c("Preterm","Term","n/a"))
tabl11$Delivery<-factor(tabl11$Delivery,levels=c("SVD","C-Section","n/a"))
tabl11 %>% arrange(Type,Case_Control,Delivery)->meta.sorted
#useful for qiime
meta.sorted %>%
mutate(Type=str_replace_all(pattern = '\\(',replacement='',Type)) %>%
mutate(Type=str_replace_all(pattern = '\\)',replacement='',Type)) %>%
mutate(Description=paste(Type,Case_Control,Delivery,sep="__")) %>%
mutate(Description=str_replace_all(pattern = ' ',replacement = '_',string = Description)) %>%
mutate(BarcodeSequence="") %>%
mutate(LinkerPrimerSequence = "") %>%
select(SampleID,BarcodeSequence,LinkerPrimerSequence,Type,Case_Control,Delivery,Description) %>% filter(SampleID != 'Shotgun Positive Control') %>%
dplyr::rename("#SampleID"="SampleID") -> qiimetable
write.table(qiimetable,"intermediates/mapfile.txt",quote=FALSE,sep="\t",row.names=FALSE)
abundance$sample<-str_replace(row.names(abundance),'_16S','')
names(abundance)<-c("dadareads","SampleID")
abundance<-merge(abundance,meta.sorted,by="SampleID")
abundanceMelted<-reshape2::melt(abundance)
ggplot(abundanceMelted,aes(Type,value))+geom_beeswarm()+theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+facet_wrap("variable")
taxadf<-as.data.frame(taxa)
taxadf$seq<-row.names(taxadf)
trseqtabdf %>% merge(taxadf,by="seq") %>% group_by(Phylum,Genus) %>% select(-c(seq,Kingdom,Class,Order,Family)) %>% summarize_all(sum) -> genusCnts
genusCntsNrml<-data.frame(Phylum=genusCnts$Phylum,Genus=genusCnts$Genus,sweep(genusCnts[,-c(1,2)], 2, colSums(genusCnts[,-c(1,2)]), FUN ="/" ))
colnames(genusCntsNrml)<-str_replace(colnames(genusCntsNrml),'_16S','')
keepRows<-rowSums(genusCntsNrml[,-c(1,2)],na.rm = TRUE)>0.001
subset(genusCntsNrml,keepRows)->genusCntsNrmlAboveThreshold
cols<-meta.sorted[sort(sample(1:nrow(meta.sorted),50)),"SampleID"] %>% dplyr::pull('SampleID')
Type<-meta.sorted[match(colnames(genusCntsNrmlAboveThreshold[cols]),meta.sorted$SampleID),"Type"] %>% dplyr::pull('Type')
Delivery<-meta.sorted[match(colnames(genusCntsNrmlAboveThreshold[cols]),meta.sorted$SampleID),"Delivery"] %>% dplyr::pull('Delivery')
Term<-meta.sorted[match(colnames(genusCntsNrmlAboveThreshold[cols]),meta.sorted$SampleID),"Case_Control"] %>% dplyr::pull('Case_Control')
set.seed(3)
col_fun = colorRamp2(c(0, 0.01, .3, 1), c("white", "blue", "green", "red"))
col_fun(seq(0, 1))
ha = HeatmapAnnotation(Type = Type, Delivery=Delivery,Term=Term,annotation_name_side = "left")
Heatmap(as.matrix(genusCntsNrmlAboveThreshold[,cols]),
col=col_fun,
top_annotation = ha,
cluster_columns = FALSE,
cluster_rows = FALSE,
show_row_dend = FALSE) +
Heatmap(genusCntsNrmlAboveThreshold[,"Phylum"], name = "Phylum",
top_annotation = HeatmapAnnotation(summary = anno_summary(height = unit(2, "cm"))),
width = unit(15, "mm"))