-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCodes_Microbiome_Informatics.R
180 lines (146 loc) · 7.75 KB
/
Codes_Microbiome_Informatics.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
library(dada2)
#Getting ready
# First set directory, where you have downloaded the raw FASTQ files
path <- "" # CHANGE the directory containing the fastq files after unzipping.
list.files(path)
# Forward and reverse fastq filenames have format: SAMPLENAME_R1_001.fastq and SAMPLENAME_R2_001.fastq
fnFs <- sort(list.files(path, pattern="_R1_001.fastq", full.names = TRUE))
fnFs
fnRs <- sort(list.files(path, pattern="_R2_001.fastq", full.names = TRUE))
fnRs
# Extract sample names, assuming filenames have format: SAMPLENAME_XXX.fastq
sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)
sample.names
#Inspect read quality profiles
#We start by visualizing the quality profiles of the forward reads
plotQualityProfile(fnFs[1:2])
#Now we visualize the quality profile of the reverse reads
plotQualityProfile(fnRs[1:2])
#Filter and trim
#Assign the filenames for the filtered fastq.gz files.
# Place filtered files in filtered/ subdirectory
filtFs <- file.path(path, "filtered", paste0(sample.names, "_F_filt.fastq.gz"))
filtRs <- file.path(path, "filtered", paste0(sample.names, "_R_filt.fastq.gz"))
names(filtFs) <- sample.names
names(filtRs) <- sample.names
filtFs
filtRs
out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen=c(240,160), maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=TRUE, compress=TRUE, multithread=FALSE) # On Windows set multithread=FALSE
head(out)
#Learn the Error Rates
errF <- learnErrors(filtFs, multithread=FALSE)
errR <- learnErrors(filtRs, multithread=FALSE)
#plotErrors(errF, nominalQ=TRUE)
#Sample Inference
dadaFs <- dada(filtFs, err=errF, multithread=FALSE)
dadaRs <- dada(filtRs, err=errR, multithread=FALSE)
dadaFs[[1]]
#Merge paired reads
mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, verbose=TRUE)
# Inspect the merger data.frame from the first sample
head(mergers[[1]])
#View(mergers$F3D0)
#Construct sequence table
seqtab <- makeSequenceTable(mergers)
# View(seqtab)
# Inspect distribution of sequence lengths
#table(nchar(getSequences(seqtab)))
#Remove chimeras
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=FALSE, verbose=TRUE)
#dim(seqtab.nochim)
#sum(seqtab.nochim)/sum(seqtab)
#Track reads through the pipeline
getN <- function(x) sum(getUniques(x))
track <- cbind(out, sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers, getN), rowSums(seqtab.nochim))
# If processing a single sample, remove the sapply calls: e.g. replace sapply(dadaFs, getN) with getN(dadaFs)
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim")
rownames(track) <- sample.names
head(track)
#Assign taxonomy
taxa <- assignTaxonomy(seqtab.nochim, "silva_nr99_v138.1_train_set.fa.gz", multithread=FALSE)
#Let’s inspect the taxonomic assignments:
View(taxa)
taxa.print <- taxa # Removing sequence rownames for display only
rownames(taxa.print) <- NULL
head(taxa.print)
#Downstream analysis
library(phyloseq)
samples.out <- rownames(seqtab.nochim)
subject <- sapply(strsplit(samples.out, "D"), `[`, 1)
gender <- substr(subject,1,1)
subject <- substr(subject,2,999)
day <- as.integer(sapply(strsplit(samples.out, "D"), `[`, 2))
samdf <- data.frame(Subject=subject, Gender=gender, Day=day)
samdf$When <- "Early"
samdf$When[samdf$Day>100] <- "Late"
rownames(samdf) <- samples.out
ps <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows=FALSE), sample_data(samdf), tax_table(taxa))
dna <- Biostrings::DNAStringSet(taxa_names(ps))
names(dna) <- taxa_names(ps)
ps <- merge_phyloseq(ps, dna)
taxa_names(ps) <- paste0("ASV", seq(ntaxa(ps)))
ps
View(ps@otu_table)
View(ps@tax_table)
View(ps@sam_data)
#Off to microeco
library(file2meco)
library(microeco)
library(ggplot2)
library(ggdendro)
library(ggpubr)
library(magrittr)
meco_object <- phyloseq2meco(ps)
meco_object$tidy_dataset()
#Filter data
meco_object$filter_pollution(taxa = c("mitochondria", "chloroplast"))
#Alpha Diversity
t1 <- trans_alpha$new(dataset = meco_object, group = "When")
t1$cal_diff(method = "wilcox")
# y_increase: increased height for each label
t1$plot_alpha(measure = "Shannon", shape = "When", y_start = 0.1, y_increase = 0.1, xtext_size = 15) + theme(axis.text.y = element_text(size = 12, vjust = 0.5, face = "bold"), axis.text.x = element_text(size = 16, face = "bold"),panel.background = element_blank(), axis.title = element_text(size = 12, face = "bold"), axis.line = element_line(colour = "black"), legend.text = element_text(size = 12))
ggsave("alpha.tiff", height = 6, width = 6, dpi = 700)
#Beta Diversity.
meco_object$cal_betadiv()
t2 <- trans_beta$new(dataset = meco_object, group = "When", measure = "bray")
# PCoA, PCA, DCA and NMDS are available
t2$cal_ordination(method = "PCoA")
# plot the PCoA result with confidence ellipse
t2$plot_ordination(plot_color = "When", plot_shape = "When", plot_type = c("point", "ellipse")) + theme(axis.text.y = element_text(size = 12, vjust = 0.5, face = "bold"), axis.text.x = element_text(size = 16, face = "bold"),panel.background = element_blank(), axis.title = element_text(size = 12, face = "bold"), axis.line = element_line(colour = "black"), legend.text = element_text(size = 12))
ggsave("beta.tiff", height = 6, width = 6, dpi = 700)
# #Clustering
# meco_object$sample_table %<>% subset(When %in% c("Early", "Late"))
# meco_object$tidy_dataset()
# t3 <- trans_beta$new(dataset = meco_object, group = "When")
# # use replace_name to set the label name, group parameter used to set the color
# t3$plot_clustering(group = "When", replace_name = c("When"))
# ggsave("cluster.tiff", height = 6, width = 6, dpi = 700)
#Microbial composition
t4 <- trans_abund$new(dataset = meco_object, taxrank = "Genus", ntaxa = 10)
t4$plot_bar(others_color = "grey70", facet = "When", xtext_keep = FALSE, legend_text_italic = TRUE)
ggsave("abundance_1.tiff", height = 6, width = 10, dpi = 700)
# The groupmean parameter can be used to obtain the group-mean barplot.
t5 <- trans_abund$new(dataset = meco_object, taxrank = "Genus", ntaxa = 10, groupmean = "When")
g1 <- t5$plot_bar(bar_full = FALSE, legend_text_italic = TRUE)+theme(axis.text.y = element_text(size = 12, vjust = 0.5, face = "bold"), axis.text.x = element_text(size = 16, face = "bold"),panel.background = element_blank(), axis.title = element_text(size = 12, face = "bold"), axis.line = element_line(colour = "black"), legend.text = element_text(size = 12))
g1
ggsave("Abundance.tiff", height = 6, width = 6, dpi = 700)
# #Heatmap
# t6 <- trans_abund$new(dataset = meco_object, taxrank = "Genus", ntaxa = 15)
# g2 <- t6$plot_heatmap(facet = "When", xtext_keep = FALSE, withmargin = FALSE, plot_breaks = c(0.01, 0.1, 1, 10))
# g2
# #Boxplot
# t7 <- trans_abund$new(dataset = meco_object, taxrank = "Genus", ntaxa = 15)
# t7$plot_box(group = "When", xtext_angle = 90) +theme(axis.text.y = element_text(size = 12, vjust = 0.5, face = "bold"), axis.text.x = element_text(size = 10, face = "bold"),panel.background = element_blank(), axis.title = element_text(size = 12, face = "bold"), axis.line = element_line(colour = "black"), legend.text = element_text(size = 12))
#Differential analysis
#LEfSE
t8 <- trans_diff$new(dataset = meco_object, method = "lefse", group = "When", alpha = 0.05, lefse_subgroup = NULL, taxa_level = "Genus", p_adjust_method = "fdr")
# see t1$res_diff for the result
# From v0.8.0, threshold is used for the LDA score selection.
g3 <- t8$plot_diff_bar(threshold = 2)
g3
ggsave("LDA.tiff", height = 6, width = 10, dpi = 700)
#BiocManager::install("ALDEx2")
#ALDEx2_kw
#t9 <- trans_diff$new(dataset = meco_object, method = "ALDEx2_kw", group = "When", filter_thres = 0.01, taxa_level = "Genus")
#t9$plot_diff_abund(add_sig = TRUE, use_number = 1:12, simplify_names = FALSE)
#ggsave("ALDEx2_kw.tiff", height = 8, width = 12, dpi = 700)