Skip to content

Commit 2568306

Browse files
committed
V0.1 - Preliminary analysis at the submission date of DFG grant proposal
1 parent 8b1dcc9 commit 2568306

5 files changed

+62
-10361
lines changed

R/Hyena_first_rough.R

+56-9
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ OTUs <- do.call(rbind, OTU.all)
4040
rownames(OTUs) <- OTUs$OTUID
4141
OTUs$OTUID <- NULL
4242

43+
sum(OTUs)
44+
4345
devSVG("figures/Hyena_OTU_heat.svg", width=14, height=14)
4446
pheatmap(log10(OTUs+1),
4547
show_rownames=FALSE,
@@ -67,7 +69,39 @@ pheatmap(log10(OTUs+1),
6769
dev.off()
6870

6971

72+
## a crude background reduction: set all counts below an outlier-detection threshold to zero
73+
## http://stats.stackexchange.com/questions/56402/detecting-outliers-in-count-data
74+
## Return the positions in x whose robust z-score (on the log10 scale)
## is not above the 5% quantile of the scores of all counts greater
## than one.
##
## x: a vector (or one-row data frame) of read counts; coerced with
##    as.numeric().
## Returns an integer vector of indices into x (possibly empty), meant
## to be used for setting those counts to zero.
out.z <- function(x){
    trans <- log10(as.numeric(x))
    ## Only counts > 1 (log10 > 0) define the reference distribution;
    ## zeros and ones are left out of the median, MAD and quantile.
    ## NOTE(review): the earlier comment also listed twos as excluded
    ## (assumed true negatives), but log10(2) > 0 keeps them in --
    ## confirm the intended cutoff.
    NN <- which(trans>0)
    rob.z <- (trans-median(trans[NN]))/mad(trans[NN])
    ## names=FALSE so the "5%" label of the quantile never leaks into
    ## the returned index vector
    cutoff <- quantile(rob.z[NN], 0.05, na.rm=TRUE, names=FALSE)
    ## Comparisons that evaluate to NA (e.g. when no count is > 1) are
    ## dropped by which(), so nothing is flagged in that case.
    which(!(rob.z>cutoff))
}
82+
83+
## Row by row (one row per OTU), set every count that out.z() flags to
## zero.
## seq_len() instead of 1:nrow() so that an OTU table without rows is
## skipped rather than iterated over as i = 1, 0.
for(i in seq_len(nrow(OTUs))){
    OTUs[i, out.z(OTUs[i,])] <- 0
}
86+
87+
sum(OTUs)
88+
89+
## 90,427 removed 06/09/2016
90+
91+
png("figures/Hyena_OTU_heat_BCcor.png", res=300, width = 1480, height = 1480)
92+
pheatmap(log10(OTUs+1),
93+
show_rownames=FALSE,
94+
show_colnames=TRUE,
95+
treeheight_row=0,
96+
treeheight_col=0,
97+
annotation_col=data.frame(row.names=colnames(OTUs),
98+
is.control=as.numeric(
99+
grepl("H2O|Argave|Wolf|Paramix",
100+
colnames(OTUs)))),
101+
annotation_legend = FALSE)
102+
dev.off()
70103

104+
sum(OTUs)
71105

72106
amplicon <- gsub("OTU\\d+\\|", "", rownames(OTUs))
73107

@@ -91,8 +125,9 @@ pheatmap(log10(t(SUM.amp)+1),
91125
annotation_legend = FALSE)
92126
dev.off()
93127

94-
TAX.raw <- read.csv("/SAN/Metabarcoding/Hyena/second/sorted_amps/usearch/ALL_outs.taxtable",
95-
sep = ",")
128+
TAX.raw <-
129+
read.csv("/SAN/Metabarcoding/Hyena/second/sorted_amps/usearch/ALL_outs_nt.taxtable",
130+
sep = ",")
96131

97132
TAX.raw$query <- gsub(".fastq.otus.fa", "", TAX.raw$query)
98133

@@ -104,31 +139,39 @@ T.l <- by(TAX.raw, TAX.raw$query, function (x) {
104139
all.best <- x[x$bitscore==b.bit, ]
105140
### A little last common ancestor play here... BUT wait ...
106141
## a shortcut throwing out OTUs that don't agree at least on the
107-
## family level and allowing only the best hit afterwards
108-
u.family <- unique(all.best$family)
142+
## class level and allowing only the best hit afterwards
143+
u.family <- unique(all.best$class)
109144
if(length(u.family)==1){
110145
return(all.best)
111146
}
112147
})
113148

114149

115150
TAX <- do.call(rbind, T.l)
116-
head(TAX[order(TAX$amplicon), ])
151+
tail(TAX[order(TAX$amplicon), ])
117152
rownames(TAX) <- NULL
118153

154+
155+
119156
## Only consider Euks now
120157
TAX <- TAX[TAX$superkingdom%in%"Eukaryota", ]
121158

122159
## remove some really weird stuff; TODO: find out later where the errors
123160
## are!! Database errors...
124161
table(TAX$phylum)
125162

163+
## should be fixed in the database at some point... but for now as a shortcut
164+
## here
126165
TAX <- TAX[!TAX$phylum%in%c("Cnidaria", "Porifera",
127166
"Bacillariophyta", ## maybe okay?
128167
"Eustigmatophyceae" ## maybe okay?
129168
) ,]
130169

131-
table(TAX$phylum)
170+
## ## now use only best hit
171+
## TAX <- TAX[!duplicated(TAX$query), ]
172+
## tail(TAX[order(TAX$amplicon), ])
173+
174+
## table(TAX$phylum)
132175

133176
### Summarizing by class
134177
foo <- merge(TAX, OTUs, by.x="query", by.y=0)
@@ -137,18 +180,22 @@ foobar <- foobar[order(rowSums(foobar), decreasing=TRUE), ]
137180
foobar <- foobar[!rownames(foobar)%in%c("", "undef"), ]
138181
foobar <- foobar[, !grepl("H2O|Argave|Wolf", colnames(foobar))]
139182

140-
foobar <- foobar[rowSums(foobar)>116, ]
141-
142183
## Average the columns of x that belong to the same replicate group.
## Columns are grouped by their name with a trailing "_S<digits>"
## suffix stripped; the result keeps the rows of x and has one column
## of means per group.
mean.columns <- function(x){
    group.id <- as.factor(gsub("_S\\d+$", "", colnames(x)))
    ## by() works on rows, hence the transpose before and after
    per.group <- by(t(x), group.id, colMeans)
    groups.in.rows <- do.call(rbind, per.group)
    t(groups.in.rows)
}
147188

189+
## mean between replicates
148190
baz <- mean.columns(foobar)
191+
## removing stuff with very low support from only one replicate
149192
baz[baz<1] <- 0
150193

151-
devSVG("figures/Hyena_class_sum_heat.svg", width=7, height=7)
194+
## remove lowly represented classes
195+
baz <- baz[rowSums(baz)>20, ]
196+
197+
198+
devSVG("figures/Hyena_class_sumNT_heat.svg", width=7, height=7)
152199
pheatmap(log10(baz+1),
153200
show_rownames=TRUE,
154201
show_colnames=FALSE,

R/Wolf_first_rough.R

-1
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,6 @@ for(i in 1:nrow(OTUs)){
5252
OTUs[i, out.z(OTUs[i,])] <- 0
5353
}
5454

55-
## Maybe much harder just kill when away from the mean/median?
5655

5756
######## Analyse just the READ counts per amplicon and sample #####
5857
SUM.amp <- do.call(cbind, by(OTUs, amplicon, colSums))

README.md

+6
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
# AA_Metabarcoding
22

3+
An overview of a metabarcoding pipeline for multiple marker data from
4+
the Fluidigm Access Array. The present version is intended as an
5+
overview of the preliminary processing of data. Code can be reviewed
6+
in the /scripts and /R folders. The processing is not reproducible in
7+
the present version as raw data files cannot be accessed (yet).
8+
39
## Preprocessing
410

511
### stratify the data into amplicons and samples

0 commit comments

Comments
 (0)