-
Notifications
You must be signed in to change notification settings - Fork 189
/
Copy pathextend_vegan.R
274 lines (269 loc) · 10.8 KB
/
extend_vegan.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
################################################################################
# Define S3 methods for scores (originally defined by vegan-package)
# to work for other ordination results
# vegan:::scores.default
################################################################################
# pcoa-class, from pcoa{ape}
#' @importFrom vegan wascores
#' @importFrom vegan scores
#' @keywords internal
scores.pcoa <- function(x, choices=NULL, display="sites", physeq=NULL, ...){
  # S3 `scores` method for `pcoa` ordination results (pcoa-class, from ape).
  #
  # x:       ordination result; coordinates are read from `x$vectors`.
  # choices: axes (columns of `x$vectors`) to return. NULL means all axes.
  # display: "sites", "species", or both.
  # physeq:  only used when "species" is requested; its OTU table supplies
  #          the weights for the weighted-average species coordinates.
  # ...:     accepted for compatibility with the `scores` generic; unused here.
  #
  # Returns a coordinate matrix when a single `display` type is requested,
  # otherwise a list of matrices subset by `display`.
  if(is.null(choices)){
    # Positional default, so it also works when `x$vectors` has no column names
    choices <- seq_len(ncol(x$vectors))
  }
  # drop=FALSE keeps the result a matrix (row names and axis name intact)
  # even when a single axis is requested, matching scores.dpcoa.
  site_coords <- x$vectors[, choices, drop=FALSE]
  co <- list(sites = site_coords)
  if( "species" %in% display ){
    if(is.null(otu_table(physeq, errorIfNULL = FALSE))){
      warning("scores.pcoa: Failed to access OTU table from `physeq` argument, \n
needed for weighted average of OTU/taxa/species points in MDS/PCoA.")
    } else {
      # MDS/PCoA only provides coordinates of the elements in the
      # distance matrix, usually sites/samples. There are no species axes
      # from the ordination itself, so species points are computed as
      # weighted averages of the site coordinates.
      co$species <- wascores(site_coords, w = veganifyOTU(physeq))
    }
  }
  # Keep only what was asked for
  co <- co[display]
  if(length(co) < 2L){
    # A single display type is returned bare rather than wrapped in a list
    co <- co[[display]]
  }
  return(co)
}
################################################################################
# DPCoA management
################################################################################
#' @importFrom vegan scores
#' @keywords internal
get_dpcoa_species_coords = function(x, physeq=NULL){
  # Return the species/taxa coordinates stored in a dpcoa result (`x$dls`),
  # with row names mapped back to the taxa names found in `physeq`.
  #
  # ade4 mangles element names following base R `make.names` conventions,
  # so the stored row names may differ from the original taxa names.
  species_xy <- x$dls
  original_ids <- taxa_names(physeq)
  if(is.null(original_ids)){
    # Nothing to map against: warn and hand back the coordinates untouched
    warning("scores.dpcoa: Failed to access `taxa_names` from `physeq` argument, \n
needed to ensure correct mapping of OTU/taxa/species points in DPCoA.")
    return(species_xy)
  }
  # Lookup keyed by the mangled name, valued by the original name
  lookup <- structure(original_ids, names = make.names(original_ids))
  rownames(species_xy) <- lookup[rownames(species_xy)]
  species_xy
}
#' @importFrom vegan scores
#' @keywords internal
get_dpcoa_sites_coords = function(x, physeq=NULL){
  # Return the site/sample coordinates stored in a dpcoa result (`x$li`),
  # with row names mapped back to the sample names found in `physeq`.
  #
  # ade4 mangles element names following base R `make.names` conventions,
  # so the stored row names may differ from the original sample names.
  site_xy <- x$li
  original_ids <- sample_names(physeq)
  if(is.null(original_ids)){
    # Nothing to map against: warn and hand back the coordinates untouched
    warning("scores.dpcoa: Failed to access `sample_names` from `physeq` argument, \n
needed to ensure correct mapping of site/sample/library points in DPCoA.")
    return(site_xy)
  }
  # Lookup keyed by the mangled name, valued by the original name
  lookup <- structure(original_ids, names = make.names(original_ids))
  rownames(site_xy) <- lookup[rownames(site_xy)]
  site_xy
}
# dpcoa-class, from ade4
#' @importFrom vegan scores
#' @keywords internal
scores.dpcoa <- function(x, choices=NULL, display="sites", physeq=NULL, ...){
  # S3 `scores` method for `dpcoa` ordination results (dpcoa-class, from ade4).
  #
  # x:       the dpcoa ordination object.
  # choices: axes (columns) to return. NULL means all available axes.
  # display: either "sites" or "species", per vegan-package convention.
  # physeq:  passed to the coordinate helpers, which use it to restore the
  #          original sample/taxa names on the rows.
  # ...:     accepted for compatibility with the `scores` generic; unused here.
  #
  # Returns the requested coordinates, always with two dimensions
  # (drop=FALSE), even when a single axis is selected.
  coords <- switch(EXPR = display,
                   species = get_dpcoa_species_coords(x, physeq),
                   sites = get_dpcoa_sites_coords(x, physeq))
  if(is.null(coords)){
    # `switch` yields NULL for an unmatched `display`; fail with a clear
    # message instead of an obscure indexing error further down.
    stop("scores.dpcoa: no coordinates available; `display` must be ",
         "either \"sites\" or \"species\".")
  }
  # If no choices selection, take all dimensions/columns
  if(is.null(choices)){
    choices <- seq_len(ncol(coords))
  }
  return( coords[, choices, drop=FALSE] )
}
################################################################################
# Extend vegdist for phyloseq classes
################################################################################
# \code{\link[vegan]{vegdist}} wrapper for phyloseq classes
#
# Trivially-extended S4 method from the \code{\link[vegan]{vegdist}} function,
# such that S4 classes from the \code{\link{phyloseq-package}} are properly
# handled / accessed. All parameters passed on to \code{\link[vegan]{vegdist}}
# verbatim.
#
# @seealso \code{\link[vegan]{vegdist}}
# @rdname vegdist-methods
# @docType methods
# @aliases vegdist
#
# @examples
# data(esophagus)
# vegdist(esophagus, "jaccard")
#' @importFrom vegan vegdist
#' @keywords internal
setGeneric("vegdist")  # turn the existing vegdist function into an S4 generic so the setMethod() calls below can attach class-specific methods
################################################################################
# @aliases vegdist,otu_table-method
# @rdname vegdist-methods
#' @importFrom vegan vegdist
setMethod("vegdist", "otu_table", function(x, method = "bray", binary = FALSE,
                                           diag = FALSE, upper = FALSE, na.rm = FALSE, ...){
  # vegdist method for otu_table objects: orient the table as
  # samples-by-taxa, strip it down to a plain matrix, and forward every
  # argument unchanged to the standard vegdist method.
  sample_by_taxa <- if( taxa_are_rows(x) ) t(x) else x
  # Plain matrix, so the call below reaches the standard (compiled C) method
  abundance <- as(sample_by_taxa, "matrix")
  vegdist(abundance, method = method, binary = binary, diag = diag,
          upper = upper, na.rm = na.rm, ...)
})
################################################################################
# @aliases vegdist,phyloseq-method
# @rdname vegdist-methods
setMethod("vegdist", "phyloseq", function(x, method = "bray", binary = FALSE,
                                          diag = FALSE, upper = FALSE, na.rm = FALSE, ...){
  # vegdist method for phyloseq objects: pull out the OTU table and call
  # vegdist again on it with all other arguments passed through unchanged.
  vegdist(otu_table(x), method = method, binary = binary, diag = diag,
          upper = upper, na.rm = na.rm, ...)
})
################################################################################
#' Summarize alpha diversity
#'
#' Performs a number of standard alpha diversity estimates,
#' and returns the results as a \code{data.frame}.
#' Strictly speaking, this function is not only estimating richness,
#' despite its name.
#' It can operate on the cumulative population of all
#' samples in the dataset, or by repeating the richness estimates for each
#' sample individually.
#' NOTE: You must use untrimmed datasets
#' for meaningful results, as these estimates (and even the ``observed'' richness)
#' are highly dependent on the number of singletons. You can always trim the data
#' later on if needed, just not before using this function.
#'
#' @param physeq (Required). \code{\link{phyloseq-class}}, or alternatively,
#' an \code{\link{otu_table-class}}. The data about which you want to estimate
#' the richness.
#'
#' @param split (Optional). Logical. Should a separate set of richness estimates
#' be performed for each sample? Or alternatively, pool all samples and
#' estimate richness of the entire set.
#'
#' @param measures (Optional). Default is \code{NULL}, meaning that
#' all available alpha-diversity measures will be included.
#' Alternatively, you can specify one or more measures
#' as a character vector of measure names.
#' Values must be among those supported:
#' \code{c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson", "Fisher")}.
#'
#' @return A \code{data.frame} of the richness estimates, and their standard error.
#'
#' @seealso
#' Check out the custom plotting function, \code{\link{plot_richness}},
#' for easily showing the results of different estimates,
#' with method-specific error-bars.
#' Also check out the internal functions borrowed from the \code{vegan} package:
#'
#' \code{\link[vegan]{estimateR}}
#'
#' \code{\link[vegan]{diversity}}
#'
#' \code{\link[vegan]{fisherfit}}
#'
#' @importFrom vegan estimateR
#' @importFrom vegan diversity
#' @importFrom vegan fisher.alpha
#' @export
#' @examples
#' ## There are many more interesting examples at the phyloseq online tutorials.
#' ## http://joey711.github.com/phyloseq/plot_richness-examples
#' data("esophagus")
#' # Default is all available measures
#' estimate_richness(esophagus)
#' # Specify just one:
#' estimate_richness(esophagus, measures="Observed")
#' # Specify a few:
#' estimate_richness(esophagus, measures=c("Observed", "InvSimpson", "Shannon", "Chao1"))
estimate_richness <- function(physeq, split=TRUE, measures=NULL){
  # Compute alpha-diversity estimates and return them as a data.frame.
  #
  # physeq:   object from which an OTU table can be accessed.
  # split:    TRUE computes each estimate per sample; FALSE pools all samples
  #           (taxa sums) and computes a single set of estimates.
  # measures: character vector of measure names, or NULL for all supported
  #           measures. Old-style names ("S.obs", "shannon", ...) are
  #           translated to the current ones ("Observed", "Shannon", ...).
  #
  # Stops with an error when none of `measures` is supported.
  # Warns when the data contain no singletons.
  if( !any(otu_table(physeq)==1) ){
    # Check for singletons, and then warning if they are missing.
    # These metrics only really meaningful if singletons are included.
    warning(
      "The data you have provided does not have\n",
      "any singletons. This is highly suspicious. Results of richness\n",
      "estimates (for example) are probably unreliable, or wrong, if you have already\n",
      "trimmed low-abundance taxa from the data.\n",
      "\n",
      "We recommended that you find the un-trimmed data and retry."
    )
  }
  # Sample-wise: samples-by-taxa matrix. Pooled: one total per taxon.
  if( split ){
    OTU <- as(otu_table(physeq), "matrix")
    if( taxa_are_rows(physeq) ){ OTU <- t(OTU) }
  } else {
    OTU <- taxa_sums(physeq)
  }
  # Renaming vector: names are old-style labels, values the current labels
  renamevec <- c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson", "Fisher")
  names(renamevec) <- c("S.obs", "S.chao1", "S.ACE", "shannon", "simpson", "invsimpson", "fisher")
  # If measures was not explicitly provided (is NULL), set to all supported methods
  if( is.null(measures) ){
    measures <- as.character(renamevec)
  }
  # Translate old-style names element by element, so each entry maps to its
  # own replacement regardless of order or repetition in `measures`.
  oldstyle <- measures %in% names(renamevec)
  if( any(oldstyle) ){
    measures[oldstyle] <- unname(renamevec[measures[oldstyle]])
  }
  # Stop with error if no measures are supported
  if( !any(measures %in% renamevec) ){
    stop("None of the `measures` you provided are supported. Try default `NULL` instead.")
  }
  # Results are accumulated here and column-bound at the end
  outlist <- vector("list")
  # estimateR yields the observed, Chao1 and ACE columns in one call
  estimRmeas <- c("Chao1", "Observed", "ACE")
  if( any(estimRmeas %in% measures) ){
    outlist <- c(outlist, list(t(data.frame(estimateR(OTU)))))
  }
  if( "Shannon" %in% measures ){
    outlist <- c(outlist, list(shannon = diversity(OTU, index="shannon")))
  }
  if( "Simpson" %in% measures ){
    outlist <- c(outlist, list(simpson = diversity(OTU, index="simpson")))
  }
  if( "InvSimpson" %in% measures ){
    outlist <- c(outlist, list(invsimpson = diversity(OTU, index="invsimpson")))
  }
  if( "Fisher" %in% measures ){
    # On a warning from fisher.alpha, emit one summarizing warning and
    # recompute with further warnings silenced.
    fisher <- tryCatch(fisher.alpha(OTU),
      warning=function(w){
        warning("phyloseq::estimate_richness: Warning in fisher.alpha(). See `?fisherfit` or ?`fisher.alpha`. Treat fisher results with caution")
        suppressWarnings(fisher.alpha(OTU))
      }
    )
    if(!is.null(dim(fisher))){
      # Table-shaped result: label its first two columns
      colnames(fisher)[1:2] <- c("Fisher", "se.fisher")
      outlist <- c(outlist, list(fisher))
    } else {
      outlist <- c(outlist, Fisher=list(fisher))
    }
  }
  out <- do.call("cbind", outlist)
  # Rename columns per renamevec
  namechange <- intersect(colnames(out), names(renamevec))
  colnames(out)[colnames(out) %in% namechange] <- renamevec[namechange]
  # Final prune to just those columns related to `measures`. A column belongs
  # to a measure when its name, minus an optional "se." prefix, equals the
  # measure name (case-insensitive). Exact matching keeps unsupported or
  # regex-like entries in `measures` from pulling in unrelated columns.
  colmeasure <- tolower(sub("^se\\.", "", colnames(out), ignore.case=TRUE))
  out <- out[, colmeasure %in% tolower(measures), drop=FALSE]
  # Make sure that you return a data.frame for reliable performance.
  out <- as.data.frame(out)
  return(out)
}
################################################################################