-
Notifications
You must be signed in to change notification settings - Fork 189
/
Copy pathtransform_filter-methods.R
1211 lines (1195 loc) · 48.5 KB
/
transform_filter-methods.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
################################################################################
# Function to create subsampled dataset
# in which each sample has same number of total observations/counts/reads
# Note that the subsampling is random, so some noise is introduced making the
# relative abundances slightly different
################################################################################
#' Resample an OTU table such that all samples have the same library size.
#'
#' Please note that the authors of phyloseq do not advocate using this
#' as a normalization procedure, despite its recent popularity.
#' Our justifications for using alternative approaches to address
#' disparities in library sizes have been made available as
#' \href{http://dx.plos.org/10.1371/journal.pcbi.1003531}{an article in PLoS Computational Biology}.
#' See \code{\link{phyloseq_to_deseq2}} for a recommended alternative to rarefying
#' directly supported in the phyloseq package, as well as
#' \href{http://joey711.github.io/waste-not-supplemental/}{the supplemental materials for the PLoS-CB article}
#' and \href{http://joey711.github.io/phyloseq-extensions}{the phyloseq extensions repository on GitHub}.
#' Nevertheless, for comparison and demonstration, the rarefying procedure is implemented
#' here in good faith and with options we hope are useful.
#' This function uses the standard R \code{\link{sample}} function to
#' resample from the abundance values
#' in the \code{\link{otu_table}} component of the first argument,
#' \code{physeq}.
#' Often one of the major goals of this procedure is to achieve parity in
#' total number of counts between samples, as an alternative to other formal
#' normalization procedures, which is why a single value for the
#' \code{sample.size} is expected.
#' This kind of resampling can be performed with and without replacement,
#' with replacement being the more computationally-efficient, default setting.
#' See the \code{replace} parameter documentation for more details.
#' We recommended that you explicitly select a random number generator seed
#' before invoking this function, or, alternatively, that you
#' explicitly provide a single positive integer argument as \code{rngseed}.
#'
#' This approach is sometimes mistakenly called ``rarefaction'', which
#' \href{http://en.wikipedia.org/wiki/Rarefaction}{in physics refers to a form of wave decompression;}
#' but in this context, ecology, the term refers to a
#' \href{http://en.wikipedia.org/wiki/Rarefaction_(ecology)}{repeated sampling procedure to assess species richness},
#' first proposed in 1968 by Howard Sanders.
#' In contrast, the procedure implemented here is used as an \emph{ad hoc} means to
#' normalize microbiome counts that have
#' resulted from libraries of widely-differing sizes.
#' Here we have intentionally adopted an alternative
#' name, \code{rarefy}, that has also been used recently
#' to describe this process
#' and, to our knowledge, not previously used in ecology.
#'
#' Make sure to use \code{\link{set.seed}} for exactly-reproducible results
#' of the random subsampling.
#'
#' @param physeq (Required). A \code{\link{phyloseq-class}} object that you
#' want to trim/filter.
#'
#' @param sample.size (Optional). A single integer value equal to the number
#' of reads being simulated, also known as the depth,
#' and also equal to each value returned by \code{\link{sample_sums}}
#' on the output.
#'
#' @param rngseed (Optional). A single integer value passed to
#' \code{\link{set.seed}}, which is used to fix a seed for reproducibly
#' random number generation (in this case, reproducibly random subsampling).
#' The default value is \code{711}.
#' If set to \code{FALSE}, then no fiddling with the RNG seed is performed,
#' and it is up to the user to appropriately call \code{\link{set.seed}}
#' beforehand to achieve reproducible results.
#'
#' @param replace (Optional). Logical. Whether to sample with replacement
#' (\code{TRUE}) or without replacement (\code{FALSE}).
#' The default is with replacement (\code{replace=TRUE}).
#' Two implications to consider are that
#' (1) sampling with replacement is faster and more memory efficient
#' as currently implemented; and
#' (2), sampling with replacement means that there is a chance that the
#' number of reads for a given OTU in a given sample could be larger
#' than the original count value, as opposed to sampling without replacement
#' where the original count value is the maximum possible.
#' Prior to phyloseq package version number \code{1.5.20},
#' this parameter did not exist and sampling with replacement was the only
#' random subsampling implemented in the \code{rarefy_even_depth} function.
#' Note that this default behavior was selected for computational efficiency,
#' but differs from analogous functions in related packages
#' (e.g. subsampling in QIIME).
#'
#' @param trimOTUs (Optional). \code{\link{logical}(1)}.
#' Whether to trim OTUs
#' from the dataset that are no longer observed in any sample
#' (have a count of zero in every sample).
#' The number of OTUs trimmed, if any, is printed to
#' standard out as a reminder.
#'
#' @param verbose (Optional). Logical. Default is \code{TRUE}.
#' If \code{TRUE}, extra non-warning, non-error messages are printed
#' to standard out, describing steps in the rarefying process,
#' the OTUs and samples removed, etc. This can be useful the
#' first few times the function is executed, but can be set
#' to \code{FALSE} as-needed once behavior has been verified
#' as expected.
#'
#' @return An object of class \code{phyloseq}.
#' Only the \code{otu_table} component is modified.
#'
#' @seealso
#' \code{\link{sample}}
#'
#' \code{\link{set.seed}}
#'
#' @export
#'
#' @examples
#' # Test with esophagus dataset
#' data("esophagus")
#' esorepT = rarefy_even_depth(esophagus, replace=TRUE)
#' esorepF = rarefy_even_depth(esophagus, replace=FALSE)
#' sample_sums(esophagus)
#' sample_sums(esorepT)
#' sample_sums(esorepF)
#' ## NRun Manually: Too slow!
#' # data("GlobalPatterns")
#' # GPrepT = rarefy_even_depth(GlobalPatterns, 1E5, replace=TRUE)
#' ## Actually just this one is slow
#' # system.time(GPrepF <- rarefy_even_depth(GlobalPatterns, 1E5, replace=FALSE))
rarefy_even_depth <-
function(physeq,
sample.size = min(sample_sums(physeq)),
rngseed = FALSE,
replace = TRUE,
trimOTUs = TRUE,
verbose = TRUE) {
if ( !(is.integer(rngseed) || is.numeric(rngseed) || is.logical(rngseed)) || length(rngseed) > 1) {
stop("`rngseed` must be a single integer or logical value")
}
if ( !(is.logical(rngseed) && !rngseed) ) {
# Now call the set.seed using the value expected in phyloseq
set.seed(rngseed)
if (verbose) {
# Print to screen this value
message(
"`set.seed(", rngseed, ")` was used to initialize repeatable random subsampling.\n",
"Please record this for your records so others can reproduce.\n",
"Try `set.seed(", rngseed, "); .Random.seed` for the full vector.\n",
"..."
)
}
} else if (verbose) {
message(
"You set `rngseed` to FALSE. Make sure you've set & recorded\n",
" the random seed of your session for reproducibility.\n",
"See `?set.seed`\n",
"..."
)
}
# Make sure sample.size is of length 1.
if ( length(sample.size) > 1 ) {
warning("`sample.size` had more than one value. ",
"Using only the first. \n ... \n")
sample.size <- sample.size[1]
}
if ( !(is.integer(sample.size) || is.numeric(sample.size)) ) {
stop("`sample.size` must be a single integer value")
} else
if ( sample.size <= 0 ) {
stop("sample.size less than or equal to zero. ",
"Need positive sample size to work.")
}
# Instead of warning, expected behavior now is to prune samples
# that have fewer reads than `sample.size`
if ( min(sample_sums(physeq)) < sample.size ) {
rmsamples = sample_sums(physeq) < sample.size
if (verbose) {
n.rmsamples <- sum(rmsamples)
message(n.rmsamples, " samples removed",
"because they contained fewer reads than `sample.size`.",
"Up to first five removed samples are: \n",
paste(names(rmsamples[rmsamples])[1:min(5, n.rmsamples)], "\t"),
"...")
}
# Now done with notifying user of pruning, actually prune.
physeq = prune_samples(!rmsamples, physeq)
}
# apply through each sample, and replace
# instead of transposing the table twice :
# if taxa are rows use margin 2 (cols) else use margin 1 (rows)
newotu <- apply(physeq@otu_table, taxa_are_rows(physeq) + 1, rarefaction_subsample,
sample.size = sample.size, replace = replace)
newotu <- otu_table(newotu, TRUE)
# Add OTU names to the row indices
taxa_names(newotu) <- taxa_names(physeq@otu_table)
# transpose otu_table if taxa were columns
if(!taxa_are_rows(physeq)) newotu <- t(newotu)
# replace the otu_table
physeq@otu_table <- newotu
if (trimOTUs) {
# Check for and remove empty OTUs
# 1. Notify user of empty OTUs being cut.
# 2. Cut empty OTUs
rmtaxa = taxa_sums(physeq) <= 0
if ( sum(rmtaxa) > 0 ) {
if (verbose) {
message(sum(rmtaxa), "OTUs were removed because they are no longer \n",
"present in any sample after random subsampling\n")
message("...")
}
physeq = prune_taxa(!rmtaxa, physeq)
}
}
return(physeq)
}
################################################################################
# rarefaction subsample function, one sample
################################################################################
#' @keywords internal
# ------------------------------------------------------------------------
# This is a test
# x = sample(10, 10)
# x = 1:10
# sample.size = 50
#system.time(obsvec <- foreach(OTUi=1:length(x), times=x, .combine=c) %do% {rep(OTUi, times)})
# data("GlobalPatterns")
# sample.size = sample_sums(GlobalPatterns)[which.min(sample_sums(GlobalPatterns))]
# x = get_taxa(GlobalPatterns, which.max(sample_sums(GlobalPatterns)))
# ------------------------------------------------------------------------
rarefaction_subsample <- function(x, sample.size, replace = FALSE) {
# Perform the sub-sampling. Suppress warnings due to old R compat issue.
# Also, make sure to avoid errors from x summing to zero,
# and there are no observations to sample.
if (sum(x) > 0) {
# Protect against, and quickly return an empty vector,
# if x is already an empty count vector this part is skipped
subsample <- suppressWarnings(if (replace) {
# resample with replacement
sample(seq_along(x),
sample.size,
replace = TRUE,
prob = x)
} else {
# resample without replacement
# use `sample` for subsampling. Hope that obsvec doesn't overflow.
## obsvec being longer than .Machine$integer.max doesn't break it
## as long as there aren't more than .Machine$integer.max taxa,
## it shouldn't be a problem (and if it is, tabulate throws an error)
## could always add a check for length(x) < .Machine$integer.max
# Use seq_along to get the indexes of x,
# then rep.int to repeat each times its abundance
## (int vectors use less memory than list/data.frame)
obsvec <- rep.int(seq_along(x), x)
sample(obsvec,
sample.size,
replace = FALSE)
})
# Tabulate the results (these are already named by the order in `x`)
## `table` is a `tabulate` wrapper that remove the 0s we want,
## so using tabulate directly
rarvec <- tabulate(subsample)
# there can be 0s missing at the end though so, need padding
rarvec <- c(rarvec, rep(0, length(x) - length(rarvec)))
}
# Return abundance vector. Let replacement happen elsewhere.
return(rarvec)
}
###############################################################################
#' Agglomerate closely-related taxa using single-linkage clustering.
#'
#' All tips of the tree separated by a cophenetic distance smaller than
#' \code{h} will be agglomerated into one taxa using \code{\link{merge_taxa}}.
#'
#' Can be used to create a non-trivial OTU Table, if a phylogenetic tree is available.
#'
#' For now, a simple, ``greedy'', single-linkage clustering is used. In future releases
#' it should be possible to specify different clustering approaches available in \code{R},
#' in particular, complete-linkage clustering appears to be used more commonly for OTU
#' clustering applications.
#'
#' @param physeq (Required). A \code{\link{phyloseq-class}},
#' containing a phylogenetic tree.
#' Alternatively, a phylogenetic tree \code{\link[ape]{phylo}} will also work.
#'
#' @param h (Optional). Numeric scalar of the height where the tree should be cut.
#' This refers to the tree resulting from hierarchical clustering
#' of \code{\link[ape]{cophenetic.phylo}(phy_tree(physeq))},
#' not necessarily the original phylogenetic tree, \code{phy_tree(physeq)}.
#' Default value is \code{0.2}.
#' Note that this argument used to be named \code{speciationMinLength},
#' before this function/method was rewritten.
#'
#' @param hcfun (Optional). A function.
#' The (agglomerative, hierarchical) clustering function to use.
#' Good examples are
#' \code{\link[cluster]{agnes}} and \code{\link[stats]{hclust}}.
#' The default is \code{\link[cluster]{agnes}}.
#'
#' @param ... (Optional). Additional named arguments to pass
#' to \code{hcfun}.
#'
#' @return An instance of the \code{\link{phyloseq-class}}.
#' Or alternatively, a \code{\link{phylo}} object if the
#' \code{physeq} argument was just a tree.
#' In the expected-use case, the number of OTUs will be fewer
#' (see \code{\link{ntaxa}}),
#' after merging OTUs that are related enough to be called
#' the same OTU.
#'
#' @seealso
#'
#' \code{\link{merge_taxa}}
#'
#' \code{\link[cluster]{agnes}}
#'
#' \code{\link[stats]{hclust}}
#'
#' \code{\link[ape]{cophenetic.phylo}}
#'
#' \code{\link[ape]{phylo}}
#'
#' @importFrom cluster agnes
#'
#' @export
#'
#' @examples
#' data("esophagus")
#' # for speed
#' esophagus = prune_taxa(taxa_names(esophagus)[1:25], esophagus)
#' plot_tree(esophagus, label.tips="taxa_names", size="abundance", title="Before tip_glom()")
#' plot_tree(tip_glom(esophagus, h=0.2), label.tips="taxa_names", size="abundance", title="After tip_glom()")
tip_glom = function(physeq, h=0.2, hcfun=agnes, ...){
dd = as.dist(cophenetic.phylo(phy_tree(physeq)))
psclust = cutree(as.hclust(hcfun(dd, ...)), h=h)
cliques = levels(factor(psclust))[tapply(psclust, factor(psclust), function(x){length(x)>1})]
# For each clique, merge taxa in it...
for( i in cliques){
physeq = merge_taxa(physeq, eqtaxa=names(psclust)[psclust == i])
}
return(physeq)
}
################################################################################
################################################################################
#' Agglomerate taxa of the same type.
#'
#' This method merges species that have the same taxonomy at a certain
#' taxonomic rank.
#' Its approach is analogous to \code{\link{tip_glom}}, but uses categorical data
#' instead of a tree. In principal, other categorical data known for all taxa
#' could also be used in place of taxonomy,
#' but for the moment, this must be stored in the \code{taxonomyTable}
#' of the data. Also, columns/ranks to the right of the rank chosen to use
#' for agglomeration will be replaced with \code{NA},
#' because they should be meaningless following agglomeration.
#'
#' @usage tax_glom(physeq, taxrank=rank_names(physeq)[1], NArm=TRUE, bad_empty=c(NA, "", " ", "\t"))
#'
#' @param physeq (Required). \code{\link{phyloseq-class}} or \code{\link{otu_table}}.
#'
#' @param taxrank A character string specifying the taxonomic level
#' that you want to agglomerate over.
#' Should be among the results of \code{rank_names(physeq)}.
#' The default value is \code{rank_names(physeq)[1]},
#' which may agglomerate too broadly for a given experiment.
#' You are strongly encouraged to try different values for this argument.
#'
#' @param NArm (Optional). Logical, length equal to one. Default is \code{TRUE}.
#' CAUTION. The decision to prune (or not) taxa for which you lack categorical
#' data could have a large effect on downstream analysis. You may want to
#' re-compute your analysis under both conditions, or at least think carefully
#' about what the effect might be and the reasons explaining the absence of
#' information for certain taxa. In the case of taxonomy, it is often a result
#' of imprecision in taxonomic designation based on short phylogenetic sequences
#' and a patchy system of nomenclature. If this seems to be an issue for your
#' analysis, think about also trying the nomenclature-agnostic \code{\link{tip_glom}}
#' method if you have a phylogenetic tree available.
#'
#' @param bad_empty (Optional). Character vector. Default: \code{c(NA, "", " ", "\t")}.
#' Defines the bad/empty values
#' that should be ignored and/or considered unknown. They will be removed
#' from the internal agglomeration vector derived from the argument to \code{tax},
#' and therefore agglomeration will not combine taxa according to the presence
#' of these values in \code{tax}. Furthermore, the corresponding taxa can be
#' optionally pruned from the output if \code{NArm} is set to \code{TRUE}.
#'
#' @return A taxonomically-agglomerated, optionally-pruned, object with class matching
#' the class of \code{physeq}.
#'
#' @seealso
#' \code{\link{tip_glom}}
#'
#' \code{\link{prune_taxa}}
#'
#' \code{\link{merge_taxa}}
#'
#' @export
#'
#' @examples
#' # data(GlobalPatterns)
#' # ## print the available taxonomic ranks
#' # colnames(tax_table(GlobalPatterns))
#' # ## agglomerate at the Family taxonomic rank
#' # (x1 <- tax_glom(GlobalPatterns, taxrank="Family") )
#' # ## How many taxa before/after agglomeration?
#' # ntaxa(GlobalPatterns); ntaxa(x1)
#' # ## Look at enterotype dataset...
#' # data(enterotype)
#' # ## print the available taxonomic ranks. Shows only 1 rank available, not useful for tax_glom
#' # colnames(tax_table(enterotype))
tax_glom <- function(physeq, taxrank=rank_names(physeq)[1],
NArm=TRUE, bad_empty=c(NA, "", " ", "\t")){
# Error if tax_table slot is empty
if( is.null(access(physeq, "tax_table")) ){
stop("The tax_glom() function requires that physeq contain a taxonomyTable")
}
# Error if bad taxrank
if( !taxrank[1] %in% rank_names(physeq) ){
stop("Bad taxrank argument. Must be among the values of rank_names(physeq)")
}
# Make a vector from the taxonomic data.
CN <- which( rank_names(physeq) %in% taxrank[1] )
tax <- as(access(physeq, "tax_table"), "matrix")[, CN]
# if NArm is TRUE, remove the empty, white-space, NA values from
if( NArm ){
keep_species <- names(tax)[ !(tax %in% bad_empty) ]
physeq <- prune_taxa(keep_species, physeq)
}
# Concatenate data up to the taxrank column, use this for agglomeration
tax <- as(access(physeq, "tax_table"), "matrix")[, 1:CN, drop=FALSE]
tax <- apply(tax, 1, function(i){paste(i, sep=";_;", collapse=";_;")})
# Remove NAs and useless from the vector/factor for looping.
# This does not remove the taxa that have an unknown (NA)
# taxonomic designation at this particular taxonomic rank.
tax <- tax[ !(tax %in% bad_empty) ]
# Define the OTU cliques to loop through
spCliques <- tapply(names(tax), factor(tax), list)
# Successively merge taxa in physeq.
for( i in names(spCliques)){
physeq <- merge_taxa(physeq, spCliques[[i]])
}
# "Empty" the values to the right of the rank, using NA_character_.
if( CN < length(rank_names(physeq)) ){
tax_table(physeq)[, -(1:CN)] <- NA_character_
}
# Return.
return(physeq)
}
################################################################################
################################################################################
#' Prune unwanted OTUs / taxa from a phylogenetic object.
#'
#' An S4 Generic method for removing (pruning) unwanted OTUs/taxa from phylogenetic
#' objects, including phylo-class trees, as well as native phyloseq package
#' objects. This is particularly useful for pruning a phyloseq object that has
#' more than one component that describes OTUs.
#' Credit: the \code{phylo}-class version is adapted from
#' \href{http://cran.at.r-project.org/web/packages/picante/index.html}{prune.sample}.
#'
#' @usage prune_taxa(taxa, x)
#'
#' @param taxa (Required). A character vector of the taxa in object x that you want to
#' keep -- OR alternatively -- a logical vector where the kept taxa are TRUE, and length
#' is equal to the number of taxa in object x. If \code{taxa} is a named
#' logical, the taxa retained are based on those names. Make sure they are
#' compatible with the \code{taxa_names} of the object you are modifying (\code{x}).
#'
#' @param x (Required). A phylogenetic object, including \code{phylo} trees,
#' as well as all phyloseq classes that represent taxa. If the function
#' \code{\link{taxa_names}} returns a non-\code{NULL} value, then your object
#' can be pruned by this function.
#'
#' @return The class of the object returned by \code{prune_taxa} matches
#' the class of the argument, \code{x}.
#'
#' @seealso
#'
#' \code{\link{prune_samples}}
#'
#' \href{http://cran.at.r-project.org/web/packages/picante/index.html}{prune.sample}
#'
#' @rdname prune_taxa-methods
#' @export
#' @examples
#' data("esophagus")
#' esophagus
#' plot(sort(taxa_sums(esophagus), TRUE), type="h", ylim=c(0, 50))
#' x1 = prune_taxa(taxa_sums(esophagus) > 10, esophagus)
#' x2 = prune_taxa(names(sort(taxa_sums(esophagus), TRUE))[1:9], esophagus)
#' identical(x1, x2)
setGeneric("prune_taxa", function(taxa, x) standardGeneric("prune_taxa"))
#' @aliases prune_taxa,NULL,ANY-method
#' @rdname prune_taxa-methods
setMethod("prune_taxa", signature("NULL", "ANY"), function(taxa, x){
return(x)
})
# Any prune_taxa call w/ signature starting with a logical
# converts the logical to a character vector, and then dispatches
# to more specific method.
#' @aliases prune_taxa,logical,ANY-method
#' @rdname prune_taxa-methods
setMethod("prune_taxa", signature("logical", "ANY"), function(taxa, x){
# Check that logical has same length as ntaxa, stop if not.
if( !identical(length(taxa), ntaxa(x)) ){
stop("logical argument to taxa is wrong length. Should equal ntaxa(x)")
} else {
# Pass on to names-based prune_taxa method
return( prune_taxa(taxa_names(x)[taxa], x) )
}
})
#' @importFrom ape drop.tip
#' @aliases prune_taxa,character,phylo-method
#' @rdname prune_taxa-methods
setMethod("prune_taxa", signature("character", "phylo"), function(taxa, x){
if( length(taxa) <= 1 ){
# Can't have a tree with 1 or fewer tips
warning("prune_taxa attempted to reduce tree to 1 or fewer tips.\n tree replaced with NULL.")
return(NULL)
} else if( setequal(taxa, taxa_names(x)) ){
return(x)
} else {
return( drop.tip(x, setdiff(taxa_names(x), taxa)) )
}
})
#' @aliases prune_taxa,character,otu_table-method
#' @rdname prune_taxa-methods
setMethod("prune_taxa", signature("character", "otu_table"), function(taxa, x){
if( setequal(taxa, taxa_names(x)) ){
return(x)
} else {
taxa = intersect( taxa, taxa_names(x) )
if( taxa_are_rows(x) ){
return(x[taxa, , drop=FALSE])
} else {
return(x[, taxa, drop=FALSE])
}
}
})
#' @aliases prune_taxa,character,sample_data-method
#' @rdname prune_taxa-methods
setMethod("prune_taxa", signature("character", "sample_data"), function(taxa, x){
return(x)
})
#' @aliases prune_taxa,character,phyloseq-method
#' @rdname prune_taxa-methods
setMethod("prune_taxa", signature("character", "phyloseq"), function(taxa, x){
# Re-define `taxa` as the intersection of OTU names for each component AND `taxa`
taxa = intersect(intersect_taxa(x), taxa)
# Now prune them all.
# All phyloseq objects have an otu_table slot, no need to test for existence.
x@otu_table = prune_taxa(taxa, otu_table(x))
# Test if slot is present. If so, perform the component prune.
if( !is.null(x@tax_table) ){
x@tax_table = prune_taxa(taxa, tax_table(x))
}
if( !is.null(x@phy_tree) ){
x@phy_tree = prune_taxa(taxa, phy_tree(x))
}
if( !is.null(x@refseq) ){
x@refseq = prune_taxa(taxa, refseq(x))
}
# Force index order after pruning to be the same,
# according to the same rules as in the constructor, phyloseq()
x = index_reorder(x, index_type="taxa")
return(x)
})
#' @aliases prune_taxa,character,taxonomyTable-method
#' @rdname prune_taxa-methods
setMethod("prune_taxa", signature("character", "taxonomyTable"), function(taxa, x){
if( setequal(taxa, taxa_names(x)) ){
return(x)
} else {
taxa = intersect( taxa, taxa_names(x) )
return( x[taxa, , drop=FALSE] )
}
})
#' @importClassesFrom Biostrings XStringSet
#' @aliases prune_taxa,character,XStringSet-method
#' @rdname prune_taxa-methods
setMethod("prune_taxa", signature("character", "XStringSet"), function(taxa, x){
if( setequal(taxa, taxa_names(x)) ){
# Nothing to do, return x as-is.
return(x)
} else if( length(intersect(taxa, taxa_names(x))) == 0 ){
# Informative error if intersection is zero.
stop("prune_taxa,XStringSet: taxa and taxa_names(x) do not overlap.")
} else {
# Pop the OTUs that are not in `taxa`, without reordering.
return(x[-which(!taxa_names(x) %in% taxa)])
}
})
################################################################################
################################################################################
#' Define a subset of samples to keep in a phyloseq object.
#'
#' An S4 Generic method for pruning/filtering unwanted samples
#' by defining those you want to keep.
#'
#' @usage prune_samples(samples, x)
#'
#' @param samples (Required). A character vector of the samples in object x that you want to
#' keep -- OR alternatively -- a logical vector where the kept samples are TRUE, and length
#' is equal to the number of samples in object x. If \code{samples} is a named
#' logical, the samples retained is based on those names. Make sure they are
#' compatible with the \code{sample_names} of the object you are modifying (\code{x}).
#'
#' @param x A phyloseq object.
#'
#' @return The class of the object returned by \code{prune_samples} matches
#' the class of the phyloseq object, \code{x}.
#'
#' @seealso \code{\link{subset_samples}}
#'
#' @rdname prune_samples-methods
#' @docType methods
#' @export
#' @examples
#' data(GlobalPatterns)
#' # Subset to just the Chlamydiae phylum.
#' GP.chl <- subset_taxa(GlobalPatterns, Phylum=="Chlamydiae")
#' # Remove the samples that have less than 20 total reads from Chlamydiae
#' GP.chl <- prune_samples(sample_sums(GP.chl)>=20, GP.chl)
#' # (p <- plot_tree(GP.chl, color="SampleType", shape="Family", label.tips="Genus", size="abundance"))
setGeneric("prune_samples", function(samples, x) standardGeneric("prune_samples"))
#' @aliases prune_samples,character,otu_table-method
#' @rdname prune_samples-methods
setMethod("prune_samples", signature("character", "otu_table"), function(samples, x){
if( setequal(samples, sample_names(x)) ){
# If the sets of `samples` and sample_names are the same, return as-is.
return(x)
} else {
samples = intersect(samples, sample_names(x))
if( taxa_are_rows(x) ){
return( x[, samples] )
} else {
return( x[samples, ] )
}
}
})
#' @aliases prune_samples,character,sample_data-method
#' @rdname prune_samples-methods
setMethod("prune_samples", signature("character", "sample_data"), function(samples, x){
if( setequal(samples, sample_names(x)) ){
# If the sets of `samples` and sample_names are the same, return as-is.
return(x)
} else {
samples = intersect(samples, sample_names(x))
return(x[samples, ])
}
})
#' @aliases prune_samples,character,phyloseq-method
#' @rdname prune_samples-methods
setMethod("prune_samples", signature("character", "phyloseq"), function(samples, x){
# Re-define `samples` as the intersection of samples names for each component AND `samples`
samples = intersect(intersect_samples(x), samples)
# Now prune each component.
# All phyloseq objects have an otu_table slot, no need to test for existence.
x@otu_table = prune_samples(samples, otu_table(x))
if( !is.null(x@sam_data) ){
# protect missing sample_data component. Don't need to prune if empty
x@sam_data = prune_samples(samples, sample_data(x))
}
# Force sample index order after pruning to be the same,
# according to the same rules as in the constructor, phyloseq()
x = index_reorder(x, index_type="samples")
return(x)
})
# A logical should specify the samples to keep, or not. Have same length as nsamples(x)
#' @aliases prune_samples,logical,ANY-method
#' @rdname prune_samples-methods
setMethod("prune_samples", signature("logical", "ANY"), function(samples, x){
# Check that logical has same length as nsamples, stop if not.
if( !identical(length(samples), nsamples(x)) ){
stop("logical argument to samples is wrong length. Should equal nsamples(x)")
} else {
# Pass on to names-based prune_samples method
return( prune_samples(sample_names(x)[samples], x) )
}
})
################################################################################
#' Thresholded rank transformation.
#'
#' The lowest \code{thresh} values in \code{x} all get the value 'thresh'.
#'
#' @usage threshrank(x, thresh, keep0s=FALSE, ...)
#'
#' @param x (Required). Numeric vector to transform.
#' @param thresh A single numeric value giving the threshold.
#' @param keep0s A logical determining whether 0's in \code{x} should remain
#' a zero-value in the output. If FALSE, zeros are treated as any other value.
#' @param ... Further arguments passes to the \code{\link{rank}} function.
#'
#' @return A ranked, (optionally) thresholded numeric vector with length equal to
#' \code{x}. Default arguments to \code{rank} are used, unless provided as
#' additional arguments.
#'
#' @seealso \code{\link{transform_sample_counts}}, \code{\link{rank}}, \code{\link{threshrankfun}}
#' @export
#' @examples #
#' (a_vector <- sample(0:10, 100, TRUE))
#' threshrank(a_vector, 5, keep0s=TRUE)
#' data(GlobalPatterns)
#' GP <- GlobalPatterns
#' ## These three approaches result in identical otu_table
#' (x1 <- transform_sample_counts( otu_table(GP), threshrankfun(500)) )
#' (x2 <- otu_table(apply(otu_table(GP), 2, threshrankfun(500)), taxa_are_rows(GP)) )
#' identical(x1, x2)
#' (x3 <- otu_table(apply(otu_table(GP), 2, threshrank, thresh=500), taxa_are_rows(GP)) )
#' identical(x1, x3)
threshrank <- function(x, thresh, keep0s=FALSE, ...){
if( keep0s ){ index0 <- which(x == 0) }
x <- rank(x, ...)
thresh <- thresh[1]
x[x<thresh] <- thresh
if( keep0s ){ x[index0] <- 0 }
return(x)
}
####################################################################################
#' A closure version of the \code{threshrank} function.
#'
#' Takes the same arguments as \code{\link{threshrank}}, except for \code{x},
#' because the output is a single-argument function rather than a rank-transformed numeric.
#' This is useful for higher-order functions that require a single-argument function as input,
#' like \code{\link{transform_sample_counts}}.
#'
#' @usage threshrankfun(thresh, keep0s=FALSE, ...)
#'
#' @param thresh A single numeric value giving the threshold.
#' @param keep0s A logical determining whether 0's in \code{x} should remain
#' a zero-value in the output. If FALSE, zeros are treated as any other value.
#' @param ... Further arguments passes to the \code{\link{rank}} function.
#'
#' @return A single-argument function with the options to \code{\link{threshrank}} set.
#'
#' @seealso \code{\link{transform_sample_counts}}, \code{\link{threshrankfun}},
#' \code{\link{threshrank}}
#' @export
#' @examples
#' data(esophagus)
#' x1 = transform_sample_counts(esophagus, threshrankfun(50))
#' otu_table(x1)
#' x2 = transform_sample_counts(esophagus, rank)
#' otu_table(x2)
#' identical(x1, x2)
threshrankfun <- function(thresh, keep0s=FALSE, ...){
function(x){
threshrank(x, thresh, keep0s=FALSE, ...)
}
}
################################################################################
#' Transpose \code{\link{otu_table-class}} or \code{\link{phyloseq-class}}
#'
#' Extends the base transpose method, \code{\link[base]{t}}.
#'
#' @usage t(x)
#'
#' @param x An \code{otu_table} or \code{\link{phyloseq-class}}.
#'
#' @return The class of the object returned by \code{t} matches
#' the class of the argument, \code{x}. The \code{otu_table} is
#' transposed, and \code{\link{taxa_are_rows}} value is toggled.
#'
#' @name t
#' @rdname transpose-methods
#' @docType methods
#' @export
#' @examples
#' data(GlobalPatterns)
#' otu_table(GlobalPatterns)
#' t( otu_table(GlobalPatterns) )
setGeneric("t")
#' @aliases t,otu_table-method
#' @rdname transpose-methods
setMethod("t", signature("otu_table"), function(x){
#new("otu_table", t([email protected]), taxa_are_rows = (!taxa_are_rows(x)))
x <- otu_table( t(as(x, "matrix")), taxa_are_rows=(!taxa_are_rows(x)) )
return(x)
})
################################################################################
#' @aliases t,phyloseq-method
#' @rdname transpose-methods
setMethod("t", signature("phyloseq"), function(x){
x@otu_table <- t( otu_table(x) )
return(x)
})
################################################################################
#' Transform abundance data in an \code{otu_table}, sample-by-sample.
#'
#' This function transforms the sample counts of a taxa
#' abundance matrix according to a user-provided function.
#' The counts of each sample will be transformed individually. No sample-sample
#' interaction/comparison is possible by this method.
#'
#' @usage transform_sample_counts(physeq, fun, ...)
#'
#' @param physeq (Required). \code{\link{phyloseq-class}} of \code{\link{otu_table-class}}.
#'
#' @param fun (Required). A single-argument function that will be applied
#' to the abundance counts of each sample.
#' Can be an anonymous \code{\link[base]{function}}.
#'
#' @param ... (Optional). Additional, optionally-named, arguments passed to
#' \code{fun} during transformation of abundance data.
#'
#' @return A transformed \code{otu_table} -- or \code{phyloseq} object with its
#' transformed \code{otu_table}.
#' In general, trimming is not expected by this
#' method, so it is suggested that the user provide only functions that return
#' a full-length vector. Filtering/trimming can follow, for which the
#' \code{\link{genefilter_sample}} and \code{\link{prune_taxa}} functions
#' are suggested.
#'
#' @seealso \code{\link{threshrankfun}}, \code{\link{rank}}, \code{\link{log}}
#'
#' @docType methods
#' @aliases transform_sample_counts transformSampleCounts
#' @rdname transformcounts
#' @export
#'
#' @examples #
#' data(esophagus)
#' x1 = transform_sample_counts(esophagus, threshrankfun(50))
#' head(otu_table(x1), 10)
#' x2 = transform_sample_counts(esophagus, rank)
#' head(otu_table(x2), 10)
#' identical(x1, x2)
#' x3 = otu_table(esophagus) + 5
#' x3 = transform_sample_counts(x3, log)
#' head(otu_table(x3), 10)
#' x4 = transform_sample_counts(esophagus, function(x) round(x^2.2, 0))
#' head(otu_table(x4), 10)
transform_sample_counts <- function(physeq, fun, ...){
# Test the user-provided function returns a vector of the same length as input.
if( !identical(length(fun(1:10)), 10L) ){stop("`fun` not valid function.")}
# Check orientation, transpose if-needed to make apply work properly.
if( taxa_are_rows(physeq) ){
newphyseq = apply(as(otu_table(physeq), "matrix"), 2, fun, ...)
if( identical(ntaxa(physeq), 1L) ){
# Fix the dropped index when only 1 OTU.
newphyseq <- matrix(newphyseq, 1L, nsamples(physeq), TRUE,
list(taxa_names(physeq), sample_names(physeq)))
}
} else {
newphyseq = apply(t(as(otu_table(physeq), "matrix")), 2, fun, ...)
if( identical(ntaxa(physeq), 1L) ){
# Fix the dropped index when only 1 OTU.
newphyseq <- matrix(newphyseq, 1L, nsamples(physeq), TRUE,
list(taxa_names(physeq), sample_names(physeq)))
}
newphyseq = t(newphyseq)
}
# Check that original and new dimensions agree. Error if not.
if( !identical(dim(newphyseq), dim(otu_table(physeq))) ){
stop("Dimensions of OTU table change after apply-ing function. \n",
" Please check both function and table")
}
otu_table(physeq) <- otu_table(newphyseq, taxa_are_rows=taxa_are_rows(physeq))
return(physeq)
}
####################################################################################
#' @rdname transformcounts
#' @export
transformSampleCounts <- transform_sample_counts
####################################################################################
############################################################
#' Filter OTUs with arbitrary function, sample-wise.
#'
#' A general OTU trimming function for selecting OTUs that satisfy
#' some criteria within the distribution of each sample, and then
#' also an additional criteria for number of samples that must pass.
#' This is a genefilter-like function that only considers sample-wise
#' criteria. The number of acceptable samples is used
#' as the final criteria (set by the argument \code{A})
#' to determine whether or not the taxa should
#' be retained (\code{TRUE}) or not (\code{FALSE}). Just like with genefilter, a
#' logical having length equal to nrow()/\code{\link{ntaxa}} is returned, indicating which
#' should be kept. This output can be provided
#' directly to OTU trimming function, \code{\link{prune_taxa}}.
#' By contrast, \code{\link[genefilter]{genefilter}},
#' of the genefilter package in Bioconductor,
#' works only on the rows of a matrix. Note that, because \code{\link{otu_table-class}}
#' inherits directly from the \code{\link{matrix-class}}, an unmodified
#' otu_table can be provided to \code{genefilter}, but be mindful of the orientation
#' of the otu_table (use \code{\link{taxa_are_rows}}),
#' and transpose (\code{\link[phyloseq]{t}}) if needed.
#'
#' @usage genefilter_sample(X, flist, A=1)
#'
#' @param X The object that needs trimming. Can be matrix, otu_table, or higher-
#' order phyloseq classes that contain an otu_table.
#'
#' @param flist An enclosure object, typically created with \code{\link{filterfun_sample}}
#'
#' @param A An integer. The number of samples in which a taxa / OTUs passed the filter
#' for it to be labeled TRUE in the output logical vector.
#'
#' @return A logical vector with names equal to taxa_names (or rownames, if matrix).
#'
#' @seealso \code{\link[genefilter]{genefilter}}, \code{\link{filterfun_sample}},
#' \code{\link[phyloseq]{t}},
#' \code{\link{prune_taxa}}
#' @keywords agglomerate OTU cluster tree
#'
#' @rdname genefilter_sample-methods
#' @docType methods
#' @export
#'
#' @examples #
#' ## testOTU <- otu_table(matrix(sample(1:50, 25, replace=TRUE), 5, 5), taxa_are_rows=FALSE)
#' ## f1 <- filterfun_sample(topk(2))
#' ## wh1 <- genefilter_sample(testOTU, f1, A=2)
#' ## wh2 <- c(TRUE, TRUE, TRUE, FALSE, FALSE)
#' ## prune_taxa(wh1, testOTU)
#' ## prune_taxa(wh2, testOTU)
#' ##
#' ## tax_table1 <- tax_table(matrix("abc", 5, 5))
#' ## prune_taxa(wh1, tax_table1)
#' ## prune_taxa(wh2, tax_table1)
setGeneric("genefilter_sample", function(X, flist, A=1) standardGeneric("genefilter_sample"))
#' @rdname genefilter_sample-methods
#' @aliases genefilter_sample,matrix-method
setMethod("genefilter_sample", signature("matrix"), function(X, flist, A=1){
TFmat = apply(X, 2, flist)
apply(TFmat, 1, function(x, A){sum(x) >= A}, A)
})
#' @rdname genefilter_sample-methods
#' @aliases genefilter_sample,otu_table-method
setMethod("genefilter_sample", signature("otu_table"), function(X, flist, A=1){
if( taxa_are_rows(X) ){
genefilter_sample( as(X, "matrix"), flist, A)
} else {
genefilter_sample( t(as(X, "matrix")), flist, A)
}
})
#' @rdname genefilter_sample-methods
#' @aliases genefilter_sample,phyloseq-method
setMethod("genefilter_sample", signature("phyloseq"), function(X, flist, A=1){
genefilter_sample(otu_table(X), flist, A)
})
################################################################################
#' A sample-wise filter function builder
#' analogous to \code{\link[genefilter]{filterfun}}.
#'
#' See the \code{\link[genefilter]{filterfun}}, from the Bioconductor repository,
#' for a taxa-/gene-wise filter (and further examples).
#'
#' @usage filterfun_sample(...)
#'
#' @param ... A comma-separated list of functions.
#'
#' @return An enclosure (function) that itself will return a logical vector,
#' according to the
#' functions provided in the argument list, evaluated in order. The output of
#' filterfun_sample is appropriate for the `flist' argument to the
#' genefilter_sample method.
#'
#' @export
#' @seealso \code{\link[genefilter]{filterfun}}, \code{\link{genefilter_sample}}
#' @examples
#' # Use simulated abundance matrix
#' set.seed(711)
#' testOTU <- otu_table(matrix(sample(1:50, 25, replace=TRUE), 5, 5), taxa_are_rows=FALSE)
#' f1 <- filterfun_sample(topk(2))
#' wh1 <- genefilter_sample(testOTU, f1, A=2)
#' wh2 <- c(TRUE, TRUE, TRUE, FALSE, FALSE)
#' prune_taxa(wh1, testOTU)
#' prune_taxa(wh2, testOTU)
filterfun_sample = function(...){
flist <- list(...)
if( length(flist) == 1 && is.list(flist[[1]])) { flist <- flist[[1]] }
f = function(x){
# initialize fval (a logical vector)
fun = flist[[1]]
fval = fun(x)
# check the remaining functions. Compare & logic, element-wise, each loop.
for(fun in flist[-1]){
fval = fval & fun(x)
}
return(fval)
}
class(f) <- "filterfun"
return(f)
}
################################################################################
#' Filter taxa based on across-sample OTU abundance criteria
#'
#' This function is directly analogous to the
#' \code{\link[genefilter]{genefilter}} function for microarray filtering,
#' but is used for filtering OTUs from phyloseq objects.
#' It applies an arbitrary set of functions ---
#' as a function list, for instance, created by \code{\link[genefilter]{filterfun}} ---
#' as across-sample criteria, one OTU at a time.
#' It takes as input a phyloseq object,