diff --git a/R/tree.R b/R/tree.R index 5cdc20d1..ba37998f 100755 --- a/R/tree.R +++ b/R/tree.R @@ -31,200 +31,284 @@ ## Functions ## ############### -########################### -## Approach 0 | FastTree2.0 -########################### -## !! FastTree will only work if there are unique sequence names!! -#' convertFA2Tree -#' -#' @param fa_path Path to the input FASTA alignment file (.fa). Default is the -#' path to "data/alns/pspa_snf7.fa". -#' @param tre_path Path to the output file where the generated tree (.tre) will -#' be saved. Default is the path to "data/alns/pspa_snf7.tre". -#' @param fasttree_path Path to the FastTree executable, which is used to -#' generate the phylogenetic tree. Default is "src/FastTree". -#' -#' @return No return value. The function generates a tree file (.tre) from the -#' input FASTA file. -#' @export -#' -#' @examples -#' \dontrun{ -#' convert_fa2tre(here("data/alns/pspa_snf7.fa"), -#' here("data/alns/pspa_snf7.tre"), -#' here("src/FastTree") -#' } -convertFA2Tree <- function(fa_path = here("data/alns/pspa_snf7.fa"), - tre_path = here("data/alns/pspa_snf7.tre"), - fasttree_path = here("src/FastTree")) { - # fa_path=here("data/alns/pspa_snf7.fa") - # tre_path=here("data/alns/pspa_snf7.tre") - # fasttree_path=here("src/FastTree") - print(fa_path) - system2( - command = fasttree_path, - args = paste(c(fa_path, ">", tre_path), - sep = "", collapse = " " - ) - ) - ################################ - ## Compiling FastTree.c on a Mac - ################################ - ## Other platforms? Check http://www.microbesonline.org/fasttree/ - # system(paste("gcc -DNO_SSE -O3 -finline-functions -funroll-loops -Wall -o", - # here("src/FastTree"), - # here("src/FastTree.c"), "-lm", collapse=" ")) -} -## Generate Trees for ALL fasta files in "data/alns" -#' convertAlignment2Trees -#' -#' @description -#' Generate Trees for ALL fasta files in "data/alns" -#' -#' @param aln_path Path to the directory containing all the alignment FASTA -#' files (.fa) for which trees will be generated. Default is "data/alns/". -#' -#' -#' @importFrom here here -#' @importFrom purrr pmap -#' @importFrom stringr str_replace_all -#' -#' @return No return value. The function generates tree files (.tre) for each -#' alignment file in the specified directory. -#' @export -#' -#' @examples -#' \dontrun{ -#' generate_trees(here("data/alns/")) -#' } -convertAlignment2Trees <- function(aln_path = here("data/alns/")) { - # finding all fasta alignment files - fa_filenames <- list.files(path = aln_path, pattern = "*.fa") - fa_paths <- paste0(aln_path, fa_filenames) - variable <- str_replace_all(basename(fa_filenames), - pattern = ".fa", replacement = "" - ) - - ## Using purrr::pmap to generate trees for each of the fa files! - fa2tre_args <- list( - fa_path = fa_paths, - tre_path = paste0(aln_path, variable, ".tre") - ) - pmap( - .l = fa2tre_args, .f = convertFA2Tree, - fasttree_path = here("src/FastTree") - ) -} - ############################## -## REFS: 1-4 +## Approach 0-4 ############ +#' Approach 0: !! FastTree will only work if there are unique sequence names!! #' createFA2Tree -#' +#' @author Klangina, Outreachy Applicant #' @author Janani Ravi, MolEcologist -#' @keywords phylogenetic tree, alignment, fasta +#' @keywords phylogenetic tree, alignment, fasta, FastTree #' @description -#' Generating phylogenetic tree from alignment file '.fa' +#' Generates a phylogenetic tree from an alignment file '.fa' using either FastTree or a detailed phylogenetic analysis approach. #' #' @param fa_file Character. Path to the alignment FASTA file (.fa) from which -#' the phylogenetic tree will be generated. Default is 'pspa_snf7.fa'. -#' @param out_file Path to the output file where the generated tree (.tre) will +#' the phylogenetic tree will be generated. Default is "data/alns/pspa_snf7.fa". +#' @param out_file Character. Path to the output file where the generated tree (.tre) will #' be saved. Default is "data/alns/pspa_snf7.tre". +#' @param enable_fast Logical. If TRUE, uses FastTree approach; if FALSE, uses detailed phylogenetic analysis. Default is TRUE. +#' @param fasttree_path Character. Path to the FastTree executable. Default is "src/FastTree". +#' @param format Character. Format of the input file. Default is "fasta". +#' @param seq_type Character. Type of sequences in the input file. Default is "AA" (amino acid). +#' @param dist_matrix Character. Distance matrix to be used. Default is "similarity". +#' @param subset_size Numeric. Number of sequences to subset for analysis. Default is 10. +#' @param model_test Character. Model to use for model testing. Default is "all". +#' @param dna_model Character. Model to use for DNA distance calculation. Default is "JC69". +#' @param ml_model Character. Model to use for maximum likelihood estimation. Default is "JC". +#' @param rearrangement Character. Type of tree rearrangement for ML optimization. Default is "stochastic". +#' @param bootstrap_reps Numeric. Number of bootstrap replicates. Default is 100. +#' @param bootstrap_p Numeric. P-value threshold for bootstrap support. Default is 50. +#' @param plot_type Character. Type of plot for bootstrap tree. Default is "p". +#' @param bootstrap_file Character. Path to save the bootstrap tree file. Default is "bootstrap_example.tre". #' #' @importFrom ape write.tree #' @importFrom phangorn bootstrap.pml dist.ml NJ modelTest phyDat plotBS pml pml.control pratchet optim.parsimony optim.pml read.phyDat upgma #' @importFrom seqinr dist.alignment read.alignment #' @importFrom stats logLik #' -#' @return No return value. The function generates a phylogenetic tree file -#' (.tre) based on different approaches like Neighbor Joining, UPGMA, and -#' Maximum Likelihood. +#' @return No return value. The function generates phylogenetic tree files +#' (.tre) based on either FastTree or detailed phylogenetic analysis approaches. +#' It also produces various plots and console outputs when using the detailed approach. #' @export #' -#' @details The alignment file would need two columns: 1. accession + -#' number and 2. alignment. The protein homolog accession to lineage mapping + -#' file should have +#' @details The function offers two main approaches for phylogenetic tree construction: +#' 1. FastTree approach: A quick method using the FastTree program. +#' 2. Detailed phylogenetic analysis: Includes model testing, distance matrix calculation, +#' UPGMA and NJ tree construction, parsimony searches, and maximum likelihood estimation +#' with bootstrapping. +#' The alignment file should be in FASTA format with sequences properly aligned. +#' When using FastTree, ensure that the FastTree executable is correctly installed and +#' the path is properly specified. #' -#' @note Please refer to the source code if you have alternate + -#' file formats and/or column names. +#' @note This function requires the ape, phangorn, and seqinr packages to be installed. +#' The detailed analysis approach performs computationally intensive operations and may +#' take a significant amount of time to run, especially for large datasets or with high +#' numbers of bootstrap replicates. FastTree approach is significantly faster but may +#' provide less detailed results. #' #' @examples #' \dontrun{ -#' generate_aln2tree("pspa_snf7.fa") +#' # Using FastTree approach +#' createFA2Tree("data/alns/my_alignment.fa", "data/trees/my_tree.tre", enable_fast = TRUE) +#' +#' # Using detailed phylogenetic analysis approach +#' createFA2Tree("data/alns/protein_align.fa", "data/trees/protein_tree.tre", +#' enable_fast = FALSE, subset_size = 20, bootstrap_reps = 500) #' } -createFA2Tree <- function(fa_file = "data/alns/pspa_snf7.fa", - out_file = "data/alns/pspa_snf7.tre") { +createFA2Tree <- function( + fa_file = "data/alns/pspa_snf7.fa", + out_file = "data/alns/pspa_snf7.tre", + enable_fast = TRUE, + fasttree_path = "src/FastTree", + format = "fasta", + seq_type = "AA", + dist_matrix = "similarity", + subset_size = 10, + model_test = "all", + dna_model = "JC69", + ml_model = "JC", + rearrangement = "stochastic", + bootstrap_reps = 100, + bootstrap_p = 50, + plot_type = "p", + bootstrap_file = "bootstrap_example.tre" +) { ## SAMPLE ARGS # fa_file="data/alns/pspa_snf7.fa" # out_file="data/alns/pspa_snf7.tre" + # fasttree_path="src/FastTree" - ########################### - ## Approach 1 - ########################### - ## https://a-little-book-of-r-for-bioinformatics.readthedocs.io/en/latest/src/chapter5.html - prot_aln <- read.alignment(file = fa_file, format = "fasta") - prot_aln$seq - prot_dist <- dist.alignment(prot_aln, matrix = "similarity") - prot_nj <- NJ(prot_dist) - prot_nj_tree <- plot(prot_nj, main = "Neighbor Joining") - write.tree(phy = prot_nj_tree, file = tre_path) + if (enable_fast) { + # FastTree approach + print(fa_file) + system2( + command = fasttree_path, + args = paste(c(fa_file, ">", out_file), + sep = "", collapse = " " + ) + ) + ################################ + ## Compiling FastTree.c on a Mac + ################################ + ## Other platforms? Check http://www.microbesonline.org/fasttree/ + # system(paste("gcc -DNO_SSE -O3 -finline-functions -funroll-loops -Wall -o", + # here("src/FastTree"), + # here("src/FastTree.c"), "-lm", collapse=" ")) + } else { + # Detailed phylogenetic analysis approach + ########################### + ## Approach 1 + ########################### + ## https://a-little-book-of-r-for-bioinformatics.readthedocs.io/en/latest/src/chapter5.html + prot_aln <- read.alignment(file = fa_file, format = format) + prot_aln$seq + prot_dist <- dist.alignment(prot_aln, matrix = dist_matrix) + prot_nj <- NJ(prot_dist) + prot_nj_tree <- plot(prot_nj, main = "Neighbor Joining") + write.tree(phy = prot_nj_tree, file = out_file) - ########################### - ## Approach 2 - ########################### - ## Alignment file formats and conversion - # read in sequence data, convert to phyDat - prot_fa <- read.phyDat(fa_file, format = "fasta", type = "AA") - prot_phyDat <- phyDat(prot_fa, type = "AA", levels = NULL) - prot10 <- subset(prot_phyDat, 1:10) - prot10_phyDat <- phyDat(prot10, type = "AA", levels = NULL) + ########################### + ## Approach 2 + ########################### + ## Alignment file formats and conversion + # read in sequence data, convert to phyDat + prot_fa <- read.phyDat(fa_file, format = format, type = seq_type) + prot_phyDat <- phyDat(prot_fa, type = seq_type, levels = NULL) + prot_subset <- subset(prot_phyDat, 1:subset_size) + prot_subset_phyDat <- phyDat(prot_subset, type = seq_type, levels = NULL) - ## Model Testing & Distance Matrices - ## Comparison of different nucleotide or amino acid substitution models - mt <- modelTest(prot10, model = "all") - print(mt) + ## Model Testing & Distance Matrices + ## Comparison of different nucleotide or amino acid substitution models + mt <- modelTest(prot_subset, model = model_test) + print(mt) - # estimate a distance matrix using a Jules-Cantor Model - dna_dist <- dist.ml(prot10, model = "JC69") + # estimate a distance matrix using a Jules-Cantor Model + dna_dist <- dist.ml(prot_subset, model = dna_model) - ## Neighbor Joining, UPGMA, and Maximum Parsimony - # quick and dirty UPGMA and NJ trees - prot_UPGMA <- upgma(dna_dist) - prot_NJ <- NJ(dna_dist) - plot(prot_UPGMA, main = "UPGMA") - plot(prot_NJ, main = "Neighbor Joining") + ## Neighbor Joining, UPGMA, and Maximum Parsimony + # quick and dirty UPGMA and NJ trees + prot_UPGMA <- upgma(dna_dist) + prot_NJ <- NJ(dna_dist) + plot(prot_UPGMA, main = "UPGMA") + plot(prot_NJ, main = "Neighbor Joining") - # parsimony searches - prot_optim <- optim.parsimony(prot_NJ, prot) - prot_pratchet <- pratchet(prot10) # returning error - plot(prot_optim) - plot(prot_pratchet) + # parsimony searches + prot_optim <- optim.parsimony(prot_NJ, prot_phyDat) + prot_pratchet <- pratchet(prot_subset) # returning error + plot(prot_optim) + plot(prot_pratchet) - ## Maximum likelihood and Bootstrapping - # ml estimation w/ distance matrix - fit <- pml(prot_NJ, prot10) - print(fit) - fitJC <- optim.pml(fit, model = "JC", rearrangement = "stochastic") - logLik(fitJC) - bs <- bootstrap.pml(fitJC, - bs = 100, optNni = TRUE, multicore = TRUE, - control = pml.control(trace = 0) - ) - plotBS(midpoint(fitJC$tree), bs, p = 50, type = "p") + ## Maximum likelihood and Bootstrapping + # ml estimation w/ distance matrix + fit <- pml(prot_NJ, prot_subset) + print(fit) + fitJC <- optim.pml(fit, model = ml_model, rearrangement = rearrangement) + logLik(fitJC) + bs <- bootstrap.pml(fitJC, + bs = bootstrap_reps, optNni = TRUE, multicore = TRUE, + control = pml.control(trace = 0) + ) + plotBS(midpoint(fitJC$tree), bs, p = bootstrap_p, type = plot_type) - # subsetted alignment bs example - prot10_dm <- dist.ml(prot10) - prot10_NJ <- NJ(prot10_dm) - fit2 <- pml(prot10_NJ, data = prot10) - print(fit2) - fitJC2 <- optim.pml(fit2, model = "JC", rearrangement = "stochastic") - logLik(fitJC2) - bs_subset <- bootstrap.pml(fitJC2, - bs = 100, optNni = TRUE, multicore = TRUE, - control = pml.control(trace = 0) - ) - bs2 <- plotBS(midpoint(fitJC2$tree), bs, p = 50, type = "p") + # subsetted alignment bs example + prot_subset_dm <- dist.ml(prot_subset) + prot_subset_NJ <- NJ(prot_subset_dm) + fit2 <- pml(prot_subset_NJ, data = prot_subset) + print(fit2) + fitJC2 <- optim.pml(fit2, model = ml_model, rearrangement = rearrangement) + logLik(fitJC2) + bs_subset <- bootstrap.pml(fitJC2, + bs = bootstrap_reps, optNni = TRUE, multicore = TRUE, + control = pml.control(trace = 0) + ) + bs2 <- plotBS(midpoint(fitJC2$tree), bs, p = bootstrap_p, type = plot_type) - ## Exporting Trees - write.tree(bs2, file = "bootstrap_example.tre") + ## Exporting Trees + write.tree(bs2, file = bootstrap_file) + } } +#' convertAlignment2Trees +#' @author Klangina, Outreachy Applicant +#' @description +#' Generate Trees for ALL fasta files in the specified directory. This function processes +#' multiple alignment files and creates phylogenetic trees for each, using either FastTree +#' or a detailed phylogenetic analysis approach. +#' +#' @param aln_path Character. Path to the directory containing all the alignment FASTA +#' files (.fa) for which trees will be generated. Default is here("data/alns/"). +#' @param enable_fast Logical. If TRUE, uses FastTree approach; if FALSE, uses detailed +#' phylogenetic analysis. Default is TRUE. +#' @param fasttree_path Character. Path to the FastTree executable. Default is here("src/FastTree"). +#' @param format Character. Format of the input files. Default is "fasta". +#' @param seq_type Character. Type of sequences in the input files. Default is "AA" (amino acid). +#' @param dist_matrix Character. Distance matrix to be used. Default is "similarity". +#' @param subset_size Numeric. Number of sequences to subset for analysis. Default is 10. +#' @param model_test Character. Model to use for model testing. Default is "all". +#' @param dna_model Character. Model to use for DNA distance calculation. Default is "JC69". +#' @param ml_model Character. Model to use for maximum likelihood estimation. Default is "JC". +#' @param rearrangement Character. Type of tree rearrangement for ML optimization. Default is "stochastic". +#' @param bootstrap_reps Numeric. Number of bootstrap replicates. Default is 100. +#' @param bootstrap_p Numeric. P-value threshold for bootstrap support. Default is 50. +#' @param plot_type Character. Type of plot for bootstrap tree. Default is "p". +#' @param bootstrap_file_suffix Character. Suffix for bootstrap tree files. Default is "_bootstrap.tre". +#' +#' @importFrom here here +#' @importFrom purrr pmap +#' @importFrom stringr str_replace_all +#' +#' @return No return value. The function generates tree files (.tre) and bootstrap files +#' for each alignment file in the specified directory. +#' @export +#' +#' @details This function processes all .fa files in the specified directory. For each file, +#' it generates a phylogenetic tree using either FastTree (if enable_fast is TRUE) or a +#' detailed phylogenetic analysis approach (if enable_fast is FALSE). The detailed approach +#' includes model testing, distance matrix calculation, UPGMA and NJ tree construction, +#' parsimony searches, and maximum likelihood estimation with bootstrapping. +#' +#' @note This function requires the ape, phangorn, and seqinr packages to be installed. +#' The detailed analysis approach performs computationally intensive operations and may +#' take a significant amount of time to run, especially for large datasets or with high +#' numbers of bootstrap replicates. FastTree approach is significantly faster but may +#' provide less detailed results. +#' +#' @examples +#' \dontrun{ +#' # Using FastTree approach for all alignments in the default directory +#' convertAlignment2Trees() +#' +#' # Using detailed phylogenetic analysis for alignments in a specific directory +#' convertAlignment2Trees(aln_path = "path/to/alignments/", +#' enable_fast = FALSE, +#' subset_size = 20, +#' bootstrap_reps = 500) +#' } +convertAlignment2Trees <- function( + aln_path = here("data/alns/"), + enable_fast = TRUE, + fasttree_path = here("src/FastTree"), + format = "fasta", + seq_type = "AA", + dist_matrix = "similarity", + subset_size = 10, + model_test = "all", + dna_model = "JC69", + ml_model = "JC", + rearrangement = "stochastic", + bootstrap_reps = 100, + bootstrap_p = 50, + plot_type = "p", + bootstrap_file_suffix = "_bootstrap.tre" +) { + # finding all fasta alignment files + fa_filenames <- list.files(path = aln_path, pattern = "*.fa") + fa_paths <- paste0(aln_path, fa_filenames) + variable <- str_replace_all(basename(fa_filenames), + pattern = ".fa", replacement = "" + ) + + ## Using purrr::pmap to generate trees for each of the fa files! + fa2tre_args <- list( + fa_file = fa_paths, + out_file = paste0(aln_path, variable, ".tre"), + enable_fast = enable_fast, + fasttree_path = fasttree_path, + format = format, + seq_type = seq_type, + dist_matrix = dist_matrix, + subset_size = subset_size, + model_test = model_test, + dna_model = dna_model, + ml_model = ml_model, + rearrangement = rearrangement, + bootstrap_reps = bootstrap_reps, + bootstrap_p = bootstrap_p, + plot_type = plot_type, + bootstrap_file = paste0(aln_path, variable, bootstrap_file_suffix) + ) + + pmap( + .l = fa2tre_args, + .f = createFA2Tree + ) +} \ No newline at end of file