Improve Readme

Sudaraka88 · Apr 19, 2024 · 67bf09a · 67bf09a
1 parent e354000
commit 67bf09a
Show file tree

Hide file tree

Showing 11 changed files with 57 additions and 32 deletions.
diff --git a/.gitignore b/.gitignore
@@ -7,3 +7,4 @@
 .DS_Store
 *.Rmd
 Makevars
+testscripts.R
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -10,7 +10,7 @@ Encoding: UTF-8
 LazyData: true
 biocViews: Software
 Depends: R (>= 4.0.0),
-Imports: 
+Imports:          
          ape,
          Biostrings,
          chromoMap,
@@ -19,9 +19,7 @@ Imports:
          fitdistrplus,
          GenomicRanges,
          GenomeInfoDb,
-         ggnewscale,
          ggplot2,
-         ggtree,
          ggraph,
          grDevices,
          heatmap3,
@@ -32,7 +30,6 @@ Imports:
          MatrixExtra,
          methods,
          parallel,
-         phytools,
          plyr,
          RColorBrewer,
          Rcpp,
@@ -42,6 +39,9 @@ Imports:
          utils,
          VariantAnnotation
 Suggests:
+         phytools,
+         ggtree,
+         ggnewscale,
          spam,
          spam64
 LinkingTo: Rcpp, RcppArmadillo

diff --git a/NAMESPACE b/NAMESPACE
@@ -66,7 +66,6 @@ importFrom(S4Vectors,subjectHits)
 importFrom(VariantAnnotation,VRanges)
 importFrom(VariantAnnotation,makeVRangesFromGRanges)
 importFrom(ape,read.gff)
-importFrom(ape,read.tree)
 importFrom(chromoMap,chromoMap)
 importFrom(data.table,"%between%")
 importFrom(data.table,.I)
@@ -76,7 +75,6 @@ importFrom(data.table,setattr)
 importFrom(dplyr,`%>%`)
 importFrom(dplyr,summarise)
 importFrom(fitdistrplus,fitdist)
-importFrom(ggnewscale,new_scale_fill)
 importFrom(ggplot2,aes)
 importFrom(ggplot2,facet_wrap)
 importFrom(ggplot2,geom_point)
@@ -91,8 +89,6 @@ importFrom(ggraph,geom_edge_arc2)
 importFrom(ggraph,geom_node_label)
 importFrom(ggraph,ggraph)
 importFrom(ggraph,scale_edge_colour_discrete)
-importFrom(ggtree,ggtree)
-importFrom(ggtree,gheatmap)
 importFrom(grDevices,colorRampPalette)
 importFrom(grDevices,png)
 importFrom(heatmap3,heatmap3)
@@ -103,7 +99,6 @@ importFrom(methods,as)
 importFrom(methods,is)
 importFrom(methods,new)
 importFrom(parallel,detectCores)
-importFrom(phytools,midpoint.root)
 importFrom(plyr,.)
 importFrom(plyr,ddply)
 importFrom(stats,coef)

diff --git a/R/BacGWES.R b/R/BacGWES.R
@@ -35,7 +35,7 @@
 #' @param ncores specify the number of cores to use for parallel processing (default = NULL), will auto detect if NULL
 #' @param max_blk_sz specify maximum block size for MI computation (default = 10000), larger sizes require more RAM, range 1000 - 100000
 #' @param save_additional_outputs specify whether to save outputs such as extracted SNPs and Hamming distance weights. Recommended for very large datasets to save time on re-computation (default = F)
-#'
+#' @param mega_dset To analyse megascale datasets using spam and spam64 packages, set to TRUE (default = F). Only do so if the normal analysis fails (since LDWeaver 1.5)
 #'
 #' @return Numeric Value 0 if successful (all generated outputs will be saved)
 #'
@@ -50,7 +50,8 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
                     SnpEff_Annotate = T, sr_dist = 20000, lr_retain_links = 1e6,
                     max_tophits = 250, num_clusts_CDS = 3, srp_cutoff = 3, tanglegram_break_segments = 5,
                     write_gwesExplorer = T, multicore = T, max_blk_sz = 10000, ncores = NULL,
-                    save_additional_outputs = F){
+                    save_additional_outputs = F, mega_dset = F){
+
   # Build blocks
   # BLK1: Extract SNPs and create sparse Mx from MSA (fasta)
   # BLK2: Parse GBK or GFF+REF
@@ -65,10 +66,10 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
   # BLK11: Cleanup
 
   #TODO: Provide the option to skip SNP extraction and use the whole provided alignment (redundant if pre-filtered)
-  #TODO: Add the option to provide genbank file without reference sequence
+  #TODO: Add the option to provide GFF file without reference sequence
   #TODO: Count through blocks and automate the displayed BLOCK NUMBER
-  #TODO: genbankr is being droped from the newest bioconductor, add alternative (https://github.com/gmbecker/genbankr)
   #TODO: Add Hamming Distance plot, can we have a SNP Tree + Hamming Distance weights to show population structure control?
+  #TODO: Drop ggtree dependency
 
   #NOTE: SnpEff does not parse the GBK and GFF3 file from the same refseq reference genome the same way. There might be differences between annotations/tophits/etc.
   # # Welcome message # #
@@ -158,6 +159,8 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
     validate_ref_ann_lengths = F
   }
 
+
+
   # normalise_input_paths
   aln_path = normalizePath(aln_path)
   if(!is.null(gbk_path)) gbk_path = normalizePath(gbk_path)
@@ -318,7 +321,7 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
   cat("\n\n #################### BLOCK 3 #################### \n\n")
   if(!file.exists(cds_var_path)) {
     cat("Estimating the variation in CDS \n")
-    cds_var = LDWeaver::estimate_variation_in_CDS(gbk = gbk, gff = gff, snp.dat = snp.dat, ncores = ncores, num_clusts_CDS = num_clusts_CDS, clust_plt_path = clust_plt_path)
+    cds_var = LDWeaver::estimate_variation_in_CDS(gbk = gbk, gff = gff, snp.dat = snp.dat, ncores = ncores, num_clusts_CDS = num_clusts_CDS, clust_plt_path = clust_plt_path, mega_dset = mega_dset)
     if(save_additional_outputs){
       saveRDS(cds_var, cds_var_path)
     }

diff --git a/R/estimateCDSDiversity.R b/R/estimateCDSDiversity.R
@@ -15,6 +15,7 @@
 #' @param gff output from parsing the gff3 file using LDWeaver::parse_gff_file()
 #' @param num_clusts_CDS partition the genome into num_clusts_CDS regions using k-means (default = 3)
 #' @param clust_plt_path specify path to save CDS variation plot
+#' @param mega_dset set to TRUE for mega scale datasets (default = F)
 #'
 #' @return R list with CDS variation and allele distribution details
 #'
@@ -23,7 +24,7 @@
 #' cds_var <- estimate_variation_in_CDS(gbk, snp.dat, ncores = 10)
 #' }
 #' @export
-estimate_variation_in_CDS = function(snp.dat, ncores, gbk = NULL, gff = NULL, num_clusts_CDS = 3, clust_plt_path = NULL){
+estimate_variation_in_CDS = function(snp.dat, ncores, gbk = NULL, gff = NULL, num_clusts_CDS = 3, clust_plt_path = NULL, mega_dset = FALSE){
   ## NOTE: genbankr deprecation, removed the following import
   # importFrom genbankr cds getSeq
 
@@ -57,12 +58,28 @@ estimate_variation_in_CDS = function(snp.dat, ncores, gbk = NULL, gff = NULL, nu
   var_estimate = rep(NA, ncds)
 
 
-  variation = matrix(c(Matrix::rowSums(snp.dat$snp.matrix_A),
-                       Matrix::rowSums(snp.dat$snp.matrix_C),
-                       Matrix::rowSums(snp.dat$snp.matrix_G),
-                       Matrix::rowSums(snp.dat$snp.matrix_T),
-                       Matrix::rowSums(snp.dat$snp.matrix_N)),
-                     ncol = snp.dat$nsnp, byrow = T)
+  if(mega_dset){ # Using SPAM
+    if(!requireNamespace("spam") & !requireNamespace("spam64")){
+      message("This feature requires spam and spam64 packages.")
+      return(invisible())
+    }
+    variation = matrix(c(spam::rowSums(snp.dat$snp.matrix_A),
+                         spam::rowSums(snp.dat$snp.matrix_C),
+                         spam::rowSums(snp.dat$snp.matrix_G),
+                         spam::rowSums(snp.dat$snp.matrix_T),
+                         spam::rowSums(snp.dat$snp.matrix_N)),
+                       ncol = snp.dat$nsnp, byrow = T)
+
+  } else { # Using the Matrix package
+    variation = matrix(c(Matrix::rowSums(snp.dat$snp.matrix_A),
+                         Matrix::rowSums(snp.dat$snp.matrix_C),
+                         Matrix::rowSums(snp.dat$snp.matrix_G),
+                         Matrix::rowSums(snp.dat$snp.matrix_T),
+                         Matrix::rowSums(snp.dat$snp.matrix_N)),
+                       ncol = snp.dat$nsnp, byrow = T)
+
+  }
+
 
   # Generate a reference masking mx with 0 at reference allele
   reference = matrix(rep(1, 5*snp.dat$nsnp), nrow = 5); .ACGTN2num(reference, ref, ncores)

diff --git a/R/extractSNPs_mega.R b/R/extractSNPs_mega.R
@@ -66,7 +66,7 @@ parse_fasta_alignment_mega <- function(aln_path, gap_freq = 0.15, maf_freq = 0.0
                                  nrow = snp.param$num.seqs, ncol = snp.param$num.snps)
   snp.data$i_T = snp.data$j_T = snp.data$x_T = NULL
 
-  snp.matrix_N_spam <- spam::spam(list(i=snp.data$i_A, j=snp.data$j_A, values=as.logical(snp.data$x_A)),
+  snp.matrix_N_spam <- spam::spam(list(i=snp.data$i_N, j=snp.data$j_N, values=as.logical(snp.data$x_N)),
                                  nrow = snp.param$num.seqs, ncol = snp.param$num.snps)
   snp.data = NULL
 
@@ -159,7 +159,7 @@ parse_fasta_SNP_alignment_mega <- function(aln_path, pos, gap_freq = 0.15, maf_f
                                   nrow = snp.param$num.seqs, ncol = snp.param$num.snps)
   snp.data$i_T = snp.data$j_T = snp.data$x_T = NULL
 
-  snp.matrix_N_spam <- spam::spam(list(i=snp.data$i_A, j=snp.data$j_A, values=as.logical(snp.data$x_A)),
+  snp.matrix_N_spam <- spam::spam(list(i=snp.data$i_N, j=snp.data$j_N, values=as.logical(snp.data$x_N)),
                                   nrow = snp.param$num.seqs, ncol = snp.param$num.snps)
   snp.data = NULL
 

diff --git a/R/preptrees.R b/R/preptrees.R
@@ -6,11 +6,7 @@
 #' can be provided. A subset of manually chosen links can also be provided as a data.frame. The fasta/position files can be generated using LDWeaver::generate_Links_SNPS_fasta().
 #' If metadata is available, they can be visualised along with the allele data in a separate panel.
 #'
-#' @importFrom ape read.tree
-#' @importFrom phytools midpoint.root
-#' @importFrom ggtree ggtree gheatmap
 #' @importFrom ggplot2 ggsave scale_fill_viridis_d
-#' @importFrom ggnewscale new_scale_fill
 #'
 #' @param tree_path Path to tree file (must be readable using ape::read.tree). tip.labels must exactly match the sequence names in the fasta and metadata files.
 #'
@@ -56,6 +52,13 @@ view_tree = function(tree_path, perform_midpoint_rooting = T, metadata_df = NULL
 
   # Change the file to give an optional fasta + pos file generated from LDWeaver::snpdat_to_fa()
 
+  ## There is a chance some people don't use LDWeaver to generate these plots, convert these requirements to suggestions
+  if(!requireNamespace("ape") & !requireNamespace("phytools") & !requireNamespace("ggtree") & !requireNamespace("ggnewscale")){
+    message("This feature requires ape, phytools, ggtrree and ggnewscale packages, please make sure they are installed.")
+    return(invisible())
+  }
+
+
 
   # sanity checks and IO
 

diff --git a/README.md b/README.md
@@ -1,13 +1,13 @@
+## Genomewide Co-selection and Epistasis in Bacteria <img src='images/icon.jpg' align="right" height="100" />
+
 <!-- badges: start -->
 
 [![R](https://github.com/Sudaraka88/LDWeaver/workflows/R-CMD-check/badge.svg)](https://github.com/Sudaraka88/LDWeaver/actions)
 [![DOI](https://zenodo.org/badge/590009521.svg)](https://zenodo.org/badge/latestdoi/590009521)
 [![LICENSE](https://anaconda.org/bioconda/r-ldweaver/badges/license.svg)](https://spdx.org/licenses/GPL-3.0-or-later.html)
 <!-- badges: end -->
 
-# LDWeaver
-
-## Genomewide Search for Evidence of Co-selection and Epistasis in Bacteria
+## About
 
 LDWeaver accepts a sequence alignment (fasta) and its reference annotation 
 (genbank or gff) as inputs and identifies linkage disequilibrium (LD) between 

diff --git a/images/icon.jpg b/images/icon.jpg
diff --git a/man/LDWeaver.Rd b/man/LDWeaver.Rd
diff --git a/man/estimate_variation_in_CDS.Rd b/man/estimate_variation_in_CDS.Rd