Remove genbankr dependency (#5)

* add snp-only aln support * update_readme * update .gitignore' ' '' * update workflow * update gitignore * .gitignore updated * update gitignore * update gitignore * TODO:SnpEff_local * bundle_snpEff * bug_fix * save_fit_data * minor_bug_fix * to_do * skip_tanglegram * NewYear_bugfix * Drop genbankr dependency * remove devel test from workflows --------- Co-authored-by: Sudaraka88 <[email protected]> Co-authored-by: Sudaraka88 <[email protected]> Co-authored-by: Sudaraka88 <[email protected]>
Sudaraka88 · Jan 4, 2024 · c3ee48a · c3ee48a
1 parent 0c9bced
commit c3ee48a
Show file tree

Hide file tree

Showing 14 changed files with 1,161 additions and 53 deletions.
diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml
@@ -22,7 +22,7 @@ jobs:
         config:
           - {os: macos-latest,   r: 'release'}
           - {os: windows-latest, r: 'release'}
-          #- {os: ubuntu-latest,   r: 'devel', http-user-agent: 'release'}
+         # - {os: ubuntu-latest,   r: 'devel', http-user-agent: 'release'}
           - {os: ubuntu-latest,   r: 'release'}
           - {os: ubuntu-latest,   r: 'oldrel-1'}
 
@@ -49,3 +49,4 @@ jobs:
       - uses: r-lib/actions/check-r-package@v2
         with:
           upload-snapshots: true
+          error-on: '"error"'
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: LDWeaver
 Type: Package
 Title:Genomewide Epistasis Analysis on Bacteria
-Version: 1.3.1
+Version: 1.4
 Authors@R: person("Sudaraka", "Mallawaarachchi", email = "[email protected]", role = c("aut", "cre"))
 Maintainer: Sudaraka Mallawaarachchi <[email protected]>
 Description:Perform genomewide epistasis analysis by evaluating the LD structure in bacteria.
@@ -12,12 +12,13 @@ biocViews: Software
 Depends: R (>= 4.0.0),
 Imports: 
          ape,
+         Biostrings,
          chromoMap,
          data.table,
          dplyr,
          fitdistrplus,
-         genbankr,
          GenomicRanges,
+         GenomeInfoDb,
          ggnewscale,
          ggplot2,
          ggtree,
@@ -26,6 +27,7 @@ Imports:
          heatmap3,
          htmlwidgets,
          igraph,
+         IRanges,
          Matrix,
          MatrixExtra,
          methods,
@@ -35,8 +37,10 @@ Imports:
          RColorBrewer,
          Rcpp,
          RcppArmadillo,
+         S4Vectors,
          stats,
-         utils
+         utils,
+         VariantAnnotation
 LinkingTo: Rcpp, RcppArmadillo
 RoxygenNote: 7.2.3
 URL: https://github.com/Sudaraka88/LDWeaver

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,5 +1,6 @@
 # Generated by roxygen2: do not edit by hand
 
+export(GBAccession)
 export(LDWeaver)
 export(analyse_long_range_links)
 export(cleanup)
@@ -24,9 +25,28 @@ export(read_TopHits)
 export(snpdat_to_fa)
 export(view_tree)
 export(write_output_for_gwes_explorer)
+exportClasses(GBAccession)
+exportClasses(GBKFile)
+exportClasses(GenBankFile)
+exportClasses(GenBankRecord)
+import(GenomicRanges)
+importClassesFrom(Biostrings,XStringSet)
+importClassesFrom(GenomicRanges,CompressedGRangesList)
+importClassesFrom(GenomicRanges,GRangesList)
+importFrom(Biostrings,AAString)
+importFrom(Biostrings,AAStringSet)
+importFrom(Biostrings,DNAString)
+importFrom(Biostrings,extractAt)
+importFrom(GenomeInfoDb,Seqinfo)
+importFrom(GenomeInfoDb,seqinfo)
+importFrom(GenomeInfoDb,seqlevels)
+importFrom(GenomeInfoDb,seqnames)
 importFrom(GenomicRanges,end)
 importFrom(GenomicRanges,start)
 importFrom(GenomicRanges,width)
+importFrom(IRanges,IRanges)
+importFrom(IRanges,heads)
+importFrom(IRanges,ranges)
 importFrom(Matrix,colSums)
 importFrom(Matrix,rowSums)
 importFrom(Matrix,sparseMatrix)
@@ -38,6 +58,11 @@ importFrom(MatrixExtra,tcrossprod)
 importFrom(RColorBrewer,brewer.pal)
 importFrom(Rcpp,sourceCpp)
 importFrom(RcppArmadillo,fastLm)
+importFrom(S4Vectors,DataFrame)
+importFrom(S4Vectors,queryHits)
+importFrom(S4Vectors,subjectHits)
+importFrom(VariantAnnotation,VRanges)
+importFrom(VariantAnnotation,makeVRangesFromGRanges)
 importFrom(ape,read.gff)
 importFrom(ape,read.tree)
 importFrom(chromoMap,chromoMap)
@@ -49,9 +74,6 @@ importFrom(data.table,setattr)
 importFrom(dplyr,`%>%`)
 importFrom(dplyr,summarise)
 importFrom(fitdistrplus,fitdist)
-importFrom(genbankr,cds)
-importFrom(genbankr,getSeq)
-importFrom(genbankr,readGenBank)
 importFrom(ggnewscale,new_scale_fill)
 importFrom(ggplot2,aes)
 importFrom(ggplot2,facet_wrap)
@@ -76,6 +98,8 @@ importFrom(htmlwidgets,saveWidget)
 importFrom(igraph,graph_from_edgelist)
 importFrom(igraph,set.edge.attribute)
 importFrom(methods,as)
+importFrom(methods,is)
+importFrom(methods,new)
 importFrom(parallel,detectCores)
 importFrom(phytools,midpoint.root)
 importFrom(plyr,.)
@@ -88,8 +112,10 @@ importFrom(stats,hclust)
 importFrom(stats,kmeans)
 importFrom(stats,pbeta)
 importFrom(stats,quantile)
+importFrom(utils,packageVersion)
 importFrom(utils,read.table)
 importFrom(utils,setTxtProgressBar)
+importFrom(utils,stack)
 importFrom(utils,timestamp)
 importFrom(utils,txtProgressBar)
 importFrom(utils,write.table)

diff --git a/R/BacGWES.R b/R/BacGWES.R
@@ -3,7 +3,7 @@
 #' Function to run the LDWeaver pipeline
 #'
 #' @importFrom parallel detectCores
-#' @importFrom utils timestamp
+#' @importFrom utils timestamp packageVersion
 #'
 #' @param dset name of the dataset, all outputs will be saved to the folder <dset>
 #' @param aln_path path to the multi fasta alignment
@@ -29,7 +29,8 @@
 #' @param srp_cutoff specify the short-range -log10(p) cut-off value to discard short-range links before returning the data.frame. This setting has no impact on the
 #' modelling since all links are used. However, setting a threshold > 2 will generally reduce the memory usage, plotting time (default = 3, i.e. corresponding to p = 0.001),
 #' and run time for ARACNE. If all links are required to be returned, set to 0 (i.e. corresponding to p = 1), range 0 - 5
-#' @param tanglegram_break_segments specify the number of genome segments to prepare - one tanglegram per segment (default = 5), range 1 - 10
+#' @param tanglegram_break_segments specify the number of genome segments to prepare - one tanglegram per segment (default = 5), range 1 - 10. Set to NULL to skip tanglegram generation
+#' @param write_gwesExplorer specify whether output for GWESExplorer is required (default = T)
 #' @param multicore specify whether to use parallel processing (default = T)
 #' @param ncores specify the number of cores to use for parallel processing (default = NULL), will auto detect if NULL
 #' @param max_blk_sz specify maximum block size for MI computation (default = 10000), larger sizes require more RAM, range 1000 - 100000
@@ -48,7 +49,8 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
                     gap_freq = 0.15, maf_freq = 0.01, hdw_threshold = 0.1, perform_SR_analysis_only = F,
                     SnpEff_Annotate = T, sr_dist = 20000, lr_retain_links = 1e6,
                     max_tophits = 250, num_clusts_CDS = 3, srp_cutoff = 3, tanglegram_break_segments = 5,
-                    multicore = T, max_blk_sz = 10000, ncores = NULL, save_additional_outputs = F){
+                    write_gwesExplorer = T, multicore = T, max_blk_sz = 10000, ncores = NULL,
+                    save_additional_outputs = F){
   # Build blocks
   # BLK1: Extract SNPs and create sparse Mx from MSA (fasta)
   # BLK2: Parse GBK or GFF+REF
@@ -62,8 +64,11 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
   # BLK10: GWESExplorer (depends: GWESExplorer)
   # BLK11: Cleanup
 
+  #TODO: Provide the option to skip SNP extraction and use the whole provided alignment (redundant if pre-filtered)
   #TODO: Add the option to provide genbank file without reference sequence
   #TODO: Count through blocks and automate the displayed BLOCK NUMBER
+  #TODO: genbankr is being dropped from the newest Bioconductor, add alternative (https://github.com/gmbecker/genbankr)
+  #TODO: Add Hamming Distance plot, can we have a SNP Tree + Hamming Distance weights to show population structure control?
 
   #NOTE: SnpEff does not parse the GBK and GFF3 file from the same refseq reference genome the same way. There might be differences between annotations/tophits/etc.
   # # Welcome message # #
@@ -138,11 +143,12 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
     srp_cutoff = 3
   }
 
-  if(tanglegram_break_segments < 0 | tanglegram_break_segments > 10) {
-    warning(paste("Unable to use the provided value for <tanglegram_break_segments>, using", 5))
-    tanglegram_break_segments = 5
+  if(!is.null(tanglegram_break_segments)){
+    if(tanglegram_break_segments < 0 | tanglegram_break_segments > 10) {
+      warning(paste("Unable to use the provided value for <tanglegram_break_segments>, using", 5))
+      tanglegram_break_segments = 5
+    }
   }
-
   if(max_blk_sz < 1000 | max_blk_sz > 100000) {
     warning(paste("Unable to use the provided value for <max_blk_sz>, using", 10000, "...!If this value is causing the function to crash, consider reducing!..."))
     max_blk_sz = 10000
@@ -162,6 +168,12 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
 
   # setup paths
   if(!file.exists(dset)) dir.create(dset) # save everything in here
+
+  # Save console output as a text file
+  info_file = file.path(dset, paste("LDW_run_",format(Sys.time(), "%Y%m%d%H%M%S"), ".txt", sep = ""))
+  suppressWarnings(sink(file= NULL))
+  sink(info_file, split = T)
+
   add_path = file.path(dset, "Additional_Outputs") # Additional Outputs
   if(save_additional_outputs) {
     if(!file.exists(add_path)) dir.create(file.path(dset, "Additional_Outputs"))
@@ -203,7 +215,11 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
     cat(paste("All outputs will be saved to:", normalizePath(dset), "\n"))
     cat(paste("\n *** Input paths *** \n\n"))
     cat(paste("* Alignment:", aln_path, "\n"))
-    cat(paste("* GenBank Annotation:", gbk_path, "\n"))
+    if(!is.null(gbk_path)) {
+      cat(paste("* GenBank Annotation:", gbk_path, "\n"))
+      cat(paste("* Parser built using genbankr source (https://github.com/gmbecker/genbankr) \n"))
+    }
+    if(!is.null(gff3_path)) cat(paste("* GFF3 Annotation:", gff3_path, "\n"))
     if(!is.null(snpeff_jar_path)) cat(paste("* SnpEff Annotations will be performed on short-range links. SnpEff path:", snpeff_jar_path, "\n"))
 
     cat(paste("\n *** Parameters *** \n\n"))
@@ -356,6 +372,7 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
 
 
   if(nrow(sr_links) == 0){
+    suppressWarnings(sink(file= NULL)) ### output info to text file
     stop("No potentially important sr_links were identified! Cannot continue analysis...")
   }
 
@@ -383,25 +400,30 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
     tophits = LDWeaver::read_TopHits(top_hits_path = tophits_path)
   }
 
-  # Additional paths if annotations are requested
-  # tanglegram
-  tanglegram_path = file.path(dset, "SR_Tanglegram")
-  if(!file.exists(tanglegram_path)) dir.create(tanglegram_path)
-  # GWESExplorer
-  gwesexplorer_path = file.path(dset, "SR_GWESExplorer")
-  if(!file.exists(gwesexplorer_path)) dir.create(gwesexplorer_path)
-  # NetworkPlot
-  netplot_path = file.path(dset, "SR_network_plot.png")
 
   # BLK8
-  cat("\n\n #################### BLOCK 9 #################### \n\n")
-  LDWeaver::create_tanglegram(tophits = tophits, gbk = gbk, gff = gff, tanglegram_folder = tanglegram_path, break_segments = tanglegram_break_segments)
-
+  if(!is.null(tanglegram_break_segments)){
+    # tanglegram
+    tanglegram_path = file.path(dset, "SR_Tanglegram")
+    if(!file.exists(tanglegram_path)) dir.create(tanglegram_path)
+    cat("\n\n #################### BLOCK 9 #################### \n\n")
+    LDWeaver::create_tanglegram(tophits = tophits, gbk = gbk, gff = gff, tanglegram_folder = tanglegram_path, break_segments = tanglegram_break_segments)
+  }
   # BLK9
-  cat("\n\n #################### BLOCK 10 #################### \n\n")
-  LDWeaver::write_output_for_gwes_explorer(snp.dat = snp.dat, tophits = tophits, gwes_explorer_folder = gwesexplorer_path)
+  if(write_gwesExplorer){
+    # GWESExplorer
+    gwesexplorer_path = file.path(dset, "SR_GWESExplorer")
+    if(!file.exists(gwesexplorer_path)) dir.create(gwesexplorer_path)
+    cat("\n\n #################### BLOCK 10 #################### \n\n")
+    LDWeaver::write_output_for_gwes_explorer(snp.dat = snp.dat, tophits = tophits, gwes_explorer_folder = gwesexplorer_path)
+  }
+
 
   # BLK10
+  # Additional paths if annotations are requested
+  # NetworkPlot
+  netplot_path = file.path(dset, "SR_network_plot.png")
+
   cat("\n\n #################### BLOCK 11 #################### \n\n")
   LDWeaver::create_network(tophits = tophits, netplot_path = netplot_path, plot_title = paste("Networks in short-range tophits for", dset))
 

diff --git a/R/createTanglegram.R b/R/createTanglegram.R
@@ -4,7 +4,6 @@
 #'
 #' @importFrom htmlwidgets saveWidget
 #' @importFrom stats cutree hclust dist
-#' @importFrom genbankr cds
 #' @importFrom plyr . ddply
 #' @importFrom chromoMap chromoMap
 #'

diff --git a/R/estimateCDSDiversity.R b/R/estimateCDSDiversity.R
@@ -3,7 +3,6 @@
 #' Function to estimate the variation within each coding region, the output from this function
 #' can be used to segment the genome into diversity-based clusters.
 #'
-#' @importFrom genbankr cds getSeq
 #' @importFrom GenomicRanges start width end
 #' @importFrom Matrix rowSums colSums
 #' @importFrom data.table data.table setattr %between% .I
@@ -25,6 +24,9 @@
 #' }
 #' @export
 estimate_variation_in_CDS = function(snp.dat, ncores, gbk = NULL, gff = NULL, num_clusts_CDS = 3, clust_plt_path = NULL){
+  ## NOTE: genbankr deprecation, removed the following import
+  # importFrom genbankr cds getSeq
+
   # This method is only approximate, but much MUCH faster and easier on resources
   # TODO: Include the higher accuracy function
   t0 = Sys.time()
@@ -34,12 +36,14 @@ estimate_variation_in_CDS = function(snp.dat, ncores, gbk = NULL, gff = NULL, nu
 
   # extract the information we need
   if(!is.null(gbk)){
-    cds_reg = genbankr::cds(gbk)
+    # cds_reg = genbankr::cds(gbk) # no longer exporting this function after genbankr deprecation
+    cds_reg = gbk@cds
     starts = GenomicRanges::start(cds_reg)
     widths = GenomicRanges::width(cds_reg)
     ends = GenomicRanges::end(cds_reg)
     # convert ref to a CharacterVector
-    ref = unlist(unname(strsplit(as.character(genbankr::getSeq(gbk)), '')))[snp.dat$POS]
+    # ref = unlist(unname(strsplit(as.character(genbankr::getSeq(gbk)), '')))[snp.dat$POS] # no longer exporting this function after genbankr deprecation
+    ref = unlist(unname(strsplit(as.character(gbk@sequence), '')))[snp.dat$POS]
   } else if(!is.null(gff)){
     gff_cds = gff$gff[tolower(gff$gff$type) == "cds", ]
     starts = gff_cds$start

diff --git a/R/io_functions.R b/R/io_functions.R
@@ -250,7 +250,7 @@ cleanup = function(dset, delete_after_moving = F){
     mv_success = c(mv_success, idx)
   }
 
-  idx = c(grep("cds_var.rds", files), grep("hdw.rds", files), grep("parsed_gbk.rds", files), grep("snp_ACGTN.rds", files))
+  idx = c(grep("cds_var.rds", files), grep("hdw.rds", files), grep("parsed_gbk.rds", files), grep("parsed_gff3.rds", files), grep("snp_ACGTN.rds", files))
   if(length(idx) > 0){
     fldr = file.path(dset, "Additional_Outputs")
     cleanup_support(files = file.path(dset, files[idx]), fldr)
@@ -298,7 +298,7 @@ cleanup = function(dset, delete_after_moving = F){
   }
 
   #### Temp folder ####
-  idx = c(grep("snpEff", files), grep("*.vcf", files), grep("*annotations.tsv", files), grep("*_links.tsv", files))
+  idx = c(grep("snpEff", files), grep("*.vcf", files), grep("*annotations.tsv", files), grep("*_links.tsv", files), grep("LDW_run_*", files))
   if(length(idx) > 0){
     fldr = file.path(dset, "Temp")
     cleanup_support(files = file.path(dset, files[idx]), fldr)

diff --git a/R/lr_analyser.R b/R/lr_analyser.R
@@ -44,16 +44,14 @@ analyse_long_range_links = function(dset, lr_links_path, sr_links_path, are_lrli
   # NOTE: spydrpick does not add clusters, add them from paint (requires cds_var)
 
   if(SnpEff_Annotate == T) {
-    if( (is.null(gbk_path) & is.null(gff3_path)) | (!is.null(gbk_path) & !is.null(gff3_path)) ) stop("Either gbk_path or gff3_path must be provided.
-                                                                                                     To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F") # only one of gbk or gff can be NULL
-    if(!is.null(gff3_path) & is.null(ref_fasta_path)) stop("Reference fasta file must be provided for gff3 annoations.
-                                                           To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F") # only one of gbk or gff can be NULL
-    if(is.null(snpeff_jar_path)) stop("You must specify <snpeff_jar_path> for annotations. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F")
-    if(!file.exists(snpeff_jar_path)) stop(paste("<SnpEff.jar> not found at:", snpeff_jar_path, "please check the path provided"))
-    if(is.null(snpeff_jar_path)) stop("You must specify <snpeff_jar_path> for annotations. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F")
+    if( (is.null(gbk_path) & is.null(gff3_path)) | (!is.null(gbk_path) & !is.null(gff3_path)) ) stop("Either gbk_path or gff3_path must be provided. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F") # only one of gbk or gff can be NULL
+    if(!is.null(gff3_path) & is.null(ref_fasta_path)) stop("Reference fasta file must be provided for gff3 annoations. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F") # only one of gbk or gff can be NULL
+    # if(is.null(snpeff_jar_path)) stop("You must specify <snpeff_jar_path> for annotations. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F")
+    # if(!file.exists(snpeff_jar_path)) stop(paste("<SnpEff.jar> not found at:", snpeff_jar_path, "please check the path provided"))
+    # if(is.null(snpeff_jar_path)) stop("You must specify <snpeff_jar_path> for annotations. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F")
     # if(is.null(gbk_path)) stop("You must specify <gbk_path> for annotations. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F")
-    if(is.null(snp.dat)) stop("You must specify <gbk_path> for annotations. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F")
-    if(is.null(cds_var)) stop("You must specify <gbk_path> for annotations. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F")
+    if(is.null(snp.dat)) stop("You must provide snp.dat to perform annotations.")
+    if(is.null(cds_var)) stop("You must specify cds_var to perform for annotations.")
   }
 
   # lr_links_path = "~/Desktop/LDWeaver_RUN/maela/lr_links.tsv"