Skip to content

Commit

Permalink
Remove genbankr dependency (#5)
Browse files Browse the repository at this point in the history
* add snp-only aln support

* update_readme

* update .gitignore'

'
''

* update workflow

* update gitignore

* .gitignore updated

* update gitignore

* update gitignore

* TODO:SnpEff_local

* bundle_snpEff

* bug_fix

* save_fit_data

* minor_bug_fix

* to_do

* skip_tanglegram

* NewYear_bugfix

* Drop genbankr dependency

* remove devel test from workflows

---------

Co-authored-by: Sudaraka88 <[email protected]>
Co-authored-by: Sudaraka88 <[email protected]>
Co-authored-by: Sudaraka88 <[email protected]>
  • Loading branch information
4 people authored Jan 4, 2024
1 parent 0c9bced commit c3ee48a
Show file tree
Hide file tree
Showing 14 changed files with 1,161 additions and 53 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/r.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ jobs:
config:
- {os: macos-latest, r: 'release'}
- {os: windows-latest, r: 'release'}
#- {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'}
# - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'}
- {os: ubuntu-latest, r: 'release'}
- {os: ubuntu-latest, r: 'oldrel-1'}

Expand All @@ -49,3 +49,4 @@ jobs:
- uses: r-lib/actions/check-r-package@v2
with:
upload-snapshots: true
error-on: '"error"'
10 changes: 7 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: LDWeaver
Type: Package
Title:Genomewide Epistasis Analysis on Bacteria
Version: 1.3.1
Version: 1.4
Authors@R: person("Sudaraka", "Mallawaarachchi", email = "[email protected]", role = c("aut", "cre"))
Maintainer: Sudaraka Mallawaarachchi <[email protected]>
Description:Perform genomewide epistasis analysis by evaluating the LD structure in bacteria.
Expand All @@ -12,12 +12,13 @@ biocViews: Software
Depends: R (>= 4.0.0),
Imports:
ape,
Biostrings,
chromoMap,
data.table,
dplyr,
fitdistrplus,
genbankr,
GenomicRanges,
GenomeInfoDb,
ggnewscale,
ggplot2,
ggtree,
Expand All @@ -26,6 +27,7 @@ Imports:
heatmap3,
htmlwidgets,
igraph,
IRanges,
Matrix,
MatrixExtra,
methods,
Expand All @@ -35,8 +37,10 @@ Imports:
RColorBrewer,
Rcpp,
RcppArmadillo,
S4Vectors,
stats,
utils
utils,
VariantAnnotation
LinkingTo: Rcpp, RcppArmadillo
RoxygenNote: 7.2.3
URL: https://github.com/Sudaraka88/LDWeaver
Expand Down
32 changes: 29 additions & 3 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Generated by roxygen2: do not edit by hand

export(GBAccession)
export(LDWeaver)
export(analyse_long_range_links)
export(cleanup)
Expand All @@ -24,9 +25,28 @@ export(read_TopHits)
export(snpdat_to_fa)
export(view_tree)
export(write_output_for_gwes_explorer)
exportClasses(GBAccession)
exportClasses(GBKFile)
exportClasses(GenBankFile)
exportClasses(GenBankRecord)
import(GenomicRanges)
importClassesFrom(Biostrings,XStringSet)
importClassesFrom(GenomicRanges,CompressedGRangesList)
importClassesFrom(GenomicRanges,GRangesList)
importFrom(Biostrings,AAString)
importFrom(Biostrings,AAStringSet)
importFrom(Biostrings,DNAString)
importFrom(Biostrings,extractAt)
importFrom(GenomeInfoDb,Seqinfo)
importFrom(GenomeInfoDb,seqinfo)
importFrom(GenomeInfoDb,seqlevels)
importFrom(GenomeInfoDb,seqnames)
importFrom(GenomicRanges,end)
importFrom(GenomicRanges,start)
importFrom(GenomicRanges,width)
importFrom(IRanges,IRanges)
importFrom(IRanges,heads)
importFrom(IRanges,ranges)
importFrom(Matrix,colSums)
importFrom(Matrix,rowSums)
importFrom(Matrix,sparseMatrix)
Expand All @@ -38,6 +58,11 @@ importFrom(MatrixExtra,tcrossprod)
importFrom(RColorBrewer,brewer.pal)
importFrom(Rcpp,sourceCpp)
importFrom(RcppArmadillo,fastLm)
importFrom(S4Vectors,DataFrame)
importFrom(S4Vectors,queryHits)
importFrom(S4Vectors,subjectHits)
importFrom(VariantAnnotation,VRanges)
importFrom(VariantAnnotation,makeVRangesFromGRanges)
importFrom(ape,read.gff)
importFrom(ape,read.tree)
importFrom(chromoMap,chromoMap)
Expand All @@ -49,9 +74,6 @@ importFrom(data.table,setattr)
importFrom(dplyr,`%>%`)
importFrom(dplyr,summarise)
importFrom(fitdistrplus,fitdist)
importFrom(genbankr,cds)
importFrom(genbankr,getSeq)
importFrom(genbankr,readGenBank)
importFrom(ggnewscale,new_scale_fill)
importFrom(ggplot2,aes)
importFrom(ggplot2,facet_wrap)
Expand All @@ -76,6 +98,8 @@ importFrom(htmlwidgets,saveWidget)
importFrom(igraph,graph_from_edgelist)
importFrom(igraph,set.edge.attribute)
importFrom(methods,as)
importFrom(methods,is)
importFrom(methods,new)
importFrom(parallel,detectCores)
importFrom(phytools,midpoint.root)
importFrom(plyr,.)
Expand All @@ -88,8 +112,10 @@ importFrom(stats,hclust)
importFrom(stats,kmeans)
importFrom(stats,pbeta)
importFrom(stats,quantile)
importFrom(utils,packageVersion)
importFrom(utils,read.table)
importFrom(utils,setTxtProgressBar)
importFrom(utils,stack)
importFrom(utils,timestamp)
importFrom(utils,txtProgressBar)
importFrom(utils,write.table)
Expand Down
66 changes: 44 additions & 22 deletions R/BacGWES.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#' Function to run the LDWeaver pipeline
#'
#' @importFrom parallel detectCores
#' @importFrom utils timestamp
#' @importFrom utils timestamp packageVersion
#'
#' @param dset name of the dataset, all outputs will be saved to the folder <dset>
#' @param aln_path path to the multi fasta alignment
Expand All @@ -29,7 +29,8 @@
#' @param srp_cutoff specify the short-range -log10(p) cut-off value to discard short-range links before returning the data.frame. This setting has no impact on the
#' modelling since all links are used. However, setting a threshold > 2 will generally reduce the memory usage, plotting time (default = 3, i.e. corresponding to p = 0.001),
#' and run time for ARACNE. If all links are required to be returned, set to 0 (i.e. corresponding to p = 1), range 0 - 5
#' @param tanglegram_break_segments specify the number of genome segments to prepare - one tanglegram per segment (default = 5), range 1 - 10
#' @param tanglegram_break_segments specify the number of genome segments to prepare - one tanglegram per segment (default = 5), range 1 - 10. Set to NULL to skip the tanglegram
#' @param write_gwesExplorer specify whether output for GWESExplorer is required (default = T)
#' @param multicore specify whether to use parallel processing (default = T)
#' @param ncores specify the number of cores to use for parallel processing (default = NULL), will auto detect if NULL
#' @param max_blk_sz specify maximum block size for MI computation (default = 10000), larger sizes require more RAM, range 1000 - 100000
Expand All @@ -48,7 +49,8 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
gap_freq = 0.15, maf_freq = 0.01, hdw_threshold = 0.1, perform_SR_analysis_only = F,
SnpEff_Annotate = T, sr_dist = 20000, lr_retain_links = 1e6,
max_tophits = 250, num_clusts_CDS = 3, srp_cutoff = 3, tanglegram_break_segments = 5,
multicore = T, max_blk_sz = 10000, ncores = NULL, save_additional_outputs = F){
write_gwesExplorer = T, multicore = T, max_blk_sz = 10000, ncores = NULL,
save_additional_outputs = F){
# Build blocks
# BLK1: Extract SNPs and create sparse Mx from MSA (fasta)
# BLK2: Parse GBK or GFF+REF
Expand All @@ -62,8 +64,11 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
# BLK10: GWESExplorer (depends: GWESExplorer)
# BLK11: Cleanup

#TODO: Provide the option to skip SNP extraction and use the whole provided alignment (redundant if pre-filtered)
#TODO: Add the option to provide genbank file without reference sequence
#TODO: Count through blocks and automate the displayed BLOCK NUMBER
#TODO: genbankr is being dropped from the newest Bioconductor release, add an alternative (https://github.com/gmbecker/genbankr)
#TODO: Add Hamming Distance plot, can we have a SNP Tree + Hamming Distance weights to show population structure control?

#NOTE: SnpEff does not parse the GBK and GFF3 file from the same refseq reference genome the same way. There might be differences between annotations/tophits/etc.
# # Welcome message # #
Expand Down Expand Up @@ -138,11 +143,12 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
srp_cutoff = 3
}

if(tanglegram_break_segments < 0 | tanglegram_break_segments > 10) {
warning(paste("Unable to use the provided value for <tanglegram_break_segments>, using", 5))
tanglegram_break_segments = 5
if(!is.null(tanglegram_break_segments)){
if(tanglegram_break_segments < 0 | tanglegram_break_segments > 10) {
warning(paste("Unable to use the provided value for <tanglegram_break_segments>, using", 5))
tanglegram_break_segments = 5
}
}

if(max_blk_sz < 1000 | max_blk_sz > 100000) {
warning(paste("Unable to use the provided value for <max_blk_sz>, using", 10000, "...!If this value is causing the function to crash, consider reducing!..."))
max_blk_sz = 10000
Expand All @@ -162,6 +168,12 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path

# setup paths
if(!file.exists(dset)) dir.create(dset) # save everything in here

# Save console output as a text file
info_file = file.path(dset, paste("LDW_run_",format(Sys.time(), "%Y%m%d%H%M%S"), ".txt", sep = ""))
suppressWarnings(sink(file= NULL))
sink(info_file, split = T)

add_path = file.path(dset, "Additional_Outputs") # Additional Outputs
if(save_additional_outputs) {
if(!file.exists(add_path)) dir.create(file.path(dset, "Additional_Outputs"))
Expand Down Expand Up @@ -203,7 +215,11 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
cat(paste("All outputs will be saved to:", normalizePath(dset), "\n"))
cat(paste("\n *** Input paths *** \n\n"))
cat(paste("* Alignment:", aln_path, "\n"))
cat(paste("* GenBank Annotation:", gbk_path, "\n"))
if(!is.null(gbk_path)) {
cat(paste("* GenBank Annotation:", gbk_path, "\n"))
cat(paste("* Parser built using genbankr source (https://github.com/gmbecker/genbankr) \n"))
}
if(!is.null(gff3_path)) cat(paste("* GFF3 Annotation:", gff3_path, "\n"))
if(!is.null(snpeff_jar_path)) cat(paste("* SnpEff Annotations will be performed on short-range links. SnpEff path:", snpeff_jar_path, "\n"))

cat(paste("\n *** Parameters *** \n\n"))
Expand Down Expand Up @@ -356,6 +372,7 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path


if(nrow(sr_links) == 0){
suppressWarnings(sink(file= NULL)) ### output info to text file
stop("No potentially important sr_links were identified! Cannot continue analysis...")
}

Expand Down Expand Up @@ -383,25 +400,30 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
tophits = LDWeaver::read_TopHits(top_hits_path = tophits_path)
}

# Additional paths if annotations are requested
# tanglegram
tanglegram_path = file.path(dset, "SR_Tanglegram")
if(!file.exists(tanglegram_path)) dir.create(tanglegram_path)
# GWESExplorer
gwesexplorer_path = file.path(dset, "SR_GWESExplorer")
if(!file.exists(gwesexplorer_path)) dir.create(gwesexplorer_path)
# NetworkPlot
netplot_path = file.path(dset, "SR_network_plot.png")

# BLK8
cat("\n\n #################### BLOCK 9 #################### \n\n")
LDWeaver::create_tanglegram(tophits = tophits, gbk = gbk, gff = gff, tanglegram_folder = tanglegram_path, break_segments = tanglegram_break_segments)

if(!is.null(tanglegram_break_segments)){
# tanglegram
tanglegram_path = file.path(dset, "SR_Tanglegram")
if(!file.exists(tanglegram_path)) dir.create(tanglegram_path)
cat("\n\n #################### BLOCK 9 #################### \n\n")
LDWeaver::create_tanglegram(tophits = tophits, gbk = gbk, gff = gff, tanglegram_folder = tanglegram_path, break_segments = tanglegram_break_segments)
}
# BLK9
cat("\n\n #################### BLOCK 10 #################### \n\n")
LDWeaver::write_output_for_gwes_explorer(snp.dat = snp.dat, tophits = tophits, gwes_explorer_folder = gwesexplorer_path)
if(write_gwesExplorer){
# GWESExplorer
gwesexplorer_path = file.path(dset, "SR_GWESExplorer")
if(!file.exists(gwesexplorer_path)) dir.create(gwesexplorer_path)
cat("\n\n #################### BLOCK 10 #################### \n\n")
LDWeaver::write_output_for_gwes_explorer(snp.dat = snp.dat, tophits = tophits, gwes_explorer_folder = gwesexplorer_path)
}


# BLK10
# Additional paths if annotations are requested
# NetworkPlot
netplot_path = file.path(dset, "SR_network_plot.png")

cat("\n\n #################### BLOCK 11 #################### \n\n")
LDWeaver::create_network(tophits = tophits, netplot_path = netplot_path, plot_title = paste("Networks in short-range tophits for", dset))

Expand Down
1 change: 0 additions & 1 deletion R/createTanglegram.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
#'
#' @importFrom htmlwidgets saveWidget
#' @importFrom stats cutree hclust dist
#' @importFrom genbankr cds
#' @importFrom plyr . ddply
#' @importFrom chromoMap chromoMap
#'
Expand Down
10 changes: 7 additions & 3 deletions R/estimateCDSDiversity.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
#' Function to estimate the variation within each coding region, the output from this function
#' can be used to segment the genome into diversity-based clusters.
#'
#' @importFrom genbankr cds getSeq
#' @importFrom GenomicRanges start width end
#' @importFrom Matrix rowSums colSums
#' @importFrom data.table data.table setattr %between% .I
Expand All @@ -25,6 +24,9 @@
#' }
#' @export
estimate_variation_in_CDS = function(snp.dat, ncores, gbk = NULL, gff = NULL, num_clusts_CDS = 3, clust_plt_path = NULL){
## NOTE: genbankr deprecation, removed the following import
# importFrom genbankr cds getSeq

# This method is only approximate, but much MUCH faster and easier on resources
# TODO: Include the higher accuracy function
t0 = Sys.time()
Expand All @@ -34,12 +36,14 @@ estimate_variation_in_CDS = function(snp.dat, ncores, gbk = NULL, gff = NULL, nu

# extract the information we need
if(!is.null(gbk)){
cds_reg = genbankr::cds(gbk)
# cds_reg = genbankr::cds(gbk) # no longer exporting this function after genbankr deprecation
cds_reg = gbk@cds
starts = GenomicRanges::start(cds_reg)
widths = GenomicRanges::width(cds_reg)
ends = GenomicRanges::end(cds_reg)
# convert ref to a CharacterVector
ref = unlist(unname(strsplit(as.character(genbankr::getSeq(gbk)), '')))[snp.dat$POS]
# ref = unlist(unname(strsplit(as.character(genbankr::getSeq(gbk)), '')))[snp.dat$POS] # no longer exporting this function after genbankr deprecation
ref = unlist(unname(strsplit(as.character(gbk@sequence), '')))[snp.dat$POS]
} else if(!is.null(gff)){
gff_cds = gff$gff[tolower(gff$gff$type) == "cds", ]
starts = gff_cds$start
Expand Down
4 changes: 2 additions & 2 deletions R/io_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ cleanup = function(dset, delete_after_moving = F){
mv_success = c(mv_success, idx)
}

idx = c(grep("cds_var.rds", files), grep("hdw.rds", files), grep("parsed_gbk.rds", files), grep("snp_ACGTN.rds", files))
idx = c(grep("cds_var.rds", files), grep("hdw.rds", files), grep("parsed_gbk.rds", files), grep("parsed_gff3.rds", files), grep("snp_ACGTN.rds", files))
if(length(idx) > 0){
fldr = file.path(dset, "Additional_Outputs")
cleanup_support(files = file.path(dset, files[idx]), fldr)
Expand Down Expand Up @@ -298,7 +298,7 @@ cleanup = function(dset, delete_after_moving = F){
}

#### Temp folder ####
idx = c(grep("snpEff", files), grep("*.vcf", files), grep("*annotations.tsv", files), grep("*_links.tsv", files))
idx = c(grep("snpEff", files), grep("*.vcf", files), grep("*annotations.tsv", files), grep("*_links.tsv", files), grep("LDW_run_*", files))
if(length(idx) > 0){
fldr = file.path(dset, "Temp")
cleanup_support(files = file.path(dset, files[idx]), fldr)
Expand Down
16 changes: 7 additions & 9 deletions R/lr_analyser.R
Original file line number Diff line number Diff line change
Expand Up @@ -44,16 +44,14 @@ analyse_long_range_links = function(dset, lr_links_path, sr_links_path, are_lrli
# NOTE: spydrpick does not add clusters, add them from paint (requires cds_var)

if(SnpEff_Annotate == T) {
if( (is.null(gbk_path) & is.null(gff3_path)) | (!is.null(gbk_path) & !is.null(gff3_path)) ) stop("Either gbk_path or gff3_path must be provided.
To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F") # only one of gbk or gff can be NULL
if(!is.null(gff3_path) & is.null(ref_fasta_path)) stop("Reference fasta file must be provided for gff3 annoations.
To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F") # only one of gbk or gff can be NULL
if(is.null(snpeff_jar_path)) stop("You must specify <snpeff_jar_path> for annotations. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F")
if(!file.exists(snpeff_jar_path)) stop(paste("<SnpEff.jar> not found at:", snpeff_jar_path, "please check the path provided"))
if(is.null(snpeff_jar_path)) stop("You must specify <snpeff_jar_path> for annotations. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F")
if( (is.null(gbk_path) & is.null(gff3_path)) | (!is.null(gbk_path) & !is.null(gff3_path)) ) stop("Either gbk_path or gff3_path must be provided. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F") # only one of gbk or gff can be NULL
if(!is.null(gff3_path) & is.null(ref_fasta_path)) stop("Reference fasta file must be provided for gff3 annoations. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F") # only one of gbk or gff can be NULL
# if(is.null(snpeff_jar_path)) stop("You must specify <snpeff_jar_path> for annotations. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F")
# if(!file.exists(snpeff_jar_path)) stop(paste("<SnpEff.jar> not found at:", snpeff_jar_path, "please check the path provided"))
# if(is.null(snpeff_jar_path)) stop("You must specify <snpeff_jar_path> for annotations. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F")
# if(is.null(gbk_path)) stop("You must specify <gbk_path> for annotations. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F")
if(is.null(snp.dat)) stop("You must specify <gbk_path> for annotations. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F")
if(is.null(cds_var)) stop("You must specify <gbk_path> for annotations. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F")
if(is.null(snp.dat)) stop("You must provide snp.dat to perform annotations.")
if(is.null(cds_var)) stop("You must specify cds_var to perform for annotations.")
}

# lr_links_path = "~/Desktop/LDWeaver_RUN/maela/lr_links.tsv"
Expand Down
Loading

0 comments on commit c3ee48a

Please sign in to comment.