Skip to content

Commit

Permalink
Improve Readme
Browse files Browse the repository at this point in the history
  • Loading branch information
Sudaraka88 authored and Sudaraka88 committed Apr 19, 2024
1 parent e354000 commit 67bf09a
Show file tree
Hide file tree
Showing 11 changed files with 57 additions and 32 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@
.DS_Store
*.Rmd
Makevars
testscripts.R
8 changes: 4 additions & 4 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ Encoding: UTF-8
LazyData: true
biocViews: Software
Depends: R (>= 4.0.0),
Imports:
Imports:
ape,
Biostrings,
chromoMap,
Expand All @@ -19,9 +19,7 @@ Imports:
fitdistrplus,
GenomicRanges,
GenomeInfoDb,
ggnewscale,
ggplot2,
ggtree,
ggraph,
grDevices,
heatmap3,
Expand All @@ -32,7 +30,6 @@ Imports:
MatrixExtra,
methods,
parallel,
phytools,
plyr,
RColorBrewer,
Rcpp,
Expand All @@ -42,6 +39,9 @@ Imports:
utils,
VariantAnnotation
Suggests:
phytools,
ggtree,
ggnewscale,
spam,
spam64
LinkingTo: Rcpp, RcppArmadillo
Expand Down
5 changes: 0 additions & 5 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,6 @@ importFrom(S4Vectors,subjectHits)
importFrom(VariantAnnotation,VRanges)
importFrom(VariantAnnotation,makeVRangesFromGRanges)
importFrom(ape,read.gff)
importFrom(ape,read.tree)
importFrom(chromoMap,chromoMap)
importFrom(data.table,"%between%")
importFrom(data.table,.I)
Expand All @@ -76,7 +75,6 @@ importFrom(data.table,setattr)
importFrom(dplyr,`%>%`)
importFrom(dplyr,summarise)
importFrom(fitdistrplus,fitdist)
importFrom(ggnewscale,new_scale_fill)
importFrom(ggplot2,aes)
importFrom(ggplot2,facet_wrap)
importFrom(ggplot2,geom_point)
Expand All @@ -91,8 +89,6 @@ importFrom(ggraph,geom_edge_arc2)
importFrom(ggraph,geom_node_label)
importFrom(ggraph,ggraph)
importFrom(ggraph,scale_edge_colour_discrete)
importFrom(ggtree,ggtree)
importFrom(ggtree,gheatmap)
importFrom(grDevices,colorRampPalette)
importFrom(grDevices,png)
importFrom(heatmap3,heatmap3)
Expand All @@ -103,7 +99,6 @@ importFrom(methods,as)
importFrom(methods,is)
importFrom(methods,new)
importFrom(parallel,detectCores)
importFrom(phytools,midpoint.root)
importFrom(plyr,.)
importFrom(plyr,ddply)
importFrom(stats,coef)
Expand Down
13 changes: 8 additions & 5 deletions R/BacGWES.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
#' @param ncores specify the number of cores to use for parallel processing (default = NULL), will auto detect if NULL
#' @param max_blk_sz specify maximum block size for MI computation (default = 10000), larger sizes require more RAM, range 1000 - 100000
#' @param save_additional_outputs specify whether to save outputs such as extracted SNPs and Hamming distance weights. Recommended for very large datasets to save time on re-computation (default = F)
#'
#' @param mega_dset To analyse megascale datasets using spam and spam64 packages, set to TRUE (default = F). Only do so if the normal analysis fails (since LDWeaver 1.5)
#'
#' @return Numeric Value 0 if successful (all generated outputs will be saved)
#'
Expand All @@ -50,7 +50,8 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
SnpEff_Annotate = T, sr_dist = 20000, lr_retain_links = 1e6,
max_tophits = 250, num_clusts_CDS = 3, srp_cutoff = 3, tanglegram_break_segments = 5,
write_gwesExplorer = T, multicore = T, max_blk_sz = 10000, ncores = NULL,
save_additional_outputs = F){
save_additional_outputs = F, mega_dset = F){

# Build blocks
# BLK1: Extract SNPs and create sparse Mx from MSA (fasta)
# BLK2: Parse GBK or GFF+REF
Expand All @@ -65,10 +66,10 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
# BLK11: Cleanup

#TODO: Provide the option to skip SNP extraction and use the whole provided alignment (redundant if pre-filtered)
#TODO: Add the option to provide genbank file without reference sequence
#TODO: Add the option to provide GFF file without reference sequence
#TODO: Count through blocks and automate the displayed BLOCK NUMBER
#TODO: genbankr is being dropped from the newest bioconductor, add alternative (https://github.com/gmbecker/genbankr)
#TODO: Add Hamming Distance plot, can we have a SNP Tree + Hamming Distance weights to show population structure control?
#TODO: Drop ggtree dependency

#NOTE: SnpEff does not parse the GBK and GFF3 file from the same refseq reference genome the same way. There might be differences between annotations/tophits/etc.
# # Welcome message # #
Expand Down Expand Up @@ -158,6 +159,8 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
validate_ref_ann_lengths = F
}



# normalise_input_paths
aln_path = normalizePath(aln_path)
if(!is.null(gbk_path)) gbk_path = normalizePath(gbk_path)
Expand Down Expand Up @@ -318,7 +321,7 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
cat("\n\n #################### BLOCK 3 #################### \n\n")
if(!file.exists(cds_var_path)) {
cat("Estimating the variation in CDS \n")
cds_var = LDWeaver::estimate_variation_in_CDS(gbk = gbk, gff = gff, snp.dat = snp.dat, ncores = ncores, num_clusts_CDS = num_clusts_CDS, clust_plt_path = clust_plt_path)
cds_var = LDWeaver::estimate_variation_in_CDS(gbk = gbk, gff = gff, snp.dat = snp.dat, ncores = ncores, num_clusts_CDS = num_clusts_CDS, clust_plt_path = clust_plt_path, mega_dset = mega_dset)
if(save_additional_outputs){
saveRDS(cds_var, cds_var_path)
}
Expand Down
31 changes: 24 additions & 7 deletions R/estimateCDSDiversity.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#' @param gff output from parsing the gff3 file using LDWeaver::parse_gff_file()
#' @param num_clusts_CDS parition to genome into num_clusts_CDS regions using k-means (default = 3)
#' @param clust_plt_path specify path to save CDS variation plot
#' @param mega_dset set to TRUE for mega scale datasets (default = F)
#'
#' @return R list with CDS variation and allele distribution details
#'
Expand All @@ -23,7 +24,7 @@
#' cds_var <- estimate_variation_in_CDS(gbk, snp.dat, ncores = 10)
#' }
#' @export
estimate_variation_in_CDS = function(snp.dat, ncores, gbk = NULL, gff = NULL, num_clusts_CDS = 3, clust_plt_path = NULL){
estimate_variation_in_CDS = function(snp.dat, ncores, gbk = NULL, gff = NULL, num_clusts_CDS = 3, clust_plt_path = NULL, mega_dset = FALSE){
## NOTE: genbankr deprecation, removed the following import
# importFrom genbankr cds getSeq

Expand Down Expand Up @@ -57,12 +58,28 @@ estimate_variation_in_CDS = function(snp.dat, ncores, gbk = NULL, gff = NULL, nu
var_estimate = rep(NA, ncds)


variation = matrix(c(Matrix::rowSums(snp.dat$snp.matrix_A),
Matrix::rowSums(snp.dat$snp.matrix_C),
Matrix::rowSums(snp.dat$snp.matrix_G),
Matrix::rowSums(snp.dat$snp.matrix_T),
Matrix::rowSums(snp.dat$snp.matrix_N)),
ncol = snp.dat$nsnp, byrow = T)
if(mega_dset){ # Using SPAM
if(!requireNamespace("spam") & !requireNamespace("spam64")){
message("This feature requires spam and spam64 packages.")
return(invisible())
}
variation = matrix(c(spam::rowSums(snp.dat$snp.matrix_A),
spam::rowSums(snp.dat$snp.matrix_C),
spam::rowSums(snp.dat$snp.matrix_G),
spam::rowSums(snp.dat$snp.matrix_T),
spam::rowSums(snp.dat$snp.matrix_N)),
ncol = snp.dat$nsnp, byrow = T)

} else { # Using the Matrix package
variation = matrix(c(Matrix::rowSums(snp.dat$snp.matrix_A),
Matrix::rowSums(snp.dat$snp.matrix_C),
Matrix::rowSums(snp.dat$snp.matrix_G),
Matrix::rowSums(snp.dat$snp.matrix_T),
Matrix::rowSums(snp.dat$snp.matrix_N)),
ncol = snp.dat$nsnp, byrow = T)

}


# Generate a reference masking mx with 0 at reference allele
reference = matrix(rep(1, 5*snp.dat$nsnp), nrow = 5); .ACGTN2num(reference, ref, ncores)
Expand Down
4 changes: 2 additions & 2 deletions R/extractSNPs_mega.R
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ parse_fasta_alignment_mega <- function(aln_path, gap_freq = 0.15, maf_freq = 0.0
nrow = snp.param$num.seqs, ncol = snp.param$num.snps)
snp.data$i_T = snp.data$j_T = snp.data$x_T = NULL

snp.matrix_N_spam <- spam::spam(list(i=snp.data$i_A, j=snp.data$j_A, values=as.logical(snp.data$x_A)),
snp.matrix_N_spam <- spam::spam(list(i=snp.data$i_N, j=snp.data$j_N, values=as.logical(snp.data$x_N)),
nrow = snp.param$num.seqs, ncol = snp.param$num.snps)
snp.data = NULL

Expand Down Expand Up @@ -159,7 +159,7 @@ parse_fasta_SNP_alignment_mega <- function(aln_path, pos, gap_freq = 0.15, maf_f
nrow = snp.param$num.seqs, ncol = snp.param$num.snps)
snp.data$i_T = snp.data$j_T = snp.data$x_T = NULL

snp.matrix_N_spam <- spam::spam(list(i=snp.data$i_A, j=snp.data$j_A, values=as.logical(snp.data$x_A)),
snp.matrix_N_spam <- spam::spam(list(i=snp.data$i_N, j=snp.data$j_N, values=as.logical(snp.data$x_N)),
nrow = snp.param$num.seqs, ncol = snp.param$num.snps)
snp.data = NULL

Expand Down
11 changes: 7 additions & 4 deletions R/preptrees.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,7 @@
#' can be provided. A subset of manually chosen links can also be provided as a data.frame. The fasta/position files can be generated using LDWeaver::generate_Links_SNPS_fasta().
#' If metadata is available, they can be visualised along with the allele data in a separate panel.
#'
#' @importFrom ape read.tree
#' @importFrom phytools midpoint.root
#' @importFrom ggtree ggtree gheatmap
#' @importFrom ggplot2 ggsave scale_fill_viridis_d
#' @importFrom ggnewscale new_scale_fill
#'
#' @param tree_path Path to tree file (must be readable using ape::read.tree). tip.labels must exactly match the sequence names in the fasta and metadata files.
#'
Expand Down Expand Up @@ -56,6 +52,13 @@ view_tree = function(tree_path, perform_midpoint_rooting = T, metadata_df = NULL

# Change the file to give an optional fasta + pos file generated from LDWeaver::snpdat_to_fa()

## There is a chance some people don't use LDWeaver to generate these plots, convert these requirements to suggestions
if(!requireNamespace("ape") & !requireNamespace("phytools") & !requireNamespace("ggtree") & !requireNamespace("ggnewscale")){
message("This feature requires ape, phytools, ggtrree and ggnewscale packages, please make sure they are installed.")
return(invisible())
}



# sanity checks and IO

Expand Down
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
## Genomewide Co-selection and Epistasis in Bacteria <img src='images/icon.jpg' align="right" height="100" />

<!-- badges: start -->

[![R](https://github.com/Sudaraka88/LDWeaver/workflows/R-CMD-check/badge.svg)](https://github.com/Sudaraka88/LDWeaver/actions)
[![DOI](https://zenodo.org/badge/590009521.svg)](https://zenodo.org/badge/latestdoi/590009521)
[![LICENSE](https://anaconda.org/bioconda/r-ldweaver/badges/license.svg)](https://spdx.org/licenses/GPL-3.0-or-later.html)
<!-- badges: end -->

# LDWeaver

## Genomewide Search for Evidence of Co-selection and Epistasis in Bacteria
## About

LDWeaver accepts a sequence alignment (fasta) and its reference annotation
(genbank or gff) as inputs and identifies linkage disequilibrium (LD) between
Expand Down
Binary file added images/icon.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
5 changes: 4 additions & 1 deletion man/LDWeaver.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion man/estimate_variation_in_CDS.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 67bf09a

Please sign in to comment.