Skip to content

Commit

Permalink
bundle_snpEff
Browse files Browse the repository at this point in the history
  • Loading branch information
Sudaraka88 authored and Sudaraka88 committed Oct 31, 2023
1 parent 3919515 commit 9bed364
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 47 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: LDWeaver
Type: Package
Title:Genomewide Epistasis Analysis on Bacteria
Version: 1.2.0
Version: 1.3
Authors@R: person("Sudaraka", "Mallawaarachchi", email = "[email protected]", role = c("aut", "cre"))
Maintainer: Sudaraka Mallawaarachchi <[email protected]>
Description:Perform genomewide epistasis analysis by evaluating the LD structure in bacteria.
Expand Down
26 changes: 17 additions & 9 deletions R/BacGWES.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,9 @@
#' maf_freq filter. Eg: Under default filter values, a site with allele frequencies A:0.85, C:0.0095, N:0.1405 will be respectively dropped and allowed by 'default' and 'relaxed' methods.
#' @param gap_freq sites with a gap frequency >gap_freq will be dropped (default = 0.15)
#' @param maf_freq sites with a minor allele frequency <maf_freq will be dropped (default = 0.01)
#' @param snpeff_jar_path path to <snpEff.jar>. If unavailable or if annotations are not required, set SnpEff_Annotate = F
#' @param hdw_threshold Hamming distance similarity threshold (default = 0.1, i.e. 10\%) - lower values will force stricter population structure control at the cost of masking real signal.
#' @param perform_SR_analysis_only specify whether to only perform the short range link analysis (default = FALSE)
#' @param SnpEff_Annotate specify whether to perform annotations using SnpEff
#' @param SnpEff_Annotate specify whether to perform annotations using SnpEff (default = TRUE)
#' @param sr_dist links less than <sr_dist> apart are considered 'short range' (default = 20000), range 1000 - 25000 bp.
#' @param lr_retain_links specify the maximum number of long-range MI links to retain (default = 1000000) - in each block, only a top subset of links will be saved
#' @param max_tophits specify the maximum number of short range links to save as <sr_tophits.tsv>. Note: all short-range links will be annotated (and saved separately),
Expand All @@ -46,24 +45,28 @@
#' @export
LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path = NULL, gff3_path = NULL,
ref_fasta_path = NULL, validate_ref_ann_lengths = T, snp_filt_method = "default",
gap_freq = 0.15, maf_freq = 0.01, snpeff_jar_path = NULL, hdw_threshold = 0.1,
perform_SR_analysis_only = F, SnpEff_Annotate = T, sr_dist = 20000, lr_retain_links = 1e6,
gap_freq = 0.15, maf_freq = 0.01, hdw_threshold = 0.1, perform_SR_analysis_only = F,
SnpEff_Annotate = T, sr_dist = 20000, lr_retain_links = 1e6,
max_tophits = 250, num_clusts_CDS = 3, srp_cutoff = 3, tanglegram_break_segments = 5,
multicore = T, max_blk_sz = 10000, ncores = NULL, save_additional_outputs = F){
# Build blocks
# BLK1: Extract SNPs and create sparse Mx from MSA (fasta)
# BLK2: Parse GBK
# BLK2: Parse GBK or GFF+REF
# BLK3: Estimate diversity within each CDS, cluster and paint < # possible inputs on methods>
# BLK4: Compute Hamming Distance weights
# BLK5: Compute MI between all links, sr_links model fitter, ARACNE
# BLK6: GWES_plots
# BLK7: Snpeff annotation pipeline, dtermine tophits
# BLK8: Tanglegram (depends: chromoMap)
# BLK9: GWESExplorer (depends: GWESExplorer)
# BLK6: Genomewide LD Map
# BLK7: GWES_plots
# BLK8: Snpeff annotation pipeline, determine tophits
# BLK9: Tanglegram (depends: chromoMap)
# BLK10: GWESExplorer (depends: GWESExplorer)
# BLK11: Cleanup

#TODO: Add the option to provide genbank file without reference sequence
#TODO: Count through blocks and automate the displayed BLOCK NUMBER

#NOTE: SnpEff does not parse the GBK and GFF3 file from the same refseq reference genome the same way. There might be differences between annotations/tophits/etc.

# # Welcome message # #

# Sanity checks
Expand All @@ -72,8 +75,12 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
if(!is.null(gff3_path) & is.null(ref_fasta_path)) stop("Reference fasta file must be provided for gff3 annoations") # only one of gbk or gff can be NULL

if(SnpEff_Annotate == T) {
# Added snpEff to inst/extdata
snpeff_jar_path = system.file("extdata", "snpEff.jar", package = "LDWeaver")
######################################## These checks must be unnecessary now ########################################
if(is.null(snpeff_jar_path)) stop("<snpeff_jar_path> must be provided for annotations. To run without annotations, set SnpEff_Annotate = F")
if(!file.exists(snpeff_jar_path)) stop(paste("<SnpEff.jar> not found at:", snpeff_jar_path, "please check the path provided"))
######################################################################################################################
order_links = F # sr_links should be ordered at the end after adding annotations
} else {
order_links = T # sr_links will be ordered and saved without annotations
Expand Down Expand Up @@ -189,6 +196,7 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
######## Welcome message ########
{
timestamp()
cat("\n ***** This is LDWeaver", as.character(packageVersion(pkg = "LDWeaver")), " *****")
if(ncores > 1) cat(paste("\n\n Performing GWES analysis on:", dset, " - using", ncores, "cores\n\n"))
if(ncores == 1) cat(paste("\n\n Performing GWES analysis on:", dset, "\n\n"))
if(perform_SR_analysis_only) cat("Only short-range analysis requested. \n")
Expand Down
58 changes: 25 additions & 33 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ LDWeaver accepts a sequence alignment (fasta) and its reference annotation
pairs of variants (links) that is unusually high given the genomic distance
between the pair. This high LD could be the result of co-selection or epistasis.
Approximate statistical significance is used to rank outlier links and the
output is reported in `tsv` format, along with several other helpful figures.
Additionally, LDWeaver has functions assist the detection of genomic regions
output is reported in `tsv` format, along with several other helpful annotations and figures.
Additionally, LDWeaver has functions to assist the detection of genomic regions
that have potentially undergone co-selection or epistasis. LDWeaver `tsv` output
can be directly used as input for
<a href="https://github.com/jurikuronen/GWES-Explorer">GWESExplorer</a>
Expand Down Expand Up @@ -140,18 +140,10 @@ folder called `sample`, which should be created in the current working directory
## Performing Annotations

Additionally, LDWeaver has an interface to perform detailed annotations
using
<a href="https://pcingola.github.io/SnpEff/" target="_blank">SnpEff</a>.
Once downloaded, set the two options: `SnpEff_Annotate=T` and
`snpeff_jar_path=<path_to_snpEff.jar_file>`

> **Note** Since the genbank annotation file is provided, LDWeaver only requires
> the path to the \<snpEff.jar\> file. You can download this by visiting the
> <a href="https://pcingola.github.io/SnpEff/" target="_blank">SnpEff github page</a>.
Once set, this will create these outputs in the \<dset\> folder. Note that `X` in
the following outputs refer to **sr** (short range) or **lr** (long range).
By default, LDWeaver performs detailed annotations on all link SNPs using
<a href="https://pcingola.github.io/SnpEff/" target="_blank">SnpEff</a>.
This will create the following outputs in \<dset\>. Note that `X` here
refers to **sr** (short range) or **lr** (long range).

- Outputs

Expand All @@ -168,7 +160,7 @@ the following outputs refer to **sr** (short range) or **lr** (long range).
<a href="https://github.com/jurikuronen/GWES-Explorer" target="_blank">GWESExplorer</a>
(X = sr,lr).

> **Note** The default srp_cutoff is 3 (i.e. p=0.001). Short-range links
> **Note** The default srp_cutoff is 3 (i.e., p=0.001). Short-range links
> with p\>0.001 are automatically discarded; this can be modified using
> the \<srp_cutoff\> option. The default max_tophits value is 250; this
> can be modified using the \<max_tophits\> option.
Expand All @@ -195,17 +187,16 @@ To cite LDWeaver please use: Mallawaarachchi, Sudaraka et al. Detecting co-selec

## Detailed Workthrough using Real Data

The following analysis demonstrates some of the current options available in
The following analysis demonstrates most of the options available in
LDWeaver. The alignment with 616 *S. pneumoniae* genomes is available
<a href="https://cloudstor.aarnet.edu.au/plus/s/KBRnIt1H6XZ2XFR" target="_blank">here</a>, and
the same sample.gbk annotation was used to generate this alignment. This annotation is also available
<a href="https://www.ncbi.nlm.nih.gov/nuccore/NC_011900.1?report=gbwithparts&log$=seqview" target="_blank">here</a>.
the same sample.gbk annotation was used to generate this alignment (also available <a href="https://www.ncbi.nlm.nih.gov/nuccore/NC_011900.1?report=gbwithparts&log$=seqview" target="_blank">here</a>).

For this example, it is assumed that the current working directory is set to
`~/LDWeaver_run` and the
<a href="https://cloudstor.aarnet.edu.au/plus/s/KBRnIt1H6XZ2XFR" target="_blank">alignment</a>
and \<snpEff.jar\> file (see [above](#performing-annotations)) are available in the same folder. Please note that file paths here are written for Linux and macOS operating systems,
windows users will need to modify as required.
is available in the same folder. Please note that file paths here are written for Linux/macOS operating systems;
Windows users will need to modify them as needed.

The following few lines of code can perform the complete LDWeaver analysis.

Expand All @@ -218,19 +209,19 @@ setwd("~/LDWeaver_run")
dset <- "msch"
aln_path <- "spn23f_msch.aln.gz"
gbk_path <- system.file("extdata", "sample.gbk", package = "LDWeaver")
snpeff_jar_path <- "snpEff.jar"

LDWeaver::LDWeaver(dset = dset,
aln_path = aln_path,
gbk_path = gbk_path,
snpeff_jar_path = snpeff_jar_path,
save_additional_outputs = T)
```

While the `LDWeaver::LDWeaver()` one-liner function is generally
versatile for most analyses, it is possible to write a customised
pipeline using available functions. For a full list of available functions
and options, see: `help(package="LDWeaver")`.
The `LDWeaver::LDWeaver()` one-liner is versatile enough for most
analyses. If previously created outputs are available in \<dset\>, this
function will load those instead of repeating potentially time- and resource-heavy analyses.

It is also possible to write customised pipelines using available functions. For a full list of available functions
and options, run: `help(package="LDWeaver")`.

``` r
library(LDWeaver)
Expand All @@ -239,7 +230,6 @@ dir.create(dset) # folder to save outputs

aln_path <- "~/LDWeaver_run/spn23f_msch.aln.gz"
gbk_path <- system.file("extdata", "sample.gbk", package = "LDWeaver")
snpeff_jar_path <- "~/LDWeaver_run/snpEff.jar"
ncores = parallel::detectCores()

snp.dat = LDWeaver::parse_fasta_alignment(aln_path = aln_path) # parse the alignment and extract SNPs
Expand Down Expand Up @@ -276,8 +266,8 @@ LDWeaver::make_gwes_plots(sr_links = sr_links, plt_folder = dset)
``` r
# Identify the top hits by performing snpEff annotations
tophits = LDWeaver::perform_snpEff_annotations(dset_name = dset, annotation_folder = file.path(getwd(), dset),
snpeff_jar = snpeff_jar_path, gbk = gbk, gbk_path = gbk_path,
cds_var = cds_var, links_df = sr_links, snp.dat = snp.dat,
gbk = gbk, gbk_path = gbk_path, cds_var = cds_var,
links_df = sr_links, snp.dat = snp.dat,
tophits_path = "msch/sr_tophits.tsv")
```

Expand Down Expand Up @@ -311,8 +301,7 @@ Next step is to analyse the long range links
# Analyse long range links
LDWeaver::analyse_long_range_links(dset = dset, lr_links_path = "msch/lr_links.tsv",
sr_links_path = "msch/sr_links.tsv", SnpEff_Annotate = T,
snp.dat = snp.dat, snpeff_jar_path = snpeff_jar_path,
gbk_path = gbk_path, cds_var = cds_var)
snp.dat = snp.dat, gbk_path = gbk_path, cds_var = cds_var)
```
![](inst/sup/lr_gwes.png)

Expand All @@ -339,10 +328,13 @@ sites and their magnitude can be generated using:

``` r
# Generate the Network Plot for pbp genes
LDWeaver::create_network(LDWeaver::create_network_for_gene("pbp",

network = LDWeaver::create_network_for_gene("pbp",
sr_annotated_path = "msch/Annotated_links/sr_links_annotated.tsv",
lr_annotated_path = "msch/Annotated_links/lr_links_annotated.tsv",
level = 2),
level = 2)

LDWeaver::create_network(network,
plot_title = "pbp network",
netplot_path = "msch/pbp_network.png",
plot_w = 2000, plot_h = 2000)
Expand Down
Binary file added inst/extdata/snpEff.jar
Binary file not shown.
5 changes: 1 addition & 4 deletions man/LDWeaver.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 9bed364

Please sign in to comment.