From 232ad053c76ceed22311bcacea580d24081acf0e Mon Sep 17 00:00:00 2001 From: Sudaraka88 Date: Tue, 2 Jan 2024 14:48:15 +1100 Subject: [PATCH] NewYear_bugfix --- .github/workflows/r.yml | 3 ++- DESCRIPTION | 2 +- R/BacGWES.R | 42 ++++++++++++++++++++++++++++------------- R/io_functions.R | 4 ++-- R/lr_analyser.R | 16 +++++++--------- 5 files changed, 41 insertions(+), 26 deletions(-) diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index b1e4b4e..9acbcca 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -22,7 +22,7 @@ jobs: config: - {os: macos-latest, r: 'release'} - {os: windows-latest, r: 'release'} - #- {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} + - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} - {os: ubuntu-latest, r: 'release'} - {os: ubuntu-latest, r: 'oldrel-1'} @@ -49,3 +49,4 @@ jobs: - uses: r-lib/actions/check-r-package@v2 with: upload-snapshots: true + error-on: '"error"' diff --git a/DESCRIPTION b/DESCRIPTION index c6ba8d7..658f64b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: LDWeaver Type: Package Title:Genomewide Epistasis Analysis on Bacteria -Version: 1.3.1 +Version: 1.3.2 Authors@R: person("Sudaraka", "Mallawaarachchi", email = "smallawaarachchi@gmail.com", role = c("aut", "cre")) Maintainer: Sudaraka Mallawaarachchi Description:Perform genomewide epistasis analysis by evaluating the LD structure in bacteria. diff --git a/R/BacGWES.R b/R/BacGWES.R index 4c8e45a..074c00d 100644 --- a/R/BacGWES.R +++ b/R/BacGWES.R @@ -30,6 +30,7 @@ #' modelling since all links are used. However, setting a threshold > 2 will generally reduce the memory usage, plotting time (default = 3, i.e. corresponding to p = 0.001), #' and run time for ARACNE. If all links are required to be returned, set to 0 (i.e. corresponding to p = 1), range 0 - 5 #' @param tanglegram_break_segments specify the number of genome segments to prepare - one tanglegram per segment (default = 5), range 1 - 10. 
Set NULL to skip tanglegram +#' @param write_gwesExplorer specify whether output for GWESExplorer is required (default = T) #' @param multicore specify whether to use parallel processing (default = T) #' @param ncores specify the number of cores to use for parallel processing (default = NULL), will auto detect if NULL #' @param max_blk_sz specify maximum block size for MI computation (default = 10000), larger sizes require more RAM, range 1000 - 100000 @@ -48,7 +49,8 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path gap_freq = 0.15, maf_freq = 0.01, hdw_threshold = 0.1, perform_SR_analysis_only = F, SnpEff_Annotate = T, sr_dist = 20000, lr_retain_links = 1e6, max_tophits = 250, num_clusts_CDS = 3, srp_cutoff = 3, tanglegram_break_segments = 5, - multicore = T, max_blk_sz = 10000, ncores = NULL, save_additional_outputs = F){ + write_gwesExplorer = T, multicore = T, max_blk_sz = 10000, ncores = NULL, + save_additional_outputs = F){ # Build blocks # BLK1: Extract SNPs and create sparse Mx from MSA (fasta) # BLK2: Parse GBK or GFF+REF @@ -65,6 +67,8 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path #TODO: Provide the option to skip SNP extraction and use the whole provided alignment (redundant if pre-filtered) #TODO: Add the option to provide genbank file without reference sequence #TODO: Count through blocks and automate the displayed BLOCK NUMBER + #TODO: genbankr is being dropped from the newest bioconductor, add alternative (https://github.com/gmbecker/genbankr) + #TODO: Add Hamming Distance plot, can we have a SNP Tree + Hamming Distance weights to show population structure control? #NOTE: SnpEff does not parse the GBK and GFF3 file from the same refseq reference genome the same way. There might be differences between annotations/tophits/etc. 
# # Welcome message # # @@ -164,6 +168,12 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path # setup paths if(!file.exists(dset)) dir.create(dset) # save everything in here + + # Save console output as a text file + info_file = file.path(dset, paste("LDW_run_",format(Sys.time(), "%Y%m%d%H%M%S"), ".txt", sep = "")) + suppressWarnings(sink(file= NULL)) + sink(info_file, split = T) + add_path = file.path(dset, "Additional_Outputs") # Additional Outputs if(save_additional_outputs) { if(!file.exists(add_path)) dir.create(file.path(dset, "Additional_Outputs")) @@ -205,7 +215,8 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path cat(paste("All outputs will be saved to:", normalizePath(dset), "\n")) cat(paste("\n *** Input paths *** \n\n")) cat(paste("* Alignment:", aln_path, "\n")) - cat(paste("* GenBank Annotation:", gbk_path, "\n")) + if(!is.null(gbk_path)) cat(paste("* GenBank Annotation:", gbk_path, "\n")) + if(!is.null(gff3_path)) cat(paste("* GFF3 Annotation:", gff3_path, "\n")) if(!is.null(snpeff_jar_path)) cat(paste("* SnpEff Annotations will be performed on short-range links. SnpEff path:", snpeff_jar_path, "\n")) cat(paste("\n *** Parameters *** \n\n")) @@ -358,6 +369,7 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path if(nrow(sr_links) == 0){ + suppressWarnings(sink(file= NULL)) ### output info to text file stop("No potentially important sr_links were identified! 
Cannot continue analysis...") } @@ -385,26 +397,30 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path tophits = LDWeaver::read_TopHits(top_hits_path = tophits_path) } - # Additional paths if annotations are requested - # tanglegram - tanglegram_path = file.path(dset, "SR_Tanglegram") - if(!file.exists(tanglegram_path)) dir.create(tanglegram_path) - # GWESExplorer - gwesexplorer_path = file.path(dset, "SR_GWESExplorer") - if(!file.exists(gwesexplorer_path)) dir.create(gwesexplorer_path) - # NetworkPlot - netplot_path = file.path(dset, "SR_network_plot.png") # BLK8 if(!is.null(tanglegram_break_segments)){ + # tanglegram + tanglegram_path = file.path(dset, "SR_Tanglegram") + if(!file.exists(tanglegram_path)) dir.create(tanglegram_path) cat("\n\n #################### BLOCK 9 #################### \n\n") LDWeaver::create_tanglegram(tophits = tophits, gbk = gbk, gff = gff, tanglegram_folder = tanglegram_path, break_segments = tanglegram_break_segments) } # BLK9 - cat("\n\n #################### BLOCK 10 #################### \n\n") - LDWeaver::write_output_for_gwes_explorer(snp.dat = snp.dat, tophits = tophits, gwes_explorer_folder = gwesexplorer_path) + if(write_gwesExplorer){ + # GWESExplorer + gwesexplorer_path = file.path(dset, "SR_GWESExplorer") + if(!file.exists(gwesexplorer_path)) dir.create(gwesexplorer_path) + cat("\n\n #################### BLOCK 10 #################### \n\n") + LDWeaver::write_output_for_gwes_explorer(snp.dat = snp.dat, tophits = tophits, gwes_explorer_folder = gwesexplorer_path) + } + # BLK10 + # Additional paths if annotations are requested + # NetworkPlot + netplot_path = file.path(dset, "SR_network_plot.png") + cat("\n\n #################### BLOCK 11 #################### \n\n") LDWeaver::create_network(tophits = tophits, netplot_path = netplot_path, plot_title = paste("Networks in short-range tophits for", dset)) diff --git a/R/io_functions.R b/R/io_functions.R index fc814b0..050bd6d 100644 --- 
a/R/io_functions.R +++ b/R/io_functions.R @@ -250,7 +250,7 @@ cleanup = function(dset, delete_after_moving = F){ mv_success = c(mv_success, idx) } - idx = c(grep("cds_var.rds", files), grep("hdw.rds", files), grep("parsed_gbk.rds", files), grep("snp_ACGTN.rds", files)) + idx = c(grep("cds_var.rds", files), grep("hdw.rds", files), grep("parsed_gbk.rds", files), grep("parsed_gff3.rds", files), grep("snp_ACGTN.rds", files)) if(length(idx) > 0){ fldr = file.path(dset, "Additional_Outputs") cleanup_support(files = file.path(dset, files[idx]), fldr) @@ -298,7 +298,7 @@ cleanup = function(dset, delete_after_moving = F){ } #### Temp folder #### - idx = c(grep("snpEff", files), grep("*.vcf", files), grep("*annotations.tsv", files), grep("*_links.tsv", files)) + idx = c(grep("snpEff", files), grep("*.vcf", files), grep("*annotations.tsv", files), grep("*_links.tsv", files), grep("LDW_run_*", files)) if(length(idx) > 0){ fldr = file.path(dset, "Temp") cleanup_support(files = file.path(dset, files[idx]), fldr) diff --git a/R/lr_analyser.R b/R/lr_analyser.R index 3641b6d..90f1546 100644 --- a/R/lr_analyser.R +++ b/R/lr_analyser.R @@ -44,16 +44,14 @@ analyse_long_range_links = function(dset, lr_links_path, sr_links_path, are_lrli # NOTE: spydrpick does not add clusters, add them from paint (requires cds_var) if(SnpEff_Annotate == T) { - if( (is.null(gbk_path) & is.null(gff3_path)) | (!is.null(gbk_path) & !is.null(gff3_path)) ) stop("Either gbk_path or gff3_path must be provided. - To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F") # only one of gbk or gff can be NULL - if(!is.null(gff3_path) & is.null(ref_fasta_path)) stop("Reference fasta file must be provided for gff3 annoations. - To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F") # only one of gbk or gff can be NULL - if(is.null(snpeff_jar_path)) stop("You must specify for annotations. 
To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F") - if(!file.exists(snpeff_jar_path)) stop(paste(" not found at:", snpeff_jar_path, "please check the path provided")) - if(is.null(snpeff_jar_path)) stop("You must specify for annotations. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F") + if( (is.null(gbk_path) & is.null(gff3_path)) | (!is.null(gbk_path) & !is.null(gff3_path)) ) stop("Either gbk_path or gff3_path must be provided. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F") # only one of gbk or gff can be NULL + if(!is.null(gff3_path) & is.null(ref_fasta_path)) stop("Reference fasta file must be provided for gff3 annotations. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F") # only one of gbk or gff can be NULL + # if(is.null(snpeff_jar_path)) stop("You must specify for annotations. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F") + # if(!file.exists(snpeff_jar_path)) stop(paste(" not found at:", snpeff_jar_path, "please check the path provided")) + # if(is.null(snpeff_jar_path)) stop("You must specify for annotations. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F") # if(is.null(gbk_path)) stop("You must specify for annotations. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F") - if(is.null(snp.dat)) stop("You must specify for annotations. To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F") - if(is.null(cds_var)) stop("You must specify for annotations. 
To run without annotations, re-run analyse_long_range_links() with SnpEff_Annotate = F") + if(is.null(snp.dat)) stop("You must provide snp.dat to perform annotations.") + if(is.null(cds_var)) stop("You must specify cds_var to perform annotations.") } # lr_links_path = "~/Desktop/LDWeaver_RUN/maela/lr_links.tsv"