diff --git a/.travis.yml b/.travis.yml index cc7eaee3..aea091e5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,6 +3,7 @@ before_install: - sudo apt-get -qq update - sudo apt-get install xdotool - sudo apt-get install texlive-latex-extra + - sudo apt-get install libgfortran3 install: - bash install.sh script: bash etc/tests/test.sh diff --git a/.version b/.version index eeab19d3..8031930e 100644 --- a/.version +++ b/.version @@ -1 +1 @@ -v0.9.8 +v0.9.9 diff --git a/README.md b/README.md index 6946bfc2..db79ecdf 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,6 @@ conda activate iguide iguide list_samples configs/simulation.config.yml # Create test analysis directory -# (The simulation configuration file is used by default and does not need to be specified) iguide setup configs/simulation.config.yml @@ -71,19 +70,18 @@ iguide setup configs/simulation.config.yml iguide run configs/simulation.config.yml -- -np iguide run configs/simulation.config.yml -- --latency-wait 30 -cat analysis/simulation/output/unique_sites.simulation.csv -# Processing will complete with a report, but if additional analyses are required, -# you can reevaluate the 'incorp_sites' object. Multiple objects can be evaluated -# together, just include the run files. +# Processing will complete with several reports, but if additional analyses are required, +# you can re-evaluate a run by its config file. Multiple runs can be evaluated together, +# just include multiple config files. -iguide eval analysis/simulation/output/incorp_sites.simulation.rds \ +iguide eval configs/simulation.config.yml \ -o analysis/simulation/output/iguide.eval.simulation.test.rds \ -s sampleInfo/simulation.supp.csv # After evaluation, generate a report in a different format than standard. # Additionally the evaluation and report generation step can be combined using -# config file(s) as inputs for the 'report' subcommand. +# config file(s) as inputs for the 'report' subcommand (using the -c flag instead of -e). 
iguide report -e analysis/simulation/output/iguide.eval.simulation.test.rds \ -o analysis/simulation/reports/report.simulation.pdf \ @@ -91,7 +89,7 @@ iguide report -e analysis/simulation/output/iguide.eval.simulation.test.rds \ -t pdf # When you are all finished and ready to archive / remove excess files, a minimal configuration -# can be achived with the 'clean' subcommand. +# can be achieved with the 'clean' subcommand. iguide clean configs/simulation.config.yml @@ -106,6 +104,18 @@ conda deactivate ### Changelog: +**v0.9.9 (June 10th, 2019)** + +* Modified the assimilate + evaluate workflow + + Assimilate now only includes reference genome data, meaning a cleaner intermediate file + + Evaluate will now handle ref. gene sets and further analysis + + This increases the modularity and consistency of the workflow +* Revised the iGUIDE Report format to be more informational and clearer +* Revised a bit of the workflow to make reprocessing smoother +* Updated BLAT coupling script to be more memory efficient +* Fixed TravisCI testing! +* Changed stat workflow, now restarting analysis won't initiate a total reprocessing. + **v0.9.8 (April 19th, 2019)** * iGUIDE can now support non-Cas9 nucleases as well! diff --git a/Snakefile b/Snakefile index d3248012..ded32421 100644 --- a/Snakefile +++ b/Snakefile @@ -55,7 +55,10 @@ if not os.path.isdir(ROOT_DIR): # Check for sequence file paths if not os.path.isdir(config["Seq_Path"]): raise SystemExit("Path to sequencing files is not found (Seq_Path). Check your config file.") - + +# Check for config symlink to check proper run directory setup +if not os.path.isfile(RUN_DIR + "/config.yml"): + raise SystemExit("Path to symbolic config is not present. Check to make sure you've run 'iguide setup' first.") # Default params if not included in config if not "maxNcount" in config: @@ -100,7 +103,6 @@ if not "reportMB" in config: # Target Rules rule all: input: - uniq_sites=RUN_DIR + "/output/unique_sites." 
+ RUN + ".csv.gz", incorp_sites=RUN_DIR + "/output/incorp_sites." + RUN + ".rds", report=RUN_DIR + "/reports/report." + RUN + ".html", summary=RUN_DIR + "/reports/summary." + RUN + ".txt", diff --git a/docs/pages/changelog.rst b/docs/pages/changelog.rst index 15762dc9..72d44b25 100644 --- a/docs/pages/changelog.rst +++ b/docs/pages/changelog.rst @@ -6,6 +6,20 @@ ChangeLog ========= +**v0.9.9 (June 10th, 2019)** + +* Modified the assimilate + evaluate workflow + + - Assimilate now only includes reference genome data, meaning a cleaner intermediate file + - Evaluate will now handle ref. gene sets and further analysis + - This increases the modularity and consistency of the workflow + +* Revised the iGUIDE Report format to be more informational and clearer +* Revised a bit of the workflow to make reprocessing smoother +* Updated BLAT coupling script to be more memory efficient +* Fixed TravisCI testing! +* Changed stat workflow, now restarting analysis won't initiate a total reprocessing. + **v0.9.8 (April 19th, 2019)** * iGUIDE can now support non-Cas9 nucleases as well! diff --git a/docs/pages/quickstart.rst b/docs/pages/quickstart.rst index d020237d..3705f635 100644 --- a/docs/pages/quickstart.rst +++ b/docs/pages/quickstart.rst @@ -123,50 +123,47 @@ terminal, so you can see what snakemake is about to perform. Next, the test data is moved to the input directory underneath the new test run directory. Then the entirety of processing can start.:: - # After constructing the config file and having reference files - # (i.e. sampleinfo). You can check the samples associated with the run. - + # If conda is not in your path ... + + source ${HOME}/miniconda3/etc/profile.d/conda.sh + + # Activate iguide environment + + conda activate iguide + + # After constructing the config file and having reference files (i.e. sampleinfo) + # You can check the samples associated with the run. 
+ iguide list_samples configs/simulation.config.yml # Create test analysis directory - # (The simulation configuration file is used by default and does not need to - # be specified) - + iguide setup configs/simulation.config.yml # Process a simulation dataset iguide run configs/simulation.config.yml -- -np iguide run configs/simulation.config.yml -- --latency-wait 30 - zcat analysis/simulation/output/unique_sites.simulation.csv.gz - # Processing will complete with a report, but if additional analyses are - # required, you can reevaluate the 'incorp_sites' object. Multiple objects - # can be evaluated together, just include the run files. + # Processing will complete with several reports, but if additional analyses are required, + # you can re-evaluate a run by its config file. Multiple runs can be evaluated together, + # just include multiple config files. - iguide eval analysis/simulation/output/incorp_sites.simulation.rds \ + iguide eval configs/simulation.config.yml \ -o analysis/simulation/output/iguide.eval.simulation.test.rds \ -s sampleInfo/simulation.supp.csv # After evaluation, generate a report in a different format than standard. - # Additionally the evaluation and report generation step can be combined using - # config file(s) as inputs for the 'report' subcommand. For PDF output, you'll - # need to verify that your system has the correct latex-based software - # support, such as 'texlive'. + # Additionally the evaluation and report generation step can be combined using + # config file(s) as inputs for the 'report' subcommand (using the -c flag instead of -e). iguide report -e analysis/simulation/output/iguide.eval.simulation.test.rds \ -o analysis/simulation/reports/report.simulation.pdf \ -s sampleInfo/simulation.supp.csv \ -t pdf - # If you are looking for a quick and consise report of the output, use the - # 'summary' subcommand with input of either a config file(s) or a single - # evaluation file, generated by the 'eval' subcommand. 
- - iguide summary -e analysis/simulation/output/iguide.eval.simulation.test.rds - - # When you are all finished and ready to archive / remove excess files, a - # minimal configuration can be achived with the 'clean' subcommand. + # When you are all finished and ready to archive / remove excess files, a minimal configuration + # can be achieved with the 'clean' subcommand. iguide clean configs/simulation.config.yml @@ -174,6 +171,9 @@ entirety of processing can start.:: iguide clean configs/simulation.config.yml --remove_proj + # Deactivate the environment + + conda deactivate Uninstall --------- diff --git a/etc/tests/simulation.digests.yml b/etc/tests/simulation.digests.yml index 23a244e6..0b987669 100644 --- a/etc/tests/simulation.digests.yml +++ b/etc/tests/simulation.digests.yml @@ -1,16 +1,16 @@ # Simulation output check sums (md5) file1 : - name : "unique_sites.simulation.csv" - path : "analysis/simulation/output/unique_sites.simulation.csv.gz" - md5 : "c4cd8f98201fe8dd613f4eab9287cc78" - -file2 : name : "incorp_sites.simulation.rds" path : "analysis/simulation/output/incorp_sites.simulation.rds" - md5 : "1c92defd2037daf5ad540157ff523663" + md5 : "dbfc152e8e3ee129c11ab953ed8aa9de" + +file2 : + name : "stats.core.simulation.csv" + path : "analysis/simulation/output/stats.core.simulation.csv" + md5 : "b593a0e58c97f48a5b367184bd440ad3" file3 : - name : "stats.simulation.csv" - path : "analysis/simulation/output/stats.simulation.csv" - md5 : "c3f087be1a092d1d7dfce053936fb238" + name : "stats.eval.simulation.csv" + path : "analysis/simulation/output/stats.eval.simulation.csv" + md5 : "57158a4826685a8024b3281284ac42b6" diff --git a/etc/tests/test.sh b/etc/tests/test.sh index 1a99bdd3..e64b17a8 100755 --- a/etc/tests/test.sh +++ b/etc/tests/test.sh @@ -15,14 +15,15 @@ conda activate ${__IGUIDE_ENV} # Create test analysis directory iguide setup configs/simulation.config.yml -# Generate test DAG graph +# Generate test DAG graph and run iguide run 
configs/simulation.config.yml -- -np iguide run configs/simulation.config.yml -- --dag --nolock | dot -Tsvg > \ analysis/simulation/reports/simulation.dag.svg -iguide run configs/simulation.config.yml -- -p -w 30 --nolock --cores ${__CORES} +iguide run configs/simulation.config.yml -- -p -w 30 --notemp --nolock --cores ${__CORES} +# Evaluate and report out using a different metadata set iguide eval configs/simulation.config.yml \ -o analysis/simulation/reports/iguide.eval.simulation.test.rds \ -s sampleInfo/simulation.supp.csv diff --git a/rules/arch.rules b/rules/arch.rules index 4923f336..a2d7c6bf 100644 --- a/rules/arch.rules +++ b/rules/arch.rules @@ -2,7 +2,7 @@ # Architecture Rules # Related to setting up analysis directories and consolidating data -rule generate_stat_matrix: +rule core_stat_matrix: input: demulti=RUN_DIR + "/process_data/" + RUN + ".demulti.stat", trimR1=expand( @@ -22,22 +22,29 @@ rule generate_stat_matrix: RUN_DIR + "/process_data/{sample}.align.stat", sample=SAMPLES), assim=RUN_DIR + "/process_data/" + RUN + ".assim.stat" output: - RUN_DIR + "/output/stats." + RUN + ".csv" + RUN_DIR + "/output/stats.core." + RUN + ".csv" params: - dir=RUN_DIR + "/process_data", tool=ROOT_DIR + "/tools/rscripts/collect_stats.R" resources: mem_mb = lambda wildcards, attempt: attempt * config["defaultMB"] - shell: - """ - Rscript {params.tool} {params.dir} -o {output} - """ + shell: "Rscript {params.tool} {input} -o {output}" + +rule eval_stat_matrix: + input: RUN_DIR + "/process_data/" + RUN + ".eval.stat" + output: RUN_DIR + "/output/stats.eval." + RUN + ".csv" + params: + tool=ROOT_DIR + "/tools/rscripts/collect_stats.R" + resources: + mem_mb = lambda wildcards, attempt: attempt * config["defaultMB"] + shell: "Rscript {params.tool} {input} -o {output}" rule gen_stat_report: - input: RUN_DIR + "/output/stats." + RUN + ".csv" + input: + core = RUN_DIR + "/output/stats.core." + RUN + ".csv", + eval = RUN_DIR + "/output/stats.eval." 
+ RUN + ".csv" output: RUN_DIR + "/reports/runstats." + RUN + ".html" params: - tool = ROOT_DIR + "/tools/rscripts/write_stat_report.R", + tool = ROOT_DIR + "/tools/rscripts/generate_stat_report.R", config = RUN_DIR + "/" + "config.yml" log: RUN_DIR + "/logs/" + RUN + ".runstats.log" resources: diff --git a/rules/consol.rules b/rules/consol.rules index cc2079f5..ae44947a 100644 --- a/rules/consol.rules +++ b/rules/consol.rules @@ -17,6 +17,6 @@ rule consolidate: shell: """ Rscript {params.tool} {input} -o {output.consol} -k {output.key} \ - --stat {output.stat} > {log} 2>&1 + --stat {output.stat} > {log} 2>&1 """ diff --git a/rules/demulti.rules b/rules/demulti.rules index 20e41f1e..9363f258 100644 --- a/rules/demulti.rules +++ b/rules/demulti.rules @@ -4,7 +4,7 @@ rule demultiplex: input: configFile=ancient("configs/" + RUN + ".config.yml"), - sampleInfo=config["Sample_Info"], + sampleInfo=ancient(config["Sample_Info"]), R1=str(Path(config["Seq_Path"]) / config["R1"]), R2=str(Path(config["Seq_Path"]) / config["R2"]), I1=str(Path(config["Seq_Path"]) / config["I1"]), @@ -19,8 +19,7 @@ rule demultiplex: RUN_DIR + "/process_data/degenerate.{type}.fastq.gz", type=TYPES)), unas=temp(expand( RUN_DIR + "/process_data/unassigned.{type}.fastq.gz", type=TYPES)), - stat=temp( - RUN_DIR + "/process_data/" + RUN + ".demulti.stat") + stat=temp(RUN_DIR + "/process_data/" + RUN + ".demulti.stat") params: tool=ROOT_DIR + "/tools/rscripts/demulti.R", bc1Len=config["barcode1Length"], diff --git a/rules/process.rules b/rules/process.rules index 046aa377..8deb9844 100644 --- a/rules/process.rules +++ b/rules/process.rules @@ -5,7 +5,7 @@ rule all_uniq_sites: input: expand(RUN_DIR + "/process_data/{sample}.uniq.csv", sample=SAMPLES) output: - RUN_DIR + "/output/unique_sites." + RUN + ".csv" + temp(RUN_DIR + "/output/unique_sites." 
+ RUN + ".csv") params: RUN_DIR + "/process_data" resources: @@ -18,15 +18,6 @@ rule all_uniq_sites: done """ -rule compress_uniq_sites: - input: - sites=RUN_DIR + "/output/unique_sites." + RUN + ".csv", - edits=RUN_DIR + "/output/incorp_sites." + RUN + ".rds" - output: RUN_DIR + "/output/unique_sites." + RUN + ".csv.gz" - resources: - mem_mb = lambda wildcards, attempt: attempt * config["defaultMB"] - shell: "gzip {input.sites}" - def all_umitag_inputs(wildcards): if (config["UMItags"]): @@ -51,8 +42,8 @@ rule assimilate_sites: incorp=RUN_DIR + "/output/incorp_sites." + RUN + ".rds", stat=temp(RUN_DIR + "/process_data/" + RUN + ".assim.stat") params: - config = RUN_DIR + "/" + "config.yml", - tool = ROOT_DIR + "/tools/rscripts/assimilate_incorp_data.R" + config=RUN_DIR + "/" + "config.yml", + tool=ROOT_DIR + "/tools/rscripts/assimilate_incorp_data.R" log: RUN_DIR + "/logs/" + RUN + ".assim.log" resources: @@ -69,7 +60,9 @@ rule assimilate_sites: rule iguide_evaluation: input: RUN_DIR + "/output/incorp_sites." + RUN + ".rds" - output: temp(RUN_DIR + "/output/iguide.eval." + RUN + ".rds") + output: + eval=temp(RUN_DIR + "/output/iguide.eval." + RUN + ".rds"), + stat=temp(RUN_DIR + "/process_data/" + RUN + ".eval.stat") params: tool = ROOT_DIR + "/tools/rscripts/evaluate_incorp_data.R", config = RUN_DIR + "/" + "config.yml" @@ -77,10 +70,10 @@ rule iguide_evaluation: resources: mem_mb=lambda wildcards, attempt: attempt * config["evaluateMB"] run: - call_str="Rscript {params.tool} {params.config} -o {output}" + call_str="Rscript {params.tool} {params.config} -o {output.eval}" if (config["suppFile"]): call_str=call_str + " -s " + ROOT_DIR + "/" + config["Supplemental_Info"] - call_str=call_str + " > {log} 2>&1" + call_str=call_str + " --stat {output.stat} > {log} 2>&1" shell(call_str) @@ -111,6 +104,7 @@ rule run_report: Rscript {params.tool} {input} -o {output} {params.supp} > {log} 2>&1 """ + rule run_summary: input: RUN_DIR + "/output/iguide.eval." 
+ RUN + ".rds" output: RUN_DIR + "/reports/summary." + RUN + ".txt" diff --git a/rules/quality.blat.rules b/rules/quality.blat.rules index bb7394f8..71368c83 100644 --- a/rules/quality.blat.rules +++ b/rules/quality.blat.rules @@ -9,8 +9,8 @@ rule post_align: keyR2=RUN_DIR + "/process_data/{sample}.R2.key.csv" output: uniq=temp(RUN_DIR + "/process_data/{sample}.uniq.csv"), - chimera=RUN_DIR + "/process_data/{sample}.chimera.rds", - multihit=RUN_DIR + "/process_data/{sample}.multihits.rds", + chimera=temp(RUN_DIR + "/process_data/{sample}.chimera.rds"), + multihit=temp(RUN_DIR + "/process_data/{sample}.multihits.rds"), stat=temp(RUN_DIR + "/process_data/{sample}.align.stat") params: tool=ROOT_DIR + "/tools/rscripts/couple.R", diff --git a/rules/umitag.rules b/rules/umitag.rules index 0135fcca..d60d201f 100644 --- a/rules/umitag.rules +++ b/rules/umitag.rules @@ -6,7 +6,7 @@ rule collect_umitags: RUN_DIR + "/process_data/{sample}.I2.fastq.gz" output: seq=temp(RUN_DIR + "/process_data/{sample}.I2.trim.fastq.gz"), - umi=RUN_DIR + "/process_data/{sample}.umitags.fasta.gz", + umi=temp(RUN_DIR + "/process_data/{sample}.umitags.fasta.gz"), stat=temp(RUN_DIR + "/process_data/{sample}.umitags.stat") params: tool=ROOT_DIR + "/tools/rscripts/trim.R", diff --git a/tools/iguidelib/iguidelib/scripts/command.py b/tools/iguidelib/iguidelib/scripts/command.py index 0369fe9a..b5ffa587 100644 --- a/tools/iguidelib/iguidelib/scripts/command.py +++ b/tools/iguidelib/iguidelib/scripts/command.py @@ -20,7 +20,7 @@ def main(): " primary:\n" " setup \tCreate a new config file for a project using local data.\n" " run \tExecute the iGUIDE pipeline.\n\n" - " accessory:\n" + " auxiliary:\n" " eval \tEvaluate a set or sets of assimilated iGUIDE outputs.\n" " report \tGenerate a custom report from iGUIDE output files.\n" " summary \tGenerate a consise summary from iGUIDE output files.\n" diff --git a/tools/rscripts/assimilate_incorp_data.R b/tools/rscripts/assimilate_incorp_data.R index 
c8614245..04ddc74a 100644 --- a/tools/rscripts/assimilate_incorp_data.R +++ b/tools/rscripts/assimilate_incorp_data.R @@ -50,7 +50,7 @@ parser$add_argument( "-m", "--multihits", nargs = "+", type = "character", help = paste( "Path(s) to associated multihit files (.rds) as produced by coupling", - "BLAT output files. Multiple file paths can be separated by a space." + "alignment output files. Multiple file paths can be separated by a space." ) ) @@ -101,6 +101,17 @@ input_table <- input_table[ ), ] +## Remove output file(s) if existing +if( args$stat != FALSE ){ + output_files <- c(args$output, args$stat) +}else{ + output_files <- c(args$output) +} + +if( any(sapply(output_files, file.exists)) ){ + null <- lapply(output_files, unlink) +} + # Log inputs cat("\nAssimilate Inputs:\n") print( @@ -120,30 +131,17 @@ code_dir <- dirname(sub( ## Load in supporting functions for the analysis source(file.path(code_dir, "supporting_scripts/iguide_support.R")) -source(file.path(code_dir, "supporting_scripts/nucleotideScoringMatrices.R")) # Inputs and parameters ---- # Run parameters and sample parameters config <- yaml::yaml.load_file(args$config) -sample_info_path <- file.path(args$iguide_dir, config$Sample_Info) - -if( !(file.exists(sample_info_path) | file.exists(config$Sample_Info)) ){ - - stop( - "\n Specified Sample Info file not found: ", config$Sample_Info, "\n" - ) - -}else if( !file.exists(sample_info_path) ){ - - sample_info_path <- file.path(config$Sample_Info) - -} - -sample_info <- data.table::fread(input = sample_info_path, data.table = FALSE) - -submat <- banmat() +## These parameters dictate part of the following analysis if multihit +## alignments are to be considered in the analysis. +upstream_dist <- config$upstreamDist +downstream_dist <- config$downstreamDist +pile_up_min <- config$pileUpMin # Load reference genome ---- ## Load a reference genome from a fasta file or a BSGenome reference. 
@@ -215,184 +213,8 @@ if( grepl(".fa", config$Ref_Genome) ){ } -# Incorporation site parameters ---- -## These parameters are pulled straight from the run config file and describe -## how the following analysis will be conducted. - -upstream_dist <- config$upstreamDist -downstream_dist <- config$downstreamDist -max_target_mismatch <- config$maxTargetMismatch -pile_up_min <- config$pileUpMin -on_target_sites <- config$On_Target_Sites - -# Load data related to how samples were processed ---- -## The treatment object dictates how each sample was treated, or which guide -## RNAs where used on which samples. This is important in the results -## interpretation. - -treatment <- config$Treatment - -if( any(grepl("sampleInfo:", treatment[1])) ){ - - info_col <- match( - x = stringr::str_extract(string = treatment[1], pattern = "[\\w]+$"), - table = names(sample_info) - ) - - if( length(info_col) != 1 ){ - stop("\n Cannot parse treatment data. Check config yaml and sampleInfo.\n") - } - - treatment_df <- data.frame( - sampleName = sample_info$sampleName, - treatment = sample_info[,info_col] - ) - - treatment_df$specimen <- stringr::str_extract( - string = treatment_df$sampleName, pattern = "[\\w]+" - ) - - treatment_df <- unique(treatment_df[,c("specimen", "treatment")]) - treatment <- strsplit(as.character(treatment_df$treatment), ";") - names(treatment) <- treatment_df$specimen - -}else if( any(grepl("all", names(treatment))) ){ - - treatment_df <- data.frame( - sampleName = sample_info$sampleName, - treatment = unique(unlist(treatment)) - ) - - treatment_df$specimen <- stringr::str_extract( - string = treatment_df$sampleName, - pattern = "[\\w]+" - ) - - treatment_df <- unique(treatment_df[,c("specimen", "treatment")]) - treatment <- strsplit(as.character(treatment_df$treatment), ";") - names(treatment) <- treatment_df$specimen - -}else{ - - treatment_df <- data.frame( - "specimen" = names(treatment), - "treatment" = sapply(treatment, paste, collapse = ";") - ) - -} - - 
-## Identify the treatment of nucleases as well - -nuclease <- config$Nuclease - -if( any(grepl("sampleInfo:", nuclease[1])) ){ - - info_col <- match( - x = stringr::str_extract(string = nuclease[1], pattern = "[\\w]+$"), - table = names(sample_info) - ) - - if( length(info_col) != 1 ){ - stop("\n Cannot parse nuclease data. Check config yaml and sampleInfo.\n") - } - - nuclease_df <- data.frame( - sampleName = sample_info$sampleName, - nuclease = sample_info[,info_col] - ) - - nuclease_df$specimen <- stringr::str_extract( - string = nuclease_df$sampleName, pattern = "[\\w]+" - ) - - nuclease_df <- unique(nuclease_df[,c("specimen", "nuclease")]) - nuclease <- strsplit(as.character(nuclease_df$nuclease), ";") - names(nuclease) <- nuclease_df$specimen - -}else if( any(grepl("all", names(nuclease))) ){ - - nuclease_df <- data.frame( - sampleName = sample_info$sampleName, - nuclease = unique(unlist(nuclease)) - ) - - nuclease_df$specimen <- stringr::str_extract( - string = nuclease_df$sampleName, - pattern = "[\\w]+" - ) - - nuclease_df <- unique(nuclease_df[,c("specimen", "nuclease")]) - nuclease <- strsplit(as.character(nuclease_df$nuclease), ";") - names(nuclease) <- nuclease_df$specimen - -}else{ - - nuclease_df <- data.frame( - "specimen" = names(nuclease), - "nuclease" = sapply(nuclease, paste, collapse = ";") - ) - -} - -nuclease_treaments <- dplyr::left_join( - nuclease_df, treatment_df, by = "specimen" -) - -target_combn <- structure( - strsplit(nuclease_treaments$treatment, ";"), - names = nuclease_treaments$specimen -) - -combn_tbl <- data.frame( - nuclease = nuclease_treaments$nuclease[ - as.vector(match( - S4Vectors::Rle(names(target_combn), lengths(target_combn)), - nuclease_treaments$specimen - )) - ], - target = unlist(target_combn), - row.names = NULL - ) %>% - dplyr::filter(target != "Mock") %>% - dplyr::distinct() - - -# Load target sequences and sample metadata ---- -## Identify the target sequences used for the analysis and build an object -## use 
further on in processing to analyse the samples. - -target_seqs <- lapply(config$Target_Sequences, toupper) - -pam_seq <- lapply( - unique(unlist(nuclease)), - function(x) toupper(config$Nuclease_Profiles[[x]]$PAM) -) - -names(pam_seq) <- unique(unlist(nuclease)) - -combn_tbl <- combn_tbl %>% - dplyr::mutate( - sequence = target_seqs[target], - PAM = pam_seq[nuclease] - ) - -# Log combination treatment table -cat("\nTarget Sequence Table:\n") -print(combn_tbl, right = FALSE, row.names = FALSE) - - -# Log treatment table -cat("\nSpecimen Target Treatment:\n") -null <- lapply(seq_along(treatment), function(i){ - cat(" ", names(treatment)[i], ":\n", sep = "") - cat(" nuclease : ", nuclease[[i]], "\n", sep = "") - cat(" targets : ", paste(treatment[[i]], collapse = ", "), "\n", sep = "") -}) - - # Load input data ---- -# Unique sites ---- +## Unique sites ---- ## This object is the alignment positions for the sequences / reads that only ## aligned to a single location on the reference genome. 
@@ -524,453 +346,21 @@ if( all(!is.null(args$umitags)) ){ } -# Process input data ---- -# Format input alignments ---- -## All alignments - -algnmts <- dplyr::mutate( - reads, - specimen = stringr::str_extract(sampleName, "[\\w]+") -) - -## Determine abundance metrics, with or without UMItags -if( config$UMItags & !is.null(args$umitags) ){ - - algnmts <- dplyr::arrange(algnmts, desc(contrib)) %>% - dplyr::group_by(seqnames, start, end, strand, specimen, sampleName) %>% - dplyr::summarise( - count = sum(contrib), - umitag = sum(as.integer(!duplicated(umitag[!is.na(umitag)])) * contrib), - contrib = max(contrib) - ) %>% - dplyr::ungroup() %>% - dplyr::select( - seqnames, start, end, strand, specimen, - sampleName, count, umitag, contrib - ) %>% - as.data.frame() - -}else{ - - algnmts <- dplyr::group_by( - algnmts, seqnames, start, end, strand, specimen, sampleName - ) %>% - dplyr::summarize(count = sum(contrib), contrib = max(contrib)) %>% - dplyr::ungroup() %>% - dplyr::select( - seqnames, start, end, strand, specimen, sampleName, count, contrib - ) %>% - as.data.frame() - -} - -## Generate a sample table of the data for log purposes -sample_index <- ifelse(nrow(algnmts) > 10, 10, nrow(algnmts)) -sample_index <- sample(seq_len(nrow(algnmts)), sample_index, replace = FALSE) - -cat("\nSample of aligned templates:\n") - -print( - data.frame(algnmts[sample_index,]), - right = FALSE, - row.names = FALSE -) - -cat(paste0("\nNumber of templates: ", nrow(algnmts), "\n")) - -rm(sample_index) - -## Transform the data into a GRanges object -algnmts_gr <- GenomicRanges::GRanges( - seqnames = algnmts$seqnames, - ranges = IRanges::IRanges(start = algnmts$start, end = algnmts$end), - strand = algnmts$strand, - seqinfo = GenomeInfoDb::seqinfo(ref_genome) -) - -if( config$UMItags & !is.null(args$umitags) ){ - - GenomicRanges::mcols(algnmts_gr) <- dplyr::select( - algnmts, specimen, sampleName, count, umitag, contrib - ) - -}else{ - - GenomicRanges::mcols(algnmts_gr) <- 
dplyr::select( - algnmts, specimen, sampleName, count, contrib - ) - -} - -# Analyze alignments ---- -## Identify groups of alignments or pileups of aligned fragments -## These pileups give strong experimental evidence of directed incorporation of -## the dsODN into a region. Initially, pileups are identified and then checked -## for pairing, or if there is another pileup on the opposite strand in close -## proximity. -algnmts_gr$clus.ori <- pileupCluster( - gr = algnmts_gr, - grouping = "specimen", - maxgap = 0L, - return = "simple" -) - -algnmts_gr$paired.algn <- identifyPairedAlgnmts( - gr = algnmts_gr, - grouping = "specimen", - maxgap = upstream_dist*2 -) - -algnmts_grl <- split(algnmts_gr, unlist(nuclease)[algnmts_gr$specimen]) - -annot_clust_info <- dplyr::bind_rows(lapply( - seq_along(algnmts_grl), - function(i, grl){ - - gr <- grl[[i]] - nuc <- names(grl)[i] - - if( !nuc %in% names(config$Nuclease_Profiles) ){ - nuc_profile <- NULL - }else{ - nuc_profile <- config$Nuclease_Profiles[[nuc]] - } - - ## Create a GRange with only the unique cluster origins - split_clus_id <- stringr::str_split( - string = unique(paste0(gr$specimen, ":", gr$clus.ori)), - pattern = ":", - simplify = TRUE - ) - - algn_clusters <- GenomicRanges::GRanges( - seqnames = split_clus_id[,2], - ranges = IRanges::IRanges( - start = as.numeric(split_clus_id[,4]), width = 1 - ), - strand = split_clus_id[,3], - seqinfo = GenomeInfoDb::seqinfo(ref_genome) - ) - - algn_clusters$specimen <- split_clus_id[,1] - algn_clusters$clus.ori <- vcollapse(split_clus_id[, 2:4], sep = ":") - - algn_clusters$clus.seq <- getSiteSeqs( - gr = algn_clusters, - upstream.flank = upstream_dist, - downstream.flank = downstream_dist, - ref.genome = ref_genome - ) - - ## Identify which target sequences binding near clusters - if( !is.null(nuc_profile) ){ - - algn_clusters <- compareTargetSeqs( - gr.with.sequences = algn_clusters, - seq.col = "clus.seq", - target.seqs = target_seqs, - tolerance = max_target_mismatch, - 
nuc.profile = nuc_profile, - submat = submat, - upstream.flank = upstream_dist, - downstream.flank = downstream_dist - ) - - }else{ - - algn_clusters$target.match <- "No_valid_match" - algn_clusters$target.mismatch <- NA - algn_clusters$target.score <- NA - algn_clusters$aligned.sequence <- NA - algn_clusters$edit.site <- NA - - } - - as.data.frame(GenomicRanges::mcols(algn_clusters)) - - }, - grl = algnmts_grl -)) - - -## Merge the target sequence alignment information from the clusters back to all -## unique alignments -algnmts <- as.data.frame(merge( - x = as.data.frame(algnmts_gr), - y = dplyr::select(annot_clust_info, -clus.seq), - by = c("specimen", "clus.ori") -)) - -## Change guideRNA.match to No_Valid_Match if an inappropriate gRNA is annotated -algnmts$target.match <- filterInappropriateComparisons( - guideRNA.match = algnmts$target.match, - specimen = algnmts$specimen, - treatment = treatment -) - -## Fragment pileups, paired clustering, and guideRNA alignments have been used -## to characterize the incorporation sites analyzed here. Each metric will be -## used to create a list of incorporation sites that may be nuclease cut sites. -## The following identifies which alignments are associated with each of these -## criteria. 
-tbl_clus_ori <- algnmts %>% - dplyr::group_by(specimen, clus.ori) %>% - dplyr::filter(n() >= pile_up_min) %>% - dplyr::ungroup() %$% - table(clus.ori) - -idx_clus_ori <- which(algnmts$clus.ori %in% names(tbl_clus_ori)) - -tbl_paired_algn <- algnmts %>% - dplyr::filter(!is.na(paired.algn)) %$% - table(paired.algn) - -idx_paired_algn <- which(algnmts$paired.algn %in% names(tbl_paired_algn)) - -idx_matched <- which(algnmts$target.match != "No_valid_match") - -idx_combined <- sort(unique(c(idx_clus_ori, idx_paired_algn, idx_matched))) - -idx_df <- data.frame( - "Type" = c("PileUp", "Paired", "gRNA_Matched", "Combined"), - "Counts" = sapply( - list(idx_clus_ori, idx_paired_algn, idx_matched, idx_combined), - length - ) -) - -cat("\nTable of uniquely aligned template counts:\n") -print(idx_df, right = FALSE, row.names = FALSE) -cat(paste0("\nTotal number of templates: ", nrow(algnmts), "\n")) - -probable_algns <- algnmts[idx_combined,] - -probable_algns$on.off.target <- ifelse( - probable_algns$edit.site %in% expandPosStr(unlist(on_target_sites)), - "On-target", - "Off-target" -) - -cat("\nOn / Off target alignment counts:\n") -print(table(probable_algns$on.off.target)) - - -## Create summary and output formated object related to each of the criteria for -## edited site detection. 
- -## Matched alignments -matched_algns <- probable_algns[ - probable_algns$target.match != "No_valid_match", -] - -matched_summary <- matched_algns %>% - dplyr::mutate( - target.match = stringr::str_replace( - string = target.match, - pattern = "\\:\\([\\w]+\\)$", - replacement = "" - ) - ) %>% - dplyr::group_by( - specimen, edit.site, aligned.sequence, target.match, target.mismatch - ) - -if( config$UMItags & !is.null(args$umitags) ){ - - matched_summary <- dplyr::summarise( - matched_summary, - on.off.target = paste(sort(unique(on.off.target)), collapse = ";"), - paired.algn = paste(sort(unique(paired.algn)), collapse = ";"), - count = sum(count), - umitag = sum(umitag), - algns = sum(contrib), - orient = paste(sort(unique(as.character(strand))), collapse = ";") - ) - -}else{ - - matched_summary <- dplyr::summarise( - matched_summary, - on.off.target = paste(sort(unique(on.off.target)), collapse = ";"), - paired.algn = paste(sort(unique(paired.algn)), collapse = ";"), - count = sum(count), - algns = sum(contrib), - orient = paste(sort(unique(as.character(strand))), collapse = ";") - ) - -} - -matched_summary <- dplyr::ungroup(matched_summary) %>% - dplyr::arrange(specimen, target.match, desc(algns)) %>% - as.data.frame() - -## Paired alignments -paired_algns <- probable_algns[ - probable_algns$paired.algn %in% names(tbl_paired_algn), -] - -paired_regions <- paired_algns %>% - dplyr::group_by(specimen, paired.algn, strand) %>% - dplyr::mutate(pos = ifelse(strand == "+", min(start), max(end))) %>% - dplyr::group_by(specimen, paired.algn) - -if( config$UMItags & !is.null(args$umitags) ){ - - paired_regions <- dplyr::summarise( - paired_regions, - seqnames = unique(seqnames), - start = min(pos), - end = max(pos), - mid = start + (end-start)/2, - strand = "*", - width = end - start, - count = sum(count), - umitag = sum(umitag), - algns = sum(contrib) - ) %>% - dplyr::ungroup() - -}else{ - - paired_regions <- dplyr::summarise( - paired_regions, - seqnames = 
unique(seqnames), - start = min(pos), - end = max(pos), - mid = start + (end-start)/2, - strand = "*", - width = end - start, - count = sum(count), - algns = sum(contrib) - ) %>% - dplyr::ungroup() - -} - -if( nrow(paired_regions) > 0 ){ - - paired_regions <- paired_regions %>% - dplyr::group_by(specimen, paired.algn) %>% - dplyr::mutate( - on.off.target = ifelse( - any(sapply( - expandPosStr(unlist(on_target_sites[ - which( - stringr::str_extract( - names(on_target_sites), "[\\w\\-\\_\\.]+") %in% - treatment[[specimen]] - ) - ])), - function(x, seq, st, en){ - - match_seq <- seq == stringr::str_extract(x, "[\\w]+") - - within_start <- st <= - as.numeric(stringr::str_extract(x, "[\\w]+$")) + downstream_dist - - within_end <- en >= - as.numeric(stringr::str_extract(x, "[\\w]+$")) - downstream_dist - - match_seq & within_start & within_end - - }, - seq = seqnames, - st = start, - en = end - )), - "On-target", - "Off-target" - ) - ) %>% - dplyr::ungroup() %>% - as.data.frame() - -}else{ - - paired_regions <- dplyr::mutate( - paired_regions, - on.off.target = vector(mode = "character") - ) - -} - -## Pile up alignments -pile_up_algns <- probable_algns[ - probable_algns$clus.ori %in% names(tbl_clus_ori), -] - -pile_up_summary <- pile_up_algns %>% - dplyr::mutate( - target.match = stringr::str_replace( - string = target.match, - pattern = "\\:\\([\\w]+\\)$", - replacement = "" - ) - ) %>% - dplyr::group_by(specimen, clus.ori) - -if( config$UMItags & !is.null(args$umitags) ){ - - pile_up_summary <- dplyr::summarise( - pile_up_summary, - on.off.target = paste(sort(unique(on.off.target)), collapse = ";"), - paired.algn = paste(sort(unique(paired.algn)), collapse = ";"), - count = sum(count), - umitag = sum(umitag), - algns = sum(contrib) - ) - -}else{ - - pile_up_summary <- dplyr::summarise( - pile_up_summary, - on.off.target = paste(sort(unique(on.off.target)), collapse = ";"), - paired.algn = paste(sort(unique(paired.algn)), collapse = ";"), - count = sum(count), - algns 
= sum(contrib) - ) - -} - -pile_up_summary <- dplyr::ungroup(pile_up_summary) %>% - dplyr::arrange(specimen, desc(algns)) %>% - as.data.frame() - - # Generate stats if requested ---- ## If requested, generate stats from the analysis for qc. if( args$stat != FALSE ){ - - stat_summary <- function(x, y){ - - x %>% - dplyr::mutate(metric = y) %>% - dplyr::group_by(sampleName, metric) %>% - dplyr::summarize(count = sum(contrib)) %>% - dplyr::ungroup() - - } - - total_stat <- stat_summary(algnmts, "total.algns") - combined_stat <- stat_summary(probable_algns, "combined.algns") - pileup_stat <- stat_summary(pile_up_algns, "pileup.algns") - paired_stat <- stat_summary(paired_algns, "paired.algns") - matched_stat <- stat_summary(matched_algns, "matched.algns") - - on_tar_stat <- dplyr::filter( - matched_algns, on.off.target == "On-target" - ) %>% - stat_summary("ontarget.algns") - - off_tar_stat <- dplyr::filter( - matched_algns, on.off.target == "Off-target" - ) %>% - stat_summary("offtarget.algns") - - stat <- dplyr::bind_rows( - total_stat, combined_stat, pileup_stat, paired_stat, - matched_stat, on_tar_stat, off_tar_stat) + + stat <- reads %>% + dplyr::group_by(sampleName) %>% + dplyr::summarise( + reads = dplyr::n_distinct(ID), + aligns = dplyr::n_distinct(seqnames, start, end, strand), + loci = dplyr::n_distinct( + seqnames, strand, ifelse(strand == "+", start, end) + ) + ) %>% + tidyr::gather(key = "type", value = "value", -sampleName) write.table( x = stat, file = args$stat, @@ -980,23 +370,14 @@ if( args$stat != FALSE ){ } -# Output data composition ---- -## rds file that can be read into reports or loaded into a database with some -## additional scripting. 
-data_comp <- list( - "algnmts" = algnmts, - "probable_algns" = probable_algns, - "matched_algns" = matched_algns, - "matched_summary" = matched_summary, - "paired_algns" = paired_algns, - "paired_regions" = paired_regions, - "pile_up_algns" = pile_up_algns, - "pile_up_summary" = pile_up_summary -) - -saveRDS(data_comp, file = args$output) +# Output data ---- +## rds file that can be read into evaluation or reports or loaded into a +## database with some additional scripting. +reads %>% + dplyr::select(-lociPairKey, -readPairKey) %>% + saveRDS(file = args$output) -if( file.exists(args$output) ){ +if( all(sapply(output_files, file.exists)) ){ message("Successfully completed script.") }else{ stop("Check output, it not detected after assimilating.") diff --git a/tools/rscripts/check_test_accuracy.R b/tools/rscripts/check_test_accuracy.R index 24927b84..4138accc 100644 --- a/tools/rscripts/check_test_accuracy.R +++ b/tools/rscripts/check_test_accuracy.R @@ -162,8 +162,8 @@ sample_info <- readFile(run_config$Sample_Info, args$iguide_dir) # Files to check ---- check_files <- paste0( - "analysis/", run_config$Run_Name, "/output/unique_sites.", - run_config$Run_Name, ".csv.gz" + "analysis/", run_config$Run_Name, "/output/incorp_sites.", + run_config$Run_Name, ".rds" ) check_data <- lapply(check_files, readFile, root = args$iguide_dir) diff --git a/tools/rscripts/collect_stats.R b/tools/rscripts/collect_stats.R index edc56016..d6e48090 100644 --- a/tools/rscripts/collect_stats.R +++ b/tools/rscripts/collect_stats.R @@ -9,7 +9,8 @@ parser <- argparse::ArgumentParser( ) parser$add_argument( - "dir", nargs = 1, type = "character", help = "Directory with *.stat files." + "files", nargs = "+", type = "character", + help = "Paths to stat containing files (long, csv format). 
" ) parser$add_argument( @@ -20,19 +21,27 @@ parser$add_argument( args <- parser$parse_args(commandArgs(trailingOnly = TRUE)) # Manipulate file paths to determine stat types -all_files <- list.files(path = args$dir, pattern = "*.stat") -file_names <- stringr::str_extract(all_files, "[\\w\\.\\-\\_]+$") +files_present <- sapply(args$files, file.exists) + +if( !all(files_present) ){ + stop( + "\n Cannot find the following files: ", + paste(args$files[!files_present], collapse = "\n ") + ) +} + +file_names <- stringr::str_extract(args$files, "[\\w\\.\\-\\_]+$") file_types <- sub("[\\w\\-\\_]+.", "", file_names, perl = TRUE) file_types <- sub(".stat", "", file_types) # Read in data in a long format long_data <- dplyr::bind_rows( lapply( - structure(all_files, names = file_types), - function(file, dir){ + structure(args$files, names = file_types), + function(file){ x <- try( - expr = read.csv(file = file.path(dir, file), header = FALSE), + expr = read.csv(file = file, header = FALSE), silent = TRUE ) @@ -55,10 +64,10 @@ long_data <- dplyr::bind_rows( } - }, - dir = args$dir + } ), - .id = "type") + .id = "type" +) # Transform data into a wide format wide_data <- dplyr::mutate( diff --git a/tools/rscripts/couple.R b/tools/rscripts/couple.R index e3e27555..aec4a077 100644 --- a/tools/rscripts/couple.R +++ b/tools/rscripts/couple.R @@ -7,7 +7,7 @@ #' Names with ".": arguments / options for functions #!/usr/bin/env Rscript -options(stringsAsFactors = FALSE, scipen = 99, width = 999) +options(stringsAsFactors = FALSE, scipen = 99, width = 120) code_dir <- dirname(sub( pattern = "--file=", @@ -298,7 +298,7 @@ if( length(args$keys) > 1 ){ # Print beginning of keys printHead( keys, - title = "Beginning of Key for relating reads to sequences.", + title = "Beginning of Key for relating reads to sequences", caption = paste0( "\tReads: ", length(unique(keys$readNames)), "\n\tUnique Pairings: ", length(unique(keys$readPairKey)) @@ -338,8 +338,8 @@ if( length(args$keys) > 1 ){ keys, 
title = "Beginning of Key for relating reads to sequences.", caption = paste0( - "\tReads: ", length(unique(keys$readNames)), - "\n\tUnique Pairings: ", length(unique(keys$readPairKey)) + "\n Reads : ", format(length(unique(keys$readNames)), big.mark = ","), + "\n Unique Pairings: ", format(length(unique(keys$readPairKey)), big.mark = ",") ) ) @@ -395,10 +395,10 @@ if( is.null(args$keys) ){ # Print beginning of keys printHead( keys, - title = "Beginning of Key for relating reads to sequences.", + title = "Beginning of Key for relating reads to sequences", caption = paste0( - "\tReads: ", length(unique(keys$readNames)), - "\n\tUnique Pairings: ", length(unique(keys$readPairKey)) + "\n Reads :", format(length(unique(keys$readNames)), big.mark = ","), + "\n Unique Pairings:", format(length(unique(keys$readPairKey)), big.mark = ",") ) ) @@ -475,7 +475,7 @@ adrift_hits$adriftKey <- match(adrift_hits$qName, levels(keys$adriftSeqID)) # Info after quality filtering individual alignments. printHead( anchor_hits, - title = "Head of filtered anchor alignments.", + title = "Head of filtered anchor alignments", caption = sprintf( "Alignments: %1$s from %2$s reads", length(anchor_hits), @@ -485,7 +485,7 @@ printHead( printHead( adrift_hits, - title = "Head of filtered adrift alignments.", + title = "Head of filtered adrift alignments", caption = sprintf( "Alignments: %1$s from %2$s reads", length(adrift_hits), @@ -543,9 +543,6 @@ pairs <- GenomicRanges::findOverlaps( ignore.strand = TRUE ) -anchor_loci <- red_anchor_hits[S4Vectors::queryHits(pairs)] -adrift_loci <- red_adrift_hits[S4Vectors::subjectHits(pairs)] - #Stop if no alignments coupled based on criteria. 
if( length(pairs) == 0 ){ @@ -556,11 +553,21 @@ if( length(pairs) == 0 ){ } # Check isDownstream and isOppositeStrand -adrift_loci_starts <- GenomicRanges::start(adrift_loci) -anchor_loci_starts <- GenomicRanges::start(anchor_loci) +adrift_loci_starts <- GenomicRanges::start(red_adrift_hits)[ + S4Vectors::subjectHits(pairs) +] -adrift_loci_strand <- GenomicRanges::strand(adrift_loci) -anchor_loci_strand <- GenomicRanges::strand(anchor_loci) +anchor_loci_starts <- GenomicRanges::start(red_anchor_hits)[ + S4Vectors::queryHits(pairs) +] + +adrift_loci_strand <- GenomicRanges::strand(red_adrift_hits)[ + S4Vectors::subjectHits(pairs) +] + +anchor_loci_strand <- GenomicRanges::strand(red_anchor_hits)[ + S4Vectors::queryHits(pairs) +] keep_loci <- ifelse( anchor_loci_strand == "+", @@ -578,11 +585,10 @@ keep_loci <- as.vector( (keep_loci & anchor_loci_strand != "*") & (adrift_loci_strand != "*") ) -anchor_loci <- anchor_loci[keep_loci] -adrift_loci <- adrift_loci[keep_loci] +pairs <- pairs[keep_loci] # Stop if no loci were properly paired -if( length(anchor_loci) == 0 | length(adrift_loci) == 0 ){ +if( length(pairs) == 0 ){ cat("\nNo genomic loci from alignments were properly paired.\n") writeNullOutput(args) @@ -599,172 +605,160 @@ if( length(anchor_loci) == 0 | length(adrift_loci) == 0 ){ #' work as anticipated with IRanges, and therefore objects were moved to GRanges #' and GRangesLists. 
loci_key <- data.frame( - "anchorLoci" = S4Vectors::queryHits(pairs)[keep_loci], - "adriftLoci" = S4Vectors::subjectHits(pairs)[keep_loci] + "anchorLoci" = S4Vectors::queryHits(pairs), + "adriftLoci" = S4Vectors::subjectHits(pairs) ) loci_key$lociPairKey <- paste0(loci_key$anchorLoci, ":", loci_key$adriftLoci) -loci_key$anchorKey <- IRanges::IntegerList( - split( - anchor_hits$anchorKey[unlist(anchor_loci$revmap)], - S4Vectors::Rle( - values = seq_along(anchor_loci), - lengths = width(anchor_loci$revmap@partitioning) - ) - ) -) +# Append *Loci ids to the anchor and adrift alignments +idx_passing_anchors <- unlist(red_anchor_hits$revmap[ + unique(loci_key$anchorLoci) +]) -loci_key$adriftKey <- IRanges::IntegerList( - split( - adrift_hits$adriftKey[unlist(adrift_loci$revmap)], - S4Vectors::Rle( - values = seq_along(adrift_loci), - lengths = width(adrift_loci$revmap@partitioning) - ) - ) -) +anchor_hits$anchorLoci <- NA +anchor_hits$anchorLoci[idx_passing_anchors] <- as.numeric(S4Vectors::Rle( + values = unique(loci_key$anchorLoci), + lengths = lengths(red_anchor_hits$revmap[unique(loci_key$anchorLoci)]) +)) -anchor_readPair_hits <- GenomicRanges::findOverlaps( - query = split( - x = GenomicRanges::GRanges( - seqnames = "int", - ranges = IRanges::IRanges(start = unlist(loci_key$anchorKey), width = 1), - strand = "*" - ), - f = S4Vectors::Rle( - values = seq_len(nrow(loci_key)), - lengths = S4Vectors::width(loci_key$anchorKey@partitioning) - ) - ), - subject = GenomicRanges::GRanges( - seqnames = "int", - ranges = IRanges::IRanges(start = unique_key_pairs$anchorKey, width = 1), - strand = "*") -) +idx_passing_adrifts <- unlist(red_adrift_hits$revmap[ + unique(loci_key$adriftLoci) +]) -loci_key$anchorReadPairs <- IRanges::IntegerList( - split( - x = S4Vectors::subjectHits(anchor_readPair_hits), - f = S4Vectors::queryHits(anchor_readPair_hits) - ) -) +adrift_hits$adriftLoci <- NA +adrift_hits$adriftLoci[idx_passing_adrifts] <- as.numeric(S4Vectors::Rle( + values = 
unique(loci_key$adriftLoci), + lengths = lengths(red_adrift_hits$revmap[unique(loci_key$adriftLoci)]) +)) -adrift_readPair_hits <- GenomicRanges::findOverlaps( - query = split( - x = GenomicRanges::GRanges( - seqnames = "int", - ranges = IRanges::IRanges(start = unlist(loci_key$adriftKey), width = 1), - strand = "*" - ), - f = S4Vectors::Rle( - values = seq_len(nrow(loci_key)), - lengths = width(loci_key$adriftKey@partitioning) - ) - ), - subject = GenomicRanges::GRanges( - seqnames = "int", - ranges = IRanges::IRanges(start = unique_key_pairs$adriftKey, width = 1), - strand = "*") +# Join the loci idx information up to the keys file +# Identify aligning keys +aligned_anchor_keys <- unique( + anchor_hits$anchorKey[!is.na(anchor_hits$anchorLoci)] ) -loci_key$adriftReadPairs <- IRanges::IntegerList( - split( - x = S4Vectors::subjectHits(adrift_readPair_hits), - f = S4Vectors::queryHits(adrift_readPair_hits) - ) +aligned_adrift_keys <- unique( + adrift_hits$adriftKey[!is.na(adrift_hits$adriftLoci)] ) -#' Determine intersecting readPairs between the anchor and adrift loci. 
-anchor_readPair_indices <- GenomicRanges::GRanges( - seqnames = S4Vectors::queryHits(anchor_readPair_hits), - ranges = IRanges::IRanges( - start = S4Vectors::subjectHits(anchor_readPair_hits), width = 1 - ), - strand = "*" +# Construct an anchor/adrift key to loci IntegerList with indices +anchor_key_to_loci <- with( + as.data.frame(anchor_hits)[ + anchor_hits$anchorKey %in% aligned_anchor_keys & + !is.na(anchor_hits$anchorLoci), + c("anchorKey", "anchorLoci") + ], + IRanges::IntegerList(split(anchorLoci, anchorKey)) ) -adrift_readPair_indices <- GenomicRanges::GRanges( - seqnames = S4Vectors::queryHits(adrift_readPair_hits), - ranges = IRanges::IRanges( - start = S4Vectors::subjectHits(adrift_readPair_hits), width = 1 - ), - strand = "*" +adrift_key_to_loci <- with( + as.data.frame(adrift_hits)[ + adrift_hits$adriftKey %in% aligned_adrift_keys & + !is.na(adrift_hits$adriftLoci), + c("adriftKey", "adriftLoci") + ], + IRanges::IntegerList(split(adriftLoci, adriftKey)) ) -intersecting_readPairs <- GenomicRanges::findOverlaps( - query = anchor_readPair_indices, - subject = adrift_readPair_indices -) +# Construct readPairKey to lociKey object +unique_read_pair_keys <- unique(keys$readPairKey) -#' Using the range information from the filtered paired alignments, the code -#' constructs a GRanges object from the anchor_loci and adrift_loci. anchor_loci -#' are the integration site positions while the adrift_loci are the various -#' breakpoints. The strand of the range is set to the same strand as the -#' anchor_loci since the direction of sequencing from the viral or vector genome -#' is from the U5-host junction found at the 3' end of the integrated element. 
-paired_loci_key <- loci_key[ - as.integer(unique(GenomicRanges::seqnames( - anchor_readPair_indices[S4Vectors::queryHits(intersecting_readPairs)] - ))), +unique_read_pair_keys <- unique_read_pair_keys[ + stringr::str_extract(unique_read_pair_keys, "[\\d]+") %in% names(anchor_key_to_loci) & + stringr::str_extract(unique_read_pair_keys, "[\\d]+$") %in% names(adrift_key_to_loci) ] -paired_loci_key$readPairKeys <- IRanges::CharacterList( - split( - x = unique_key_pairs$readPairKey[ - GenomicRanges::start(anchor_readPair_indices[ - S4Vectors::queryHits(intersecting_readPairs) - ]) - ], - f = as.integer(GenomicRanges::seqnames( - anchor_readPair_indices[S4Vectors::queryHits(intersecting_readPairs)] - )) +loci_key_anchor_idx <- IRanges::IntegerList(split( + seq_along(loci_key$anchorLoci), loci_key$anchorLoci +)) + +loci_key_adrift_idx <- IRanges::IntegerList(split( + seq_along(loci_key$adriftLoci), loci_key$adriftLoci +)) + +# Time sink -- warning +rpk_anchor_loci_idx <- IRanges::IntegerList(lapply( + anchor_key_to_loci[stringr::str_extract(unique_read_pair_keys, "[\\d]+")], + function(x) unlist(loci_key_anchor_idx[as.character(x)], use.names = FALSE) +)) + +# Time sink -- warning +rpk_adrift_loci_idx <- IRanges::IntegerList(lapply( + adrift_key_to_loci[stringr::str_extract(unique_read_pair_keys, "[\\d]+$")], + function(x) unlist(loci_key_adrift_idx[as.character(x)], use.names = FALSE) +)) + +rpk_loci_idx <- IRanges::intersect(rpk_anchor_loci_idx, rpk_adrift_loci_idx) +names(rpk_loci_idx) <- unique_read_pair_keys + +rpk_loci_key <- IRanges::CharacterList(split( + loci_key$lociPairKey[unlist(rpk_loci_idx)], S4Vectors::Rle( + values = names(rpk_loci_idx), lengths = lengths(rpk_loci_idx) ) -) +)) -select_anchor_loci <- anchor_loci[ - as.integer(unique(GenomicRanges::seqnames( - anchor_readPair_indices[S4Vectors::queryHits(intersecting_readPairs)] - ))) -] +gc() -select_adrift_loci <- adrift_loci[ - as.integer(unique(GenomicRanges::seqnames( - 
adrift_readPair_indices[S4Vectors::subjectHits(intersecting_readPairs)] - ))) +# Group readPairKeys into unique, mulithit, or artifactual chimeras +unique_rpks <- names(rpk_loci_key)[lengths(rpk_loci_key) == 1] +multihit_rpks <- names(rpk_loci_key)[lengths(rpk_loci_key) > 1] +chimera_rpks <- keys$readPairKey[ + !keys$readPairKey %in% c(unique_rpks, multihit_rpks) ] -paired_loci <- GenomicRanges::GRanges( - seqnames = GenomicRanges::seqnames(select_anchor_loci), +cat( + "\nUnique sequences associated with types of alignments:\n", + " unique alignments : ", format(length(unique_rpks), big.mark = ","), "\n", + " multihit alignments: ", format(length(multihit_rpks), big.mark = ","), "\n", + " chimera artifacts : ", format(length(chimera_rpks), big.mark = ","), "\n" +) + +# Couple together the anchor and adrift loci for expanding rpks-loci +# Using the range information from the filtered paired alignments, the code +# constructs a GRanges object from the anchor_loci and adrift_loci. Anchor_loci +# are the integration site positions while the adrift_loci are the various +# breakpoints. The strand of the range is set to the same strand as the +# anchor_loci since the direction of sequencing is considered to be from the +# host-junction found at the 3' end of the integrated element. 
+ +coupled_loci <- GenomicRanges::GRanges( + seqnames = GenomicRanges::seqnames(red_anchor_hits)[loci_key$anchorLoci], ranges = IRanges::IRanges( start = ifelse( - GenomicRanges::strand(select_anchor_loci) == "+", - GenomicRanges::start(select_anchor_loci), - GenomicRanges::start(select_adrift_loci)), + GenomicRanges::strand(red_anchor_hits[loci_key$anchorLoci]) == "+", + GenomicRanges::start(red_anchor_hits)[loci_key$anchorLoci], + GenomicRanges::start(red_adrift_hits)[loci_key$adriftLoci] + ), end = ifelse( - GenomicRanges::strand(select_anchor_loci) == "+", - GenomicRanges::end(select_adrift_loci), - GenomicRanges::end(select_anchor_loci))), - strand = GenomicRanges::strand(select_anchor_loci), - lociPairKey = paired_loci_key$lociPairKey + GenomicRanges::strand(red_anchor_hits[loci_key$anchorLoci]) == "+", + GenomicRanges::start(red_adrift_hits)[loci_key$adriftLoci], + GenomicRanges::start(red_anchor_hits)[loci_key$anchorLoci] + ) + ), + strand = GenomicRanges::strand(red_anchor_hits[loci_key$anchorLoci]), + seqinfo = GenomeInfoDb::seqinfo(ref_genome), + lociPairKey = loci_key$lociPairKey ) -#' Information on valid paired alignments from all sequences present. +#' Information on valid coupled alignments from all sequences present. 
+ printHead( - paired_loci, - title = "Head of valid paired loci present in the data.", - caption = sprintf("Genomic loci: %s", length(paired_loci)) + sort(coupled_loci[sample.int( + length(coupled_loci), + size = min(6, length(coupled_loci)), + replace = FALSE + )]), + title = "Randomly sampled coupled loci present in the data.", + caption = sprintf("Genomic loci: %s", length(coupled_loci)) ) -#' Add readPairKeys to the paired loci for expansion -paired_loci$readPairKeys <- paired_loci_key$readPairKeys - -#' Stop if there are no paired_loci -if( length(paired_loci) == 0 ){ +#' Stop if there are no coupled_loci +if( length(coupled_loci) == 0 ){ cat( - "\nNo valid paired genomic loci were found within", + "\nNo valid coupled genomic loci were found within", "the data given input criteria.\n" ) writeNullOutput(args) @@ -772,33 +766,16 @@ if( length(paired_loci) == 0 ){ } -#' Expand readPairKeys and lociPairKeys to make a single object that maps loci -#' to unique sequences. This is analogous to a sparse matrix, but in a -#' data.frame object. The keys object is still needed to jump from readPairKey -#' to readName. -read_loci_mat <- data.frame( - "lociPairKey" = S4Vectors::Rle( - values = paired_loci$lociPairKey, - lengths = S4Vectors::width(paired_loci$readPairKeys@partitioning)), - "readPairKey" = unlist(paired_loci$readPairKeys) -) - -#' Templates aligning to single loci are termed unique, while templates -#' aligning to multiple loci are termed multihits. -read_pair_counts <- table(read_loci_mat$readPairKey) -uniq_read_pairs <- names(read_pair_counts[read_pair_counts == 1]) -multihit_read_pairs <- names(read_pair_counts[read_pair_counts > 1]) - #' Bin reads that would map to different loci on the same read (chimeras) -#' All unique and multihit templates were mapped successfully to -#' genomic loci, yet some templates were sequenced but did not make it through -#' the selection criteria. 
These template either do not have alignments to the +#' All unique and multihit templates are mapped successfully to +#' genomic loci, yet some templates are sequenced but do not make it through +#' the selection criteria. These templates either do not have alignments to the #' reference genome (anchor or adrift did not align) or map to two distant #' genomic loci. The latter are termed chimeras and are considered to be #' artifacts of PCR amplification. if( !is.null(args$chimeras) ){ - failed_reads <- keys[!keys$readPairKey %in% read_loci_mat$readPairKey,] + failed_reads <- keys[keys$readPairKey %in% chimera_rpks,] chimera_reads <- failed_reads[ failed_reads$anchorKey %in% anchor_hits$anchorKey & @@ -854,32 +831,28 @@ if( !is.null(args$chimeras) ){ chimeraData <- list( "read_info" = chimera_reads, - "alignments" = chimera_alignments + "alignments" = chimera_alignments, + "failed_reads" = failed_reads ) writeOutputFile(chimeraData, file = args$chimeras, format = "rds") } -#' Expand out reads uniquely mapped reads or unique sites +#' Expand out uniquely mapped reads or unique sites #' Below, the paired_loci object is expanded to create the genomic alignments #' for each read that mapped to a single genomic loci. This data is then #' recorded in two formats. "allSites" is a GRanges object where each row is a #' single read, while "sites.final" is a condensed form of the data where each #' row is a unique integration site with the width of the range refering to #' the longest template aligned to the reference genome. 
-uniq_read_loci_mat <- read_loci_mat[ - read_loci_mat$readPairKey %in% uniq_read_pairs, +uniq_templates <- coupled_loci[ + match(unlist(rpk_loci_key[unique_rpks]), coupled_loci$lociPairKey) ] -uniq_templates <- paired_loci[ - match(uniq_read_loci_mat$lociPairKey, paired_loci$lociPairKey) -] - -uniq_templates$readPairKeys <- NULL -uniq_templates$readPairKey <- uniq_read_loci_mat$readPairKey +uniq_templates$readPairKey <- unique_rpks -uniq_keys <- keys[keys$readPairKey %in% uniq_read_pairs,] +uniq_keys <- keys[keys$readPairKey %in% unique_rpks,] uniq_reads <- uniq_templates[ match(uniq_keys$readPairKey, uniq_templates$readPairKey) @@ -903,7 +876,7 @@ writeOutputFile(uniq_sites, file = args$uniqOutput) # Print out head of uniq_sites for reference. printHead( uniq_sites, - title = "Head of uniquely mapped genomic loci.", + title = "Head of uniquely mapped genomic loci", caption = sprintf( paste( "Alignments yeilded %1$s unique anchor sites from %2$s", @@ -944,7 +917,7 @@ if( !is.null(args$condSites) ){ printHead( cond_sites, - title = "Head of unique anchor sites.", + title = "Head of unique anchor sites", caption = sprintf( paste( "There were %1$s unique anchor sites identified with a total", @@ -958,57 +931,44 @@ if( !is.null(args$condSites) ){ } -#' Clean up environment for expansion and clustering of multihits -#rm(uniq_read_loci_mat, uniq_templates, uniq_keys, -# uniq_reads, uniq_sites, ) -#gc() - -#' Group and characterize multihits -#' Multihits are reads that align to multiple locations in the reference -#' genome. There are bound to always be a certain proportion of reads aligning -#' to repeated sequence due to the high level degree of repeated DNA elements -#' within genomes. The final object generated, "multihitData", is a list of -#' three objects. "unclustered_multihits" is a GRanges object where every -#' alignment for every multihit read is present in rows. 
-#' "clustered_multihit_positions" returns all the possible integration site -#' positions for the multihit. Lastly, "clustered_multihit_lengths" contains the -#' length of the templates mapping to the multihit clusters, used for -#' abundance calculations. +# Clean up environment for expansion and clustering of multihits + +# Group and characterize multihits +# Multihits are reads that align to multiple locations in the reference +# genome. There are bound to always be a certain proportion of reads aligning +# to repeated sequence due to the high level degree of repeated DNA elements +# within genomes. The final object generated, "multihitData", is a list of +# three objects. "unclustered_multihits" is a GRanges object where every +# alignment for every multihit read is present in rows. +# "clustered_multihit_positions" returns all the possible integration site +# positions for the multihit. Lastly, "clustered_multihit_lengths" contains the +# length of the templates mapping to the multihit clusters, used for +# abundance calculations. 
if( !is.null(args$multihits) ){ unclustered_multihits <- GenomicRanges::GRanges() clustered_multihit_positions <- GenomicRanges::GRangesList() clustered_multihit_lengths <- list() - if( length(multihit_read_pairs) > 0 ){ + if( length(multihit_rpks) > 0 ){ #' Only consider readPairKeys that aligned to multiple genomic loci - multi_read_loci_mat <- read_loci_mat[ - read_loci_mat$readPairKey %in% multihit_read_pairs, + multihit_templates <- coupled_loci[ + coupled_loci$lociPairKey %in% unlist(rpk_loci_key[multihit_rpks]) ] - multihit_templates <- paired_loci[ - paired_loci$lociPairKey %in% multi_read_loci_mat$lociPairKey + multihit_templates <- multihit_templates[ + match(unlist(rpk_loci_key[multihit_rpks]), multihit_templates$lociPairKey) ] - multihit_expansion_map <- multihit_templates$readPairKeys - multihit_templates$readPairKeys <- NULL - - multihit_templates <- multihit_templates[S4Vectors::Rle( - values = seq_along(multihit_templates), - lengths = S4Vectors::lengths(multihit_expansion_map) - )] - - multihit_templates$readPairKey <- unlist(multihit_expansion_map) + multihit_templates$readPairKey <- as.character(S4Vectors::Rle( + values = multihit_rpks, lengths = lengths(rpk_loci_key[multihit_rpks]) + )) - #' As the loci are expanded from the paired_loci object, unique templates + #' As the loci are expanded from the coupled_loci object, unique templates #' and readPairKeys are present in the readPairKeys unlisted from the #' paired_loci object. 
- multihit_templates <- multihit_templates[ - multihit_templates$readPairKey %in% multi_read_loci_mat$readPairKey - ] - - multihit_keys <- keys[keys$readPairKey %in% multihit_read_pairs,] + multihit_keys <- keys[keys$readPairKey %in% multihit_rpks,] multihit_keys$sampleName <- stringr::str_extract( string = as.character(multihit_keys$anchorSeqID), pattern = "^[\\w-]+" @@ -1038,10 +998,8 @@ if( !is.null(args$multihits) ){ revmap <- multihits_red$revmap axil_nodes <- as.character(S4Vectors::Rle( - values = multihit_templates$readPairKey[ - S4Vectors::start(revmap@partitioning) - ], - lengths = S4Vectors::width(revmap@partitioning) + values = multihit_templates$readPairKey[min(revmap)], + lengths = lengths(revmap) )) nodes <- multihit_templates$readPairKey[unlist(revmap)] @@ -1153,13 +1111,15 @@ if( !is.null(args$multihits) ){ writeOutputFile(multihitData, file = args$multihits, format = "rds") - print( + printHead( data.frame( "multihit_reads" = length(unique(names(unclustered_multihits))), "multihit_alignments" = length(unique(unclustered_multihits)), "multihit_clusters" = length(clustered_multihit_positions), "multihit_lengths" = sum(lengths(clustered_multihit_lengths)) - ) + ), + title = "Multihit metrics", + caption = "Metrics highlighting the observation of multiple aligning reads." ) if( args$stat != FALSE ){ diff --git a/tools/rscripts/evaluate_incorp_data.R b/tools/rscripts/evaluate_incorp_data.R index a3f1ce82..6e5819ac 100644 --- a/tools/rscripts/evaluate_incorp_data.R +++ b/tools/rscripts/evaluate_incorp_data.R @@ -40,6 +40,14 @@ parser$add_argument( ) +parser$add_argument( + "--stat", nargs = 1, type = "character", default = FALSE, + help = paste( + "File name to be written in output directory of read counts for each", + "sample. CSV file format. i.e. test.stat.csv." + ) +) + parser$add_argument( "-q", "--quiet", action = "store_true", help = "Hide standard output messages.
@@ -131,6 +139,7 @@ code_dir <- dirname(sub( )) source(file.path(code_dir, "supporting_scripts/iguide_support.R")) +source(file.path(code_dir, "supporting_scripts/nucleotideScoringMatrices.R")) # Import metadata and consolidate objects ---- @@ -252,18 +261,50 @@ special_genes <- suppressMessages(loadRefFiles( root = root_dir )) -umitag_option <- all(unlist(lapply(configs, "[[", "UMItags"))) +submat <- banmat() -upstream_dist <- unique(sapply(configs, function(x) x$upstreamDist)) -downstream_dist <- unique(sapply(configs, function(x) x$downstreamDist)) +## Determine processing parameters +## Some parameters will need to be an "all or nothing" approach, including: +## - UMItags +## - recoverMultihits +## Depending on these parameters others (upstream/downstream_dist, ...) may need +## to be consistent between runs otherwise, the primary config file (first one), +## will be used for parameterization. -if( length(upstream_dist) > 1 | length(downstream_dist) > 1 ){ - stop( - "\n Inconsistant upstream or downstream distances between config files.\n", - " Comparisons between groups with different run specific criteria\n", - " is not recommended.\n") +umitag_option <- all(unlist(lapply(configs, "[[", "UMItags"))) +multihit_option <- all(unlist(lapply(configs, "[[", "recoverMultihits"))) + +if( multihit_option ){ + + upstream_dist <- unique(sapply(configs, function(x) x$upstreamDist)) + downstream_dist <- unique(sapply(configs, function(x) x$downstreamDist)) + pile_up_min <- unique(sapply(configs, function(x) x$pileUpMin)) + + if( + length(upstream_dist) > 1 | + length(downstream_dist) > 1 | + length(pile_up_min) > 1 + ){ + + stop( + "\n Inconsistant upstream or downstream distances between config files.\n", + " Comparisons between groups with different run specific criteria\n", + " is not recommended when considering the recover multihit option.\n" + ) + + } + +}else{ + + upstream_dist <- configs[[1]]$upstreamDist + downstream_dist <- configs[[1]]$downstreamDist + 
pile_up_min <- configs[[1]]$pileUpMin + } +max_target_mismatch <- configs[[1]]$maxTargetMismatch + + ## Combine sampleInfo files sample_info <- dplyr::bind_rows(lapply( @@ -310,8 +351,7 @@ if( length(args$support) > 0 ){ stop("\n Cannot find supporting data file: ", args$support, ".\n") } - supp_data <- data.table::fread(support_path, data.table = FALSE) %>% - dplyr::mutate(run_set = "supp_data") + supp_data <- data.table::fread(support_path, data.table = FALSE) specimen_levels <- supp_data$specimen[supp_data$specimen %in% specimen_levels] @@ -332,21 +372,17 @@ if( length(args$support) > 0 ){ ## Identify on-target edit sites from config files -on_targets <- unlist(lapply(configs, "[[", "On_Target_Sites")) -names(on_targets) <- stringr::str_replace( - string = names(on_targets), pattern = stringr::fixed("."), replacement = ":" -) - -names(on_targets) <- stringr::str_extract( - string = names(on_targets), - pattern = "[\\w\\_\\-\\'\\.]+$" +on_targets <- unlist(lapply(configs, "[[", "On_Target_Sites")) %>% + data.frame(id = names(.), target = ., row.names = NULL) %>% + dplyr::mutate( + id = stringr::str_replace( + string = id, pattern = stringr::fixed("."), replacement = ":" + ), + id = stringr::str_extract(string = id, pattern = "[\\w\\_\\-\\'\\.]+$"), + id = stringr::str_extract(string = id, pattern = "[\\w\\_\\-\\.]+") ) %>% - stringr::str_extract(pattern = "[\\w\\_\\-\\.]+") - -on_targets <- structure( - unique(on_targets), - names = names(on_targets)[match(unique(on_targets), on_targets)] -) + dplyr::distinct() %$% + structure(target, names = id) ## Treatment across runs @@ -491,20 +527,20 @@ target_combn <- structure( ) combn_tbl <- data.frame( - run_set = nuclease_treaments$run_set[ - as.vector(match( - S4Vectors::Rle(names(target_combn), lengths(target_combn)), - nuclease_treaments$specimen - )) - ], - nuclease = nuclease_treaments$nuclease[ - as.vector(match( - S4Vectors::Rle(names(target_combn), lengths(target_combn)), - nuclease_treaments$specimen - )) - ], 
- target = unlist(target_combn), - row.names = NULL + run_set = nuclease_treaments$run_set[ + as.vector(S4Vectors::match( + S4Vectors::Rle(names(target_combn), lengths(target_combn)), + nuclease_treaments$specimen + )) + ], + nuclease = nuclease_treaments$nuclease[ + as.vector(S4Vectors::match( + S4Vectors::Rle(names(target_combn), lengths(target_combn)), + nuclease_treaments$specimen + )) + ], + target = unlist(target_combn), + row.names = NULL ) %>% dplyr::filter(target != "Mock") %>% dplyr::distinct() @@ -532,7 +568,6 @@ target_seqs_df <- data.frame( sequence = as.character(unlist(target_seqs)) ) - ## Identify PAM sequences associated with nucleases pam_seqs <- do.call(c, lapply(configs, function(x){ toupper(unlist(lapply(x$Nuclease_Profiles, "[[", "PAM"))) @@ -557,12 +592,22 @@ pam_seqs_df <- data.frame( considered_target_seqs <- unique(unlist(treatment)) considered_nucleases <- unique(unlist(nuclease)) +on_targets <- on_targets[names(on_targets) %in% considered_target_seqs] + target_tbl <- combn_tbl %>% dplyr::left_join(target_seqs_df, by = c("run_set", "target")) %>% dplyr::left_join(pam_seqs_df, by = c("run_set", "nuclease")) %>% dplyr::filter( target %in% considered_target_seqs & nuclease %in% considered_nucleases - ) + ) + +uniq_target_df <- target_tbl %>% + dplyr::distinct(target, sequence, PAM) + +uniq_target_seqs <- Biostrings::DNAStringSet( + structure(uniq_target_df$sequence, names = uniq_target_df$target), + use.names = TRUE +) ### Log combination treatment table if( !args$quiet ){ @@ -575,7 +620,8 @@ if( !args$quiet ){ if( is.null(args$support) ){ spec_overview <- treatment_df }else{ - spec_overview <- supp_data + spec_overview <- supp_data %>% + dplyr::mutate(run_set = "supp_data") } cond_overview <- spec_overview %>% @@ -590,7 +636,10 @@ cond_overview <- spec_overview %>% dplyr::select(specimen, condition) -## Read in experimental data and contatenate different sets ---- +# Beginnin analysis ---- +if( !args$quiet ) cat("\nStarting analysis...\n") 
+ +## Read in experimental data and contatenate different sets input_data_paths <- lapply(configs, function(x){ name <- x$Run_Name file.path( @@ -609,30 +658,466 @@ input_data <- lapply(input_data_paths, function(x){ }) names(input_data) <- names(configs) -data_type_names <- names(input_data[[1]]) +input_data <- dplyr::bind_rows(input_data, .id = "run.set") %>% + dplyr::mutate( + specimen = stringr::str_extract(sampleName, pattern = "[\\w]+") + ) %>% + dplyr::filter(specimen %in% spec_overview$specimen) + +if( !multihit_option ){ + input_data <- dplyr::filter(input_data, type == "uniq") +} + -input_data <- lapply( - seq_along(input_data[[1]]), - function(i) dplyr::bind_rows(lapply(input_data, "[[", i), .id = "run.set") +## Format input alignments ---- +algnmts <- input_data + +## Determine abundance metrics, with or without UMItags +if( umitag_option ){ + + algnmts <- dplyr::arrange(algnmts, desc(contrib)) %>% + dplyr::group_by(seqnames, start, end, strand, specimen, sampleName) %>% + dplyr::summarise( + count = sum(contrib), + umitag = sum(as.integer(!duplicated(umitag[!is.na(umitag)])) * contrib), + contrib = max(contrib) + ) %>% + dplyr::ungroup() %>% + as.data.frame() + +}else{ + + algnmts <- dplyr::arrange(algnmts, desc(contrib)) %>% + dplyr::group_by(seqnames, start, end, strand, specimen, sampleName) %>% + dplyr::summarize( + count = sum(contrib), + contrib = max(contrib) + ) %>% + dplyr::ungroup() %>% + as.data.frame() + +} + +## Generate a sample table of the data for log purposes +sample_index <- ifelse(nrow(algnmts) > 10, 10, nrow(algnmts)) +sample_index <- sample(seq_len(nrow(algnmts)), sample_index, replace = FALSE) + +cat("\nSample of aligned templates:\n") + +print( + data.frame(algnmts[sample_index,]), + right = FALSE, + row.names = FALSE ) -names(input_data) <- data_type_names +cat(paste0("\nNumber of alignments: ", nrow(algnmts), "\n")) + +rm(sample_index) -input_data <- lapply( - input_data, - function(x) x[x$specimen %in% 
spec_overview$specimen,] +## Transform the data into a GRanges object +algnmts_gr <- GenomicRanges::GRanges( + seqnames = algnmts$seqnames, + ranges = IRanges::IRanges(start = algnmts$start, end = algnmts$end), + strand = algnmts$strand, + seqinfo = GenomeInfoDb::seqinfo(ref_genome) ) -## Updating on-target data if needed -on_targets <- on_targets[names(on_targets) %in% considered_target_seqs] +if( umitag_option ){ + + GenomicRanges::mcols(algnmts_gr) <- dplyr::select( + algnmts, specimen, sampleName, count, umitag, contrib + ) + +}else{ + + GenomicRanges::mcols(algnmts_gr) <- dplyr::select( + algnmts, specimen, sampleName, count, contrib + ) + +} +# Analyze alignments ---- +## Identify groups of alignments or pileups of aligned fragments +## These pileups give strong experimental evidence of directed incorporation of +## the dsODN into a region. Initially, pileups are identified and then checked +## for pairing, or if there is another pileup on the opposite strand in close +## proximity. 
+algnmts_gr$clus.ori <- pileupCluster( + gr = algnmts_gr, + grouping = "specimen", + maxgap = 0L, + return = "simple" +) + +algnmts_gr$paired.algn <- identifyPairedAlgnmts( + gr = algnmts_gr, + grouping = "specimen", + maxgap = upstream_dist * 2 +) + +algnmts_grl <- split(algnmts_gr, unlist(nuclease)[algnmts_gr$specimen]) + +annot_clust_info <- dplyr::bind_rows(lapply( + seq_along(algnmts_grl), + function(i, grl){ + + gr <- grl[[i]] + nuc <- names(grl)[i] + + if( !nuc %in% names(nuc_profiles) ){ + nuc_profile <- NULL + }else{ + nuc_profile <- nuc_profiles[[nuc]] + } + + ## Create a GRange with only the unique cluster origins + split_clus_id <- stringr::str_split( + string = unique(paste0(gr$specimen, ":", gr$clus.ori)), + pattern = ":", + simplify = TRUE + ) + + algn_clusters <- GenomicRanges::GRanges( + seqnames = split_clus_id[,2], + ranges = IRanges::IRanges( + start = as.numeric(split_clus_id[,4]), width = 1 + ), + strand = split_clus_id[,3], + seqinfo = GenomeInfoDb::seqinfo(ref_genome) + ) + + algn_clusters$specimen <- split_clus_id[,1] + algn_clusters$clus.ori <- vcollapse(split_clus_id[, 2:4], sep = ":") + + algn_clusters$clus.seq <- getSiteSeqs( + gr = algn_clusters, + upstream.flank = upstream_dist, + downstream.flank = downstream_dist, + ref.genome = ref_genome + ) + + ## Identify which target sequences binding near clusters + if( !is.null(nuc_profile) ){ + + algn_clusters <- compareTargetSeqs( + gr.with.sequences = algn_clusters, + seq.col = "clus.seq", + target.seqs = uniq_target_seqs, + tolerance = max_target_mismatch, + nuc.profile = nuc_profile, + submat = submat, + upstream.flank = upstream_dist, + downstream.flank = downstream_dist + ) + + }else{ + + algn_clusters$target.match <- "No_valid_match" + algn_clusters$target.mismatch <- NA + algn_clusters$target.score <- NA + algn_clusters$aligned.sequence <- NA + algn_clusters$edit.site <- NA + + } + + as.data.frame(GenomicRanges::mcols(algn_clusters)) + + }, + grl = algnmts_grl +)) + + +## Merge the 
target sequence alignment information from the clusters back to all +## unique alignments +algnmts <- as.data.frame(merge( + x = as.data.frame(algnmts_gr), + y = dplyr::select(annot_clust_info, -clus.seq), + by = c("specimen", "clus.ori") +)) + +## Change guideRNA.match to No_Valid_Match if an inappropriate gRNA is annotated +algnmts$target.match <- filterInappropriateComparisons( + guideRNA.match = algnmts$target.match, + specimen = algnmts$specimen, + treatment = treatment +) + +## Fragment pileups, paired clustering, and guideRNA alignments have been used +## to characterize the incorporation sites analyzed here. Each metric will be +## used to create a list of incorporation sites that may be nuclease cut sites. +## The following identifies which alignments are associated with each of these +## criteria. +tbl_clus_ori <- algnmts %>% + dplyr::group_by(specimen, clus.ori) %>% + dplyr::filter(n() >= pile_up_min) %>% + dplyr::ungroup() %$% + table(clus.ori) + +idx_clus_ori <- which(algnmts$clus.ori %in% names(tbl_clus_ori)) + +tbl_paired_algn <- algnmts %>% + dplyr::filter(!is.na(paired.algn)) %$% + table(paired.algn) + +idx_paired_algn <- which(algnmts$paired.algn %in% names(tbl_paired_algn)) + +idx_matched <- which(algnmts$target.match != "No_valid_match") + +idx_combined <- sort(unique(c(idx_clus_ori, idx_paired_algn, idx_matched))) + +idx_df <- data.frame( + "Type" = c("PileUp", "Paired", "gRNA_Matched", "Combined"), + "Counts" = sapply( + list(idx_clus_ori, idx_paired_algn, idx_matched, idx_combined), + length + ) +) + +cat("\nTable of uniquely aligned template counts:\n") +print(idx_df, right = FALSE, row.names = FALSE) +cat(paste0("\nTotal number of alignments: ", nrow(algnmts), "\n")) + +probable_algns <- algnmts[idx_combined,] + +probable_algns$on.off.target <- ifelse( + probable_algns$edit.site %in% expandPosStr(on_targets), + "On-target", + "Off-target" +) + +cat("\nOn / Off target alignment counts:\n") +print(table(probable_algns$on.off.target)) + + +## 
Create summary and output formated object related to each of the criteria for +## edited site detection. + +## Matched alignments +matched_algns <- probable_algns[ + probable_algns$target.match != "No_valid_match", + ] + +matched_summary <- matched_algns %>% + dplyr::mutate( + target.match = stringr::str_replace( + string = target.match, + pattern = "\\:\\([\\w]+\\)$", + replacement = "" + ) + ) %>% + dplyr::group_by( + specimen, edit.site, aligned.sequence, target.match, target.mismatch + ) + +if( umitag_option ){ + + matched_summary <- dplyr::summarise( + matched_summary, + on.off.target = paste(sort(unique(on.off.target)), collapse = ";"), + paired.algn = paste(sort(unique(paired.algn)), collapse = ";"), + count = sum(count), + umitag = sum(umitag), + algns = sum(contrib), + orient = paste(sort(unique(as.character(strand))), collapse = ";") + ) + +}else{ + + matched_summary <- dplyr::summarise( + matched_summary, + on.off.target = paste(sort(unique(on.off.target)), collapse = ";"), + paired.algn = paste(sort(unique(paired.algn)), collapse = ";"), + count = sum(count), + algns = sum(contrib), + orient = paste(sort(unique(as.character(strand))), collapse = ";") + ) + +} + +matched_summary <- dplyr::ungroup(matched_summary) %>% + dplyr::arrange(specimen, target.match, desc(algns)) %>% + as.data.frame() + +## Paired alignments +paired_algns <- probable_algns[ + probable_algns$paired.algn %in% names(tbl_paired_algn), + ] + +paired_regions <- paired_algns %>% + dplyr::group_by(specimen, paired.algn, strand) %>% + dplyr::mutate(pos = ifelse(strand == "+", min(start), max(end))) %>% + dplyr::group_by(specimen, paired.algn) + +if( umitag_option ){ + + paired_regions <- dplyr::summarise( + paired_regions, + seqnames = unique(seqnames), + start = min(pos), + end = max(pos), + mid = start + (end-start)/2, + strand = "*", + width = end - start, + count = sum(count), + umitag = sum(umitag), + algns = sum(contrib) + ) %>% + dplyr::ungroup() + +}else{ + + paired_regions <- 
dplyr::summarise( + paired_regions, + seqnames = unique(seqnames), + start = min(pos), + end = max(pos), + mid = start + (end-start)/2, + strand = "*", + width = end - start, + count = sum(count), + algns = sum(contrib) + ) %>% + dplyr::ungroup() + +} + +if( nrow(paired_regions) > 0 ){ + + paired_regions <- paired_regions %>% + dplyr::group_by(specimen, paired.algn) %>% + dplyr::mutate( + on.off.target = ifelse( + any(sapply( + expandPosStr(unlist(on_targets[ + which( + stringr::str_extract( + names(on_targets), "[\\w\\-\\_\\.]+") %in% + treatment[[specimen]] + ) + ])), + function(x, seq, st, en){ + + match_seq <- seq == stringr::str_extract(x, "[\\w]+") + + within_start <- st <= + as.numeric(stringr::str_extract(x, "[\\w]+$")) + downstream_dist + + within_end <- en >= + as.numeric(stringr::str_extract(x, "[\\w]+$")) - downstream_dist + + match_seq & within_start & within_end + + }, + seq = seqnames, + st = start, + en = end + )), + "On-target", + "Off-target" + ) + ) %>% + dplyr::ungroup() %>% + as.data.frame() + +}else{ + + paired_regions <- dplyr::mutate( + paired_regions, + on.off.target = vector(mode = "character") + ) + +} + +## Pile up alignments +pile_up_algns <- probable_algns[ + probable_algns$clus.ori %in% names(tbl_clus_ori), +] + +pile_up_summary <- pile_up_algns %>% + dplyr::mutate( + target.match = stringr::str_replace( + string = target.match, + pattern = "\\:\\([\\w]+\\)$", + replacement = "" + ) + ) %>% + dplyr::group_by(specimen, clus.ori) + +if( umitag_option ){ + + pile_up_summary <- dplyr::summarise( + pile_up_summary, + on.off.target = paste(sort(unique(on.off.target)), collapse = ";"), + paired.algn = paste(sort(unique(paired.algn)), collapse = ";"), + count = sum(count), + umitag = sum(umitag), + algns = sum(contrib) + ) + +}else{ + + pile_up_summary <- dplyr::summarise( + pile_up_summary, + on.off.target = paste(sort(unique(on.off.target)), collapse = ";"), + paired.algn = paste(sort(unique(paired.algn)), collapse = ";"), + count = 
sum(count), + algns = sum(contrib) + ) + +} + +pile_up_summary <- dplyr::ungroup(pile_up_summary) %>% + dplyr::arrange(specimen, desc(algns)) %>% + as.data.frame() + + +# Generate stats if requested ---- +## If requested, generate stats from the analysis for qc. + +if( args$stat != FALSE ){ + + stat_summary <- function(x, y){ + + x %>% + dplyr::mutate(metric = y) %>% + dplyr::group_by(sampleName, metric) %>% + dplyr::summarize(count = sum(contrib)) %>% + dplyr::ungroup() + + } + + total_stat <- stat_summary(algnmts, "total.algns") + combined_stat <- stat_summary(probable_algns, "combined.algns") + pileup_stat <- stat_summary(pile_up_algns, "pileup.algns") + paired_stat <- stat_summary(paired_algns, "paired.algns") + matched_stat <- stat_summary(matched_algns, "matched.algns") + + on_tar_stat <- dplyr::filter( + matched_algns, on.off.target == "On-target" + ) %>% + stat_summary("ontarget.algns") + + off_tar_stat <- dplyr::filter( + matched_algns, on.off.target == "Off-target" + ) %>% + stat_summary("offtarget.algns") + + stat <- dplyr::bind_rows( + total_stat, combined_stat, pileup_stat, paired_stat, + matched_stat, on_tar_stat, off_tar_stat) + + write.table( + x = stat, file = args$stat, + sep = ",", row.names = FALSE, + col.names = FALSE, quote = FALSE + ) + +} -# Beginnin analysis ---- -if( !args$quiet ) cat("\nStarting analysis...\n") ## Specimen summary ---- # Summarize components and append to specimen table -tbl_algn_counts <- input_data$algnmts %>% +tbl_algn_counts <- algnmts %>% dplyr::mutate(specimen = factor(specimen, levels = specimen_levels)) %>% dplyr::group_by(specimen) @@ -659,8 +1144,8 @@ spec_overview <- dplyr::left_join( ## Annotate incorporation data ---- -input_data$matched_summary <- suppressMessages(dplyr::mutate( - input_data$matched_summary, +matched_summary <- suppressMessages(dplyr::mutate( + matched_summary, gene_id = assignGeneID( seqnames = stringr::str_extract(edit.site, "[\\w]+"), positions = as.numeric(stringr::str_extract(edit.site, 
"[\\w]+$")), @@ -671,8 +1156,8 @@ input_data$matched_summary <- suppressMessages(dplyr::mutate( ) )) -input_data$paired_regions <- suppressMessages(dplyr::mutate( - input_data$paired_regions, +paired_regions <- suppressMessages(dplyr::mutate( + paired_regions, gene_id = assignGeneID( seqnames = seqnames, positions = mid, @@ -683,8 +1168,8 @@ input_data$paired_regions <- suppressMessages(dplyr::mutate( ) )) -input_data$pile_up_summary <- suppressMessages(dplyr::mutate( - input_data$pile_up_summary, +pile_up_summary <- suppressMessages(dplyr::mutate( + pile_up_summary, gene_id = assignGeneID( seqnames = stringr::str_extract(clus.ori, "[\\w]+"), positions = as.numeric(stringr::str_extract(clus.ori, "[\\w]+$")), @@ -698,7 +1183,7 @@ input_data$pile_up_summary <- suppressMessages(dplyr::mutate( ## On-target summary ---- # Algnmts -tbl_ot_algn <- input_data$algnmts %>% +tbl_ot_algn <- algnmts %>% dplyr::mutate( specimen = factor(specimen, levels = sort(unique(sample_info$specimen))) ) %>% @@ -716,7 +1201,7 @@ tbl_ot_algn <- input_data$algnmts %>% as.data.frame() # Probable edited sites -tbl_ot_prob <- input_data$probable_algns %>% +tbl_ot_prob <- probable_algns %>% dplyr::mutate( specimen = factor(specimen, levels = sort(unique(sample_info$specimen))) ) %>% @@ -734,7 +1219,7 @@ tbl_ot_prob <- input_data$probable_algns %>% as.data.frame() # Pile ups of read alignments -tbl_ot_pile <- input_data$pile_up_algns %>% +tbl_ot_pile <- pile_up_algns %>% dplyr::mutate( specimen = factor(specimen, levels = sort(unique(sample_info$specimen))) ) %>% @@ -752,7 +1237,7 @@ tbl_ot_pile <- input_data$pile_up_algns %>% as.data.frame() # Paired or flanking algnments -tbl_ot_pair <- input_data$paired_regions %>% +tbl_ot_pair <- paired_regions %>% dplyr::mutate( specimen = factor(specimen, levels = sort(unique(sample_info$specimen))), on.off.target = factor( @@ -771,7 +1256,7 @@ tbl_ot_pair <- input_data$paired_regions %>% as.data.frame() # Guide RNA matched within 6 mismatches -tbl_ot_match 
<- input_data$matched_summary %>% +tbl_ot_match <- matched_summary %>% dplyr::mutate( specimen = factor(specimen, levels = sort(unique(sample_info$specimen))) ) %>% @@ -787,6 +1272,28 @@ tbl_ot_match <- input_data$matched_summary %>% dplyr::ungroup() %>% as.data.frame() +tbl_ot_eff <- matched_summary %>% + dplyr::mutate( + specimen = factor(specimen, levels = sort(unique(sample_info$specimen))) + ) %>% + dplyr::group_by(specimen, on.off.target, target.match) %>% + dplyr::summarise(cnt = sum(algns)) %>% + dplyr::ungroup() %>% + dplyr::group_by(specimen, target.match) %>% + dplyr::summarise( + ot_eff_pct = 100 * sum(ifelse(on.off.target == "On-target", cnt, 0)) / + sum(cnt) + ) %>% + dplyr::ungroup() %>% + tidyr::spread(key = target.match, value = ot_eff_pct) %>% + tidyr::complete(specimen) %>% + as.data.frame() %>% + dplyr::left_join( + cond_overview, by = "specimen" + ) %>% + dplyr::select(specimen, condition, dplyr::everything()) + + # Summary table ot_tbl_summary <- dplyr::left_join( treatment_df, cond_overview, by = "specimen" @@ -813,7 +1320,7 @@ ot_tbl_summary <- Reduce( ## On-target incorporation distribution ---- -on_tar_dists <- input_data$matched_algns %>% +on_tar_dists <- matched_algns %>% dplyr::filter(on.off.target == "On-target") %>% dplyr::mutate( target = stringr::str_extract(string = target.match, pattern = "[\\w]+"), @@ -825,7 +1332,7 @@ on_tar_dists <- input_data$matched_algns %>% ) %>% dplyr::left_join(cond_overview, by = "specimen") %>% dplyr::select( - run.set, specimen, target, condition, + specimen, target, condition, edit.site, edit.site.dist, strand, contrib) on_tar_dens <- lapply( @@ -877,7 +1384,7 @@ sites_included <- on_tar_dists %>% ## Off-target summary ---- # All alignments -tbl_ft_algn <- input_data$algnmts %>% +tbl_ft_algn <- algnmts %>% dplyr::mutate( specimen = factor(specimen, levels = sort(unique(sample_info$specimen))) ) %>% @@ -888,7 +1395,7 @@ tbl_ft_algn <- input_data$algnmts %>% as.data.frame() # Probable edit sites 
-tbl_ft_prob <- input_data$probable_algns %>% +tbl_ft_prob <- probable_algns %>% dplyr::mutate( specimen = factor(specimen, levels = sort(unique(sample_info$specimen))) ) %>% @@ -899,7 +1406,7 @@ tbl_ft_prob <- input_data$probable_algns %>% as.data.frame() # Pile ups -tbl_ft_pile <- input_data$pile_up_algns %>% +tbl_ft_pile <- pile_up_algns %>% dplyr::mutate( specimen = factor(specimen, levels = sort(unique(sample_info$specimen))) ) %>% @@ -910,7 +1417,7 @@ tbl_ft_pile <- input_data$pile_up_algns %>% as.data.frame() # Paired or flanked loci -tbl_ft_pair <- input_data$paired_regions %>% +tbl_ft_pair <- paired_regions %>% dplyr::mutate( specimen = factor(specimen, levels = sort(unique(sample_info$specimen))) ) %>% @@ -921,7 +1428,7 @@ tbl_ft_pair <- input_data$paired_regions %>% as.data.frame() # target sequence matched -tbl_ft_match <- input_data$matched_summary %>% +tbl_ft_match <- matched_summary %>% dplyr::mutate( specimen = factor(specimen, levels = sort(unique(sample_info$specimen))) ) %>% @@ -931,7 +1438,7 @@ tbl_ft_match <- input_data$matched_summary %>% dplyr::ungroup() %>% as.data.frame() -# Summary table +# Off-target summary table ft_tbl_summary <- dplyr::left_join( treatment_df, cond_overview, by = "specimen" ) %>% @@ -951,10 +1458,39 @@ ft_tbl_summary <- Reduce( dplyr::arrange(specimen) %>% dplyr::select(-treatment) +# Evaluation summary ---- +ot_eff_range <- tbl_ot_eff %>% + tidyr::gather(key = "target", value = "eff", -specimen, -condition) %>% + dplyr::group_by(specimen, condition) %>% + dplyr::summarise( + min = round(min(eff, na.rm = TRUE), digits = 1), + max = round(max(eff, na.rm = TRUE), digits = 1), + eff_rg = ifelse( + min == max, + sprintf("%.1f%%", max), + sprintf("%1$.1f - %2$.1f%%", min, max) + ) + ) %>% + dplyr::ungroup() %>% + dplyr::mutate(eff_rg = ifelse(grepl("Inf", eff_rg), NA, eff_rg)) %>% + dplyr::select(-min, -max) + +eval_summary <- ot_eff_range %>% + dplyr::left_join( + ft_tbl_summary, by = c("specimen", "condition") + ) %>% + 
dplyr::left_join( + tbl_algn_counts, by = "specimen" + ) %>% + dplyr::select(specimen, condition, Alignments, eff_rg, ft_match) %>% + dplyr::rename( + Specimen = specimen, Condition = condition, + "On-target\nEfficiency" = eff_rg, "Predicted\nOff-targets" = ft_match + ) ## Onco-gene enrichment analysis ---- rand_sites <- selectRandomSites( - num = nrow(input_data$paired_regions) + nrow(input_data$matched_summary), + num = nrow(paired_regions) + nrow(matched_summary), ref.genome = ref_genome, drop.extra.seqs = TRUE, rnd.seed = 1 @@ -977,9 +1513,9 @@ rand_df <- data.frame( ) paired_list <- split( - x = input_data$paired_regions, + x = paired_regions, f = cond_overview$condition[ - match(input_data$paired_regions$specimen, cond_overview$specimen) + match(paired_regions$specimen, cond_overview$specimen) ] ) @@ -1000,9 +1536,9 @@ paired_df <- dplyr::bind_rows( ) matched_list <- split( - x = input_data$matched_summary, + x = matched_summary, f = cond_overview$condition[ - match(input_data$matched_summary$specimen, cond_overview$specimen) + match(matched_summary$specimen, cond_overview$specimen) ]) matched_df <- dplyr::bind_rows( @@ -1096,7 +1632,7 @@ enrich_df <- enrich_df %>% ) ## Off-target sequence analysis ---- -ft_MESL <- input_data$matched_algns %>% +ft_MESL <- matched_algns %>% dplyr::mutate( edit.site.dist = abs(ifelse( strand == "+", @@ -1116,8 +1652,8 @@ if( nrow(ft_MESL) > 0 ){ ESL = predictESProb( x = edit.site.dist, density = on_tar_dens[[condition]] ), - gene_id = input_data$matched_summary$gene_id[ - match(edit.site, input_data$matched_summary$edit.site) + gene_id = matched_summary$gene_id[ + match(edit.site, matched_summary$edit.site) ] ) %>% dplyr::group_by(condition, edit.site, gene_id) %>% @@ -1133,7 +1669,7 @@ if( nrow(ft_MESL) > 0 ){ } -ft_seqs <- input_data$matched_summary %>% +ft_seqs <- matched_summary %>% dplyr::select( specimen, aligned.sequence, target.match, edit.site, target.mismatch, on.off.target, algns, gene_id @@ -1144,6 +1680,7 @@ 
ft_seqs <- input_data$matched_summary %>% dplyr::left_join(cond_overview, by = "specimen") %>% dplyr::left_join(ft_MESL, by = c("condition", "edit.site", "gene_id")) + if( is.null(args$support) ){ ft_seqs <- dplyr::group_by( @@ -1164,6 +1701,7 @@ if( is.null(args$support) ){ } + ft_seqs <- dplyr::arrange( ft_seqs, desc(algns), desc(MESL), target.mismatch ) %>% @@ -1233,10 +1771,21 @@ saveRDS( "spec_overview" = spec_overview, "cond_overview" = cond_overview ), - "incorp_data" = input_data, + "incorp_data" = list( + "algnmts" = algnmts, + "probable_algns" = probable_algns, + "matched_algns" = matched_algns, + "matched_summary" = matched_summary, + "paired_algns" = paired_algns, + "paired_regions" = paired_regions, + "pile_up_algns" = pile_up_algns, + "pile_up_summary" = pile_up_summary + ), "summary_tbls" = list( - "ot_tbl_summary" = ot_tbl_summary, - "ft_tbl_summary" = ft_tbl_summary + "ot_tbl_summary" = ot_tbl_summary, + "ot_eff_summary" = tbl_ot_eff, + "ft_tbl_summary" = ft_tbl_summary, + "eval_summary" = eval_summary ), "edit_models" = list( "on_tar_dists" = on_tar_dists, @@ -1254,11 +1803,16 @@ saveRDS( ) if( !file.exists(args$output) ){ + stop("\n Cannot verify existence of output file:\n ", args$output, "\n") + }else{ + if( !args$quiet ){ cat("Evaluation complete, output writen to:\n ", args$output, "\n") } + q(status = 0) + } diff --git a/tools/rscripts/generate_iGUIDE_report.R b/tools/rscripts/generate_iGUIDE_report.R index b6e16647..ed4af66e 100644 --- a/tools/rscripts/generate_iGUIDE_report.R +++ b/tools/rscripts/generate_iGUIDE_report.R @@ -269,16 +269,43 @@ signature <- paste( unique(sort(unlist(lapply(configs, "[[", "signature")))), collapse = ", ") -umitag_option <- all(unlist(lapply(configs, "[[", "UMItags"))) +## Determine processing parameters +## Some parameters will need to be an "all or nothing" approach, including: +## - UMItags +## - recoverMultihits +## Depending on these parameters others (upstream/downstream_dist, ...) 
may need +## to be consistent between runs otherwise, the primary config file (first one), +## will be used for parameterization. -upstream_dist <- unique(sapply(configs, function(x) x$upstreamDist)) -downstream_dist <- unique(sapply(configs, function(x) x$downstreamDist)) +umitag_option <- all(unlist(lapply(configs, "[[", "UMItags"))) +multihit_option <- all(unlist(lapply(configs, "[[", "recoverMultihits"))) -if( length(upstream_dist) > 1 | length(downstream_dist) > 1 ){ - stop( - "Inconsistant upstream or downstream distances between config files.\n", - "Comparisons between groups with different run specific criteria\n", - "is not recommended.") +if( multihit_option ){ + + upstream_dist <- unique(sapply(configs, function(x) x$upstreamDist)) + downstream_dist <- unique(sapply(configs, function(x) x$downstreamDist)) + pile_up_min <- unique(sapply(configs, function(x) x$pileUpMin)) + + if( + length(upstream_dist) > 1 | + length(downstream_dist) > 1 | + length(pile_up_min) > 1 + ){ + + stop( + "\n Inconsistant upstream or downstream distances between config files.\n", + " Comparisons between groups with different run specific criteria\n", + " is not recommended when considering the recover multihit option.\n" + ) + + } + +}else{ + + upstream_dist <- configs[[1]]$upstreamDist + downstream_dist <- configs[[1]]$downstreamDist + pile_up_min <- configs[[1]]$pileUpMin + } ## Combine sampleInfo files @@ -295,7 +322,7 @@ target_tbl <- eval_data$spec_info$target_tbl %>% dplyr::distinct() %>% dplyr::rename( "Nuclease" = nuclease, - "Target" = target, + "Target Name" = target, "Sequence" = sequence ) @@ -363,6 +390,9 @@ ot_tbl_summary <- eval_data$summary_tbls$ot_tbl_summary[ "ot_pile_pct", "ot_pair_pct", "ot_match_pct") ] +ot_eff_summary <- eval_data$summary_tbls$ot_eff_summary + +eval_summary <- eval_data$summary_tbls$eval_summary # On-target distribution of incorporations ---- on_tar_dists <- eval_data$edit_models$on_tar_dists @@ -379,10 +409,7 @@ ft_seqs_list <- 
eval_data$ft_data # Onco-gene enrichment analysis ---- -enrich_df <- eval_data$enrich_data$enrich_df %>% - dplyr::select( - origin, condition, total, onco, onco.p.value, special, special.p.value - ) +enrich_df <- eval_data$enrich_data$enrich_df # Data passed to Rmd for report generation ---- diff --git a/tools/rscripts/generate_iGUIDE_summary.R b/tools/rscripts/generate_iGUIDE_summary.R index 7c2e174a..f0b7378e 100644 --- a/tools/rscripts/generate_iGUIDE_summary.R +++ b/tools/rscripts/generate_iGUIDE_summary.R @@ -206,6 +206,18 @@ null <- catOrWrite( args ) +# Analysis overview table ---- +eval_summary <- eval_data$summary_tbls$eval_summary +eval_summary[is.na(eval_summary)] <- 0 + +null <- catOrWrite( + "Table 1. Analysis overview with specific data highlights.", + args +) + +null <- catOrWrite(eval_summary, args, missing = 0, style = "multiline") + +null <- catOrWrite("", args) # Specimen summary table ---- specimen_levels <- eval_data$params$specimen_levels @@ -239,7 +251,7 @@ spec_overview <- eval_data$incorp_data$algnmts %>% dplyr::left_join(spec_overview, ., by = "specimen") null <- catOrWrite( - "Table 1. Specimen overview covering reads, umitags, and alignments:", + "Table 2. Specimen overview covering reads, umitags, and alignments:", args ) @@ -254,12 +266,12 @@ target_tbl <- eval_data$spec_info$target_tbl %>% dplyr::distinct() %>% dplyr::rename( "Nuclease" = nuclease, - "Target" = target, + "Target Name" = target, "Sequence" = sequence ) %>% dplyr::mutate( "Edit Loci" = sapply( - Target, + `Target Name`, function(x){ paste(on_targets[which(names(on_targets) == x)], collapse = "\n") } @@ -267,7 +279,7 @@ target_tbl <- eval_data$spec_info$target_tbl %>% ) null <- catOrWrite( - "Table 2. Target pattern table specifying sequences and edited loci:", + "Table 3. Target pattern table specifying sequences and edited loci:", args ) @@ -289,7 +301,7 @@ names(ot_tbl_summary) <- c( ) null <- catOrWrite( - "Table 3. 
On-target editing percentages based on alignment criteria:", + "Table 4. On-target editing percentages based on alignment criteria:", args ) @@ -330,7 +342,7 @@ on_tar_dist_summary <- on_tar_dists %>% ) null <- catOrWrite( - "Table 4. On-target incorporation profile, quantile counts given in % columns:", + "Table 5. On-target incorporation profile, quantile counts given in % columns:", args ) @@ -338,6 +350,18 @@ null <- catOrWrite(on_tar_dist_summary, args, style = "multiline", missing = 0) null <- catOrWrite("", args) +# On-target editing efficiency ---- +ot_eff_summary <- eval_data$summary_tbls$ot_eff_summary +names(ot_eff_summary)[c(1,2)] <- c("Specimen", "Condition") + +null <- catOrWrite( + "Table 6. Estimate of On-target editing efficiency (percent) for each target by specimen.", + args +) + +null <- catOrWrite(ot_eff_summary, args, style = "multiline", missing = "-") + +null <- catOrWrite("", args) # Off-target summary ---- ft_tbl_summary <- eval_data$summary_tbls$ft_tbl_summary[ @@ -350,7 +374,7 @@ names(ft_tbl_summary) <- c( ) null <- catOrWrite( - "Table 5. Off-target loci counts from criteria-based alignments:", + "Table 7. Off-target loci counts from criteria-based alignments:", args ) @@ -382,7 +406,7 @@ names(enrich_df) <- c( ) null <- catOrWrite( - "Table 6. Off-target gene enrichment:", + "Table 8. 
Off-target gene enrichment:", args ) @@ -420,19 +444,42 @@ if( nrow(enrich_df) > 0 ){ null <- catOrWrite("", args) # Off-target sequence analysis ---- +nuc_profiles <- eval_data$spec_info$nuclease_profiles ft_seqs_list <- eval_data$ft_data +full_target_seqs <- structure( + sapply(seq_len(nrow(target_tbl)), function(i){ + + nuc <- target_tbl$Nuclease[i] + sequence <- target_tbl$Sequence[i] + + ifelse( + nuc_profiles[[nuc]]$PAM_Loc == "3p", + paste0(sequence, nuc_profiles[[nuc]]$PAM), + ifelse( + nuc_profiles[[nuc]]$PAM_Loc == "5p", + paste0(nuc_profiles[[nuc]]$PAM, sequence), + sequence + ) + ) + + }), + names = target_tbl$`Target Name` +) + null <- lapply(seq_along(ft_seqs_list), function(i){ - null <- catOrWrite(paste0("Table ", i+6, ". Off-Target Loci:"), args) + null <- catOrWrite(paste0("Table ", i+8, ". Off-Target Loci:"), args) null <- catOrWrite(paste0(" Condition : ", names(ft_seqs_list)[i]), args) + target_ref_seq <- full_target_seqs[unique(ft_seqs_list[[i]]$target.seq)] + null <- dplyr::select( ft_seqs_list[[i]], target, gene_id, edit.site, aligns, MESL, aligned.sequence, mismatch ) %>% dplyr::mutate( - aligned.sequence = divSeq(aligned.sequence, aligned.sequence[1]) + aligned.sequence = divSeq(aligned.sequence, target_ref_seq) ) %>% dplyr::rename( "Target" = target, "Gene ID" = gene_id, "Edit Site" = edit.site, diff --git a/tools/rscripts/write_stat_report.R b/tools/rscripts/generate_stat_report.R similarity index 88% rename from tools/rscripts/write_stat_report.R rename to tools/rscripts/generate_stat_report.R index c461e615..a553bad8 100644 --- a/tools/rscripts/write_stat_report.R +++ b/tools/rscripts/generate_stat_report.R @@ -11,7 +11,7 @@ # iguide installation directory, will look for sys argument 'IGUIDE_DIR' # -options(stringsAsFactors = FALSE, scipen = 99, width = 180) +options(stringsAsFactors = FALSE, scipen = 99, width = 120) args <- commandArgs(trailingOnly = TRUE) @@ -22,10 +22,11 @@ code_dir <- dirname(sub( )) # Check input file ---- 
-input_file <- args[1] +core_file <- args[1] +eval_file <- args[2] -if( !file.exists(input_file) ){ - stop("\n Cannot find input stat file. Check inputs.") +if( !file.exists(core_file) | !file.exists(eval_file) ){ + stop("\n Cannot find input stat files. Check inputs.") } # Check output file ---- @@ -134,7 +135,10 @@ build_version <- list.files(file.path(iguide_dir, "etc")) %>% signature <- config[["signature"]] # Load input data ---- -stat_df <- read.csv(input_file) %>% +core_stat_df <- read.csv(core_file) +eval_stat_df <- read.csv(eval_file) + +stat_df <- dplyr::full_join(core_stat_df, eval_stat_df, by = "sampleName") %>% dplyr::mutate_all(function(x) ifelse(is.na(x), rep(0, length(x)), x)) sampleName_levels <- unique(stat_df$sampleName) @@ -165,9 +169,9 @@ names(read_tbl) <- stringr::str_replace(names(read_tbl), ".reads$", "") # Alignment outcome table ---- algn_tbl <- dplyr::select( - stat_df, sampleName, align.unique.reads, align.unique.algns, align.unique.loci, - align.multihit.reads, align.multihit.lengths, align.multihit.clusters, - align.chimera.reads + stat_df, sampleName, align.unique.reads, align.unique.algns, + align.unique.loci, align.multihit.reads, align.multihit.lengths, + align.multihit.clusters, align.chimera.reads ) %>% dplyr::filter(sampleName %in% sampleNames) %>% dplyr::mutate(sampleName = factor(sampleName, levels = sampleNames)) %>% @@ -178,15 +182,15 @@ names(algn_tbl) <- stringr::str_replace(names(algn_tbl), "align.", "") # Incorporation breakdown table ---- incorp_tbl <- dplyr::select( - stat_df, sampleName, assim.total.algns, assim.combined.algns, - assim.pileup.algns, assim.paired.algns, assim.matched.algns, - assim.ontarget.algns, assim.offtarget.algns + stat_df, sampleName, eval.total.algns, eval.combined.algns, + eval.pileup.algns, eval.paired.algns, eval.matched.algns, + eval.ontarget.algns, eval.offtarget.algns ) %>% dplyr::filter(sampleName %in% sampleNames) %>% dplyr::mutate(sampleName = factor(sampleName, levels = sampleNames)) 
%>% dplyr::arrange(sampleName) -names(incorp_tbl) <- stringr::str_replace(names(incorp_tbl), "assim.", "") +names(incorp_tbl) <- stringr::str_replace(names(incorp_tbl), "eval.", "") names(incorp_tbl) <- stringr::str_replace(names(incorp_tbl), ".algns$", "") diff --git a/tools/rscripts/report_templates/iGUIDE_report_template.Rmd b/tools/rscripts/report_templates/iGUIDE_report_template.Rmd index 1749dcae..742e4830 100644 --- a/tools/rscripts/report_templates/iGUIDE_report_template.Rmd +++ b/tools/rscripts/report_templates/iGUIDE_report_template.Rmd @@ -88,10 +88,23 @@ if( args$figures != FALSE ){ report_format <- ifelse(args$format == "pdf", "latex", "html") # Helpful functions -pNums <- function(x, ...){ +pNums <- function(x, na.replace = 0, ...){ - x <- ifelse(is.na(x), 0, x) - format(x, big.mark = ",", ...) + if( is.numeric(na.replace) )x <- ifelse(is.na(x), na.replace, x) + + x <- format(x, big.mark = ",", ...) + + if( is.character(na.replace) ){ + replace_idx <- which(stringr::str_trim(x) == "NA") + char_width <- unique(nchar(x)) + x[replace_idx] <- stringr::str_pad( + string = rep(na.replace, length(replace_idx)), + width = char_width - nchar(na.replace), + side = "left" + ) + } + + x } @@ -129,6 +142,7 @@ latex_options <- c("hold_position", "repeat_header") # Captions tbl_caps <- c( + "Analysis summary.", "Specimen summary.", "Target sequences and associated information.", "Percent On-target.", @@ -137,13 +151,20 @@ tbl_caps <- c( ) fig_caps <- c( - "Distance distribution of observed incorporation sites from On-target loci.", "Genomic distribution of incorporation sites by bioinformatic characteristics.", - "Sequence similarity between off-target sites and targeting sequence(s)." + "Sequence similarity between off-target sites and targeting sequence(s).", + "Distance distribution of observed incorporation sites from On-target loci." ) if( args$format == "html" ){ - tbl_caps <- paste0("Table ", 1:5, ". 
", tbl_caps) + + tbl_nums <- if( nrow(target_tbl) == 1 ){ + tbl_nums <- 1:6 + }else{ + tbl_nums <- c(1:4,6,7) + } + + tbl_caps <- paste0("Table ", tbl_nums, ". ", tbl_caps) } fig_caps <- paste0("**Figure ", 1:3, "**. ", fig_caps) @@ -171,7 +192,34 @@ if( args$format == "pdf" ){ # Summary -The following document summarizes the results of processing `r gsub("_", "-", set_names)` sequencing set(s) through the iGUIDE pipeline. Included in this document are explanations of the data analytics as well as tables and graphics of the data obtained from the sequence analysis. This report includes `r length(unique(sample_info$specimen))` specimens treated with `r nrow(target_tbl)` targeting sequences. A total of `r pNums(round(sum(incorp_data$algnmts$count)))` reads are considered in this analysis, which represent `r pNums(round(sum(incorp_data$algnmts$contrib)))` inferred cells sampled. +The following document summarizes the results of processing `r gsub("_", "-", set_names)` sequencing set(s) through the iGUIDE pipeline. Included in this document are explanations of the data analytics as well as tables and graphics of the data obtained from the sequence analysis. This report includes `r length(unique(sample_info$specimen))` specimens treated with `r nrow(target_tbl)` targeting sequences. A total of `r pNums(round(sum(incorp_data$algnmts$count)))` reads are considered in this analysis, which represent `r pNums(round(sum(incorp_data$algnmts$contrib)))` observed incorporated double-stranded oligo-dinucleotides (dsODNs, a unit of measure associated with iGUIDE or GUIDE-seq based analyses). + +**Table 1** highlights some key information from the data analysis for each specimen, including the total number of alignments (representing the observed incorporated dsODNs), an estimated range of On-target editing efficiency, and the number of predicted off-target sites. 
+ +```{r report_summary} +eval_summary[is.na(eval_summary)] <- 0 + +if( args$tables ){ + + write.csv( + x = eval_summary, + file = file.path(tables_path, "tbl1.report.summary.csv"), + quote = TRUE, row.names = FALSE + ) + +} + +kable( + x = eval_summary, format = report_format, row.names = FALSE, + booktabs = TRUE, longtable = TRUE, escape = TRUE, caption = tbl_caps[1], + format.args = list(big.mark = ","), align = c("l", "l", "r", "c", "r") + ) %>% + kableExtra::kable_styling( + bootstrap_options = bootstrap_options, + latex_options = latex_options + ) + +``` ```{r} if( args$format == "pdf" ){ @@ -185,12 +233,13 @@ if( args$format == "pdf" ){ ```{r spec_summary} spec_overview[is.na(spec_overview)] <- 0 +names(spec_overview)[1] <- "Specimen" if( args$tables ){ write.csv( x = spec_overview, - file = file.path(tables_path, "tbl1.specimen.overview.csv"), + file = file.path(tables_path, "tbl2.specimen.overview.csv"), quote = TRUE, row.names = FALSE ) @@ -198,7 +247,7 @@ if( args$tables ){ kable( x = spec_overview, format = report_format, row.names = FALSE, - booktabs = TRUE, longtable = TRUE, escape = FALSE, caption = tbl_caps[1], + booktabs = TRUE, longtable = TRUE, escape = FALSE, caption = tbl_caps[2], format.args = list(big.mark = ",") ) %>% kableExtra::kable_styling( @@ -210,7 +259,7 @@ kable( Each specimen started in the iGUIDE pipeline as genomic DNA. The gDNA was randomly sheared through ultrasonication and ligated with barcoded DNA linkers. Nested-PCR was used to amplify from incorporated dsODN sequences to the linker sequences with barcoded and linker-specific primers. This dual barcoding reduces sample to sample crossover. Amplicons were sequenced on an Illumina platform and the sequencing data processed with the iGUIDE software, available on [**GitHub@cnobles/iGUIDE**](https://github.com/cnobles/iGUIDE). -DNA sequence reads were aligned to the `r unique(sapply(configs, "[[", "Ref_Genome"))` reference genome. 
The number of reads aligning for each specimen is displayed in **Table 1**, along with the number of unique alignments they represent (the inferred cells sampled). Multiple reads may represent a singular alignment of genomic DNA, inherent to sequence analysis of amplified DNA. These alignments indicate individual events of dsODN incorporation and clonal expansion. +DNA sequence reads were aligned to the `r unique(sapply(configs, "[[", "Ref_Genome"))` reference genome. The number of reads aligning for each specimen is displayed in **Table 2**, along with the number of unique "alignments" they represent (or the number of observed incorporated dsODNs). Multiple reads may represent a singular alignment of genomic DNA, inherent to sequence analysis of amplified DNA. These alignments indicate individual events of dsODN incorporation and/or clonal expansion. Alternatively, random nucleotide sequences are included in the ligated linker sequences. These Unique Molecular Indeces (UMItags) can provide another method of abundance by counting the number of UMItags and breakpoint position combinations for each incorporation sites. This method of quantification has an increased dynamic range, yet can suffer from PCR artifacts leading to inflated abundances. @@ -224,11 +273,11 @@ if( args$format == "pdf" ){ # On-target analysis -Incorporation sites, or locations in the genome where the dsODN was detected, are expected to be in the proximity of nuclease targeted locations. The target sequences provided for these analyses and their On-target locations (loci) are shown in **Table 2**. The genomic locations are in a format where chromosome, orientation, and nucleotide position are delimited by a colon (":"). +Incorporation sites, or locations in the genome where the dsODN was detected, are expected to be in the proximity of nuclease targeted locations. The target sequences provided for these analyses and their On-target locations (`Edit Locus`) are shown in **Table 3**. 
The genomic locations are in a format where chromosome, orientation, and nucleotide position are delimited by a colon (":"). ```{r target_tbl} target_tbl$`Edit Locus` <- sapply( - target_tbl$Target, + target_tbl$`Target Name`, function(x) paste(on_targets[which(names(on_targets) == x)], collapse = "\n") ) @@ -239,7 +288,7 @@ if( args$tables ){ "Edit Locus" = gsub("\n", ";", `Edit Locus`) ) %>% write.csv( - file = file.path(tables_path, "tbl2.target.sequences.csv"), + file = file.path(tables_path, "tbl3.target.sequences.csv"), quote = TRUE, row.names = FALSE ) @@ -273,7 +322,7 @@ if( report_format == "html" ){ kable( x = target_print, format = report_format, row.names = FALSE, - booktabs = TRUE, longtable = TRUE, escape = FALSE, caption = tbl_caps[2], + booktabs = TRUE, longtable = TRUE, escape = FALSE, caption = tbl_caps[3], align = "c" ) %>% kableExtra::kable_styling( @@ -284,15 +333,15 @@ kable( ``` -Analysis of On-target associated incorporation sites (**Table 3**) produces several features that are helpful in On- and Off-target site characterization. These include the following: +Analysis of On-target associated incorporation sites (**Table 4**) produces several features that are helpful in On- and Off-target site characterization. These include the following: * Alignment **Pileups**: unique alignments that overlap with each other or "pileup", suggesting a nearby location may be targeted for a double strand break (DSB). For this analyses, any group of `r unique(sapply(configs, "[[", "pileUpMin"))` or more unique alignments were considered as a pileup cluster. * Flanking **Paired** alignments: alignments can be found on either side of a DSB, and therefore identifying flanking alignments suggests a DSB could be found between the paired alignments. Flanking alignments were searched for in these data up to `r 2*unique(sapply(configs, "[[", "upstreamDist"))` bp from each other. 
-* Target **Matched** alignments: searching for the target sequences upstream of the incorporation site can be an indicator of targeted nuclease activity. While this indicator may seem to be crucial, guide RNAs have been demonstrated to have a variety of behaviors when annealing to target DNA, not all of which can be easily searched for with a simple sequence alignment. Nucleotide sequence matching target sequences were searched for up to `r unique(sapply(configs, "[[", "upstreamDist"))` bp upstream of the incorporation sites and required to have no more than `r unique(sapply(configs, "[[", "maxTargetMismatch"))` mismatches. +* Target **Matched** alignments: searching for the target sequences upstream of the incorporation site can be an indicator of targeted nuclease activity. While this indicator may seem to be crucial, guide RNAs have been demonstrated to have a variety of behaviors when annealing to target DNA, not all of which can be easily searched for with a simple sequence alignment. Nucleotide sequence matching target sequences were searched for up to `r upstream_dist` bp upstream of the incorporation sites and required to have no more than `r unique(sapply(configs, "[[", "maxTargetMismatch"))` mismatches in the target sequence and/or PAM sequence. -Specimen specific tables with data relating to these criteria are found in **Table 3** for percent On-target editing and **Table 4** for identified Off-target loci. +Specimen specific tables with data relating to these criteria are found in **Table 4** for percent On-target editing and **Table `r ifelse(nrow(target_tbl) > 1, 6, 5)`** for identified Off-target loci. ```{r} if( args$format == "pdf" ){ @@ -302,9 +351,9 @@ if( args$format == "pdf" ){ } ``` -## Specimen breakdown +## On-target editing efficiency -**Table 3** displays the percent of cells sampled that were associated with On-target loci for **All** alignments. 
Further the percentages for **Pileups**, **Paired**, and **Matched** criteria are displayed in the following columns. +**Table 4** displays the percent of observations (efficiency or specificity) that were associated with all On-target loci for **All** alignments. Further the efficiencies for **Pileups**, **Paired**, and **Matched** criteria are displayed in the following columns. These different criteria are used as the denominator to dictate the amount of observed nuclease-specific editing. This is an estimate though, as On-target editing does have the potential to saturate the dynamic range of the abundance calculation. Therefore, these percentages should be considered lower bounds for editing efficiency and specificity. ```{r on_target_summary} ot_tbl_summary[is.na(ot_tbl_summary)] <- 0 @@ -320,7 +369,7 @@ if( args$tables ){ write.csv( x = ot_tbl_save, - file = file.path(tables_path, "tbl3.ontarget.summary.csv"), + file = file.path(tables_path, "tbl4.ontarget.summary.csv"), quote = TRUE, row.names = FALSE ) @@ -341,7 +390,7 @@ names(ot_tbl_summary) <- c( kable( x = ot_tbl_summary, format = report_format, digits = 4, row.names = FALSE, format.args = list(big.mark = ","), booktabs = TRUE, longtable = TRUE, - escape = FALSE, caption = tbl_caps[3], align = c("l", "l", rep("r", 4)) + escape = FALSE, caption = tbl_caps[4], align = c("l", "l", rep("r", 4)) ) %>% kableExtra::add_header_above( c(" " = 2, "All", "Pileup", "Paired", "Matched") ) %>%
') -} -``` - -## Editing near known genomic sites - -**Figure 1** displays the distribution of dsODN incorporations around on-target site(s). Incorporations in different orientations are shown on the positive (red) and negative (blue) y-axis. The percentage in the bottom right corner of each plot is an estimate of the number of incorporations associated with the on-target site (based on pileups) captured within the allowed window of `r unique(sapply(configs, "[[", "upstreamDist"))` bps. These data can be used to fine tune the processing analyses, specifically the `upstreamDist` parameter which modifies the distance upstream of incorporation sites to search for nuclease edited sequences. - -```{r} -incorp_len <- ifelse( - nrow(eval_data$spec_info$supp_data) > 0, - length(unique(paste(on_tar_dists$condition, on_tar_dists$target))), - length(unique(on_tar_dists$target)) -) -``` - -```{r incorp_dist} -incorp_plot <- ggplot(on_tar_dists, aes(x = edit.site.dist, y = strand.cnt)) + - geom_vline(xintercept = 0, color = "black", linetype = "dotted") + - geom_col(aes(fill = factor(strand)), width = 1) + - geom_text( - data = sites_included, - aes(x = x_pos, y = y_pos, label = prop), - hjust = 1, fontface = "bold", size = 5) + - coord_cartesian(xlim = c(-upstream_dist, upstream_dist)) + - scale_y_continuous(breaks = pretty_breaks(), labels = pNums) + - scale_fill_brewer(type = "qual", palette = "Set1") + - guides(fill = FALSE) + - labs( - x = "Distance to Edit Site (bp, res = 1)", - y = "Log Alignment Count") + - custom_theme + - theme( - strip.text.y = element_text(angle = 0), - aspect.ratio = 7/12 - ) - - -null <- lapply( - unique(ceiling(seq_len(incorp_len) / 2)), - function(i, ic_plot, fp, supp_present){ - - if( supp_present ){ - - p <- ic_plot + - facet_wrap_paginate( - condition ~ target, ncol = 2, nrow = 1, page = i, scales = "free" - ) - - }else{ - - p <- ic_plot + - facet_wrap_paginate( - ~ target, ncol = 2, nrow = 1, page = i, scales = "free" - ) - - } - - - 
file_pdf <- sprintf("incorp_dist-%s.pdf", i) - file_png <- sprintf("incorp_dist-%s.png", i) - - ggsave( - filename = file_pdf, - plot = p, - device = "pdf", - path = fp, - width = figure_width, - height = 3, - units = "in" - ) - - ggsave( - filename = file_png, - plot = p, - device = "png", - path = fp, - width = figure_width, - height = 3, - units = "in", - dpi = knitr::opts_chunk$get("dpi") - ) - - }, - ic_plot = incorp_plot, - fp = figure_path, - supp_present = nrow(eval_data$spec_info$supp_data) > 0 -) - -if( args$format == "pdf"){ - - knitr::include_graphics( - path = sprintf( - file.path(figure_path, "incorp_dist-%s.pdf"), - unique(ceiling(seq_len(incorp_len) / 2)) - ) - ) - -}else{ - - knitr::include_graphics( - path = sprintf( - file.path(figure_path, "incorp_dist-%s.png"), - unique(ceiling(seq_len(incorp_len) / 2)) - ) - ) - -} - -``` - -```{r} -cat(fig_caps[1]) +```{r target_spec_eff, eval=any(sapply(treatment, length) > 1), child=file.path(root_dir, "tools/rscripts/report_templates/target_specific_eff_template.Rmd")} ``` ```{r} @@ -481,9 +414,9 @@ if( args$format == "pdf" ){ ``` # Off-target analysis -## Specimen breakdown +## Specimen information -Using the criteria discussed previously based on characterizing features of nuclease targeted sites, off-target sites can be selected from the data in an unbiased manner. **Table 4** shows a summary of the unique off-target locations (loci) observed in the data. For **All** alignments, the loci are based on overlapping alignments (pileup cluster), without a minimum number of fragments required to be classified as a pileup cluster. **Pileup** loci are similarly based on overlapping alignments, but require at least `r unique(sapply(configs, "[[", "pileUpMin"))` alignments to form a cluster. Flanking **Paired** loci require at least two unique alignments with opposite orientation (strands). 
Target **Matched** loci require a match in the upstream sequence to a treated target (within `r unique(sapply(configs, "[[", "maxTargetMismatch"))` mismatches out of the `r unique(nchar(target_tbl$Sequence))` nts`r if(any(sapply(nuc_profiles, "[[", "PAM") != FALSE)){paste0(" and ", max(sapply(nuc_profiles, function(x){if(x$PAM != FALSE){x$PAM_Tol}else{0}})), " PAM mismatch")}`). +Using the criteria discussed previously based on characterizing features of nuclease targeted sites, off-target sites can be selected from the data in an unbiased manner. **Table `r ifelse(nrow(target_tbl) > 1, 6, 5)`** shows a summary of the unique off-target locations (loci) observed in the data. For **All** alignments, the loci are based on overlapping alignments (pileup clustering) without a minimum number of fragments required to be classified as a pileup cluster. **Pileup** loci are similarly based on overlapping alignments, but require at least `r unique(sapply(configs, "[[", "pileUpMin"))` alignments to form a cluster. Flanking **Paired** loci require at least two unique alignments with opposite orientation (strands) within `r pNums(2*upstream_dist, digits = 0)` bp upstream of each other. Target **Matched** loci require a match in the upstream sequence to a treated target (within `r unique(sapply(configs, "[[", "maxTargetMismatch"))` mismatches out of the `r unique(nchar(target_tbl$Sequence))` nts`r if(any(sapply(nuc_profiles, "[[", "PAM") != FALSE)){paste0(" and ", max(sapply(nuc_profiles, function(x){if(x$PAM != FALSE){x$PAM_Tol}else{0}})), " PAM mismatch")}`). 
```{r off_target_summary} ft_tbl_summary[is.na(ft_tbl_summary)] <- 0 @@ -494,6 +427,8 @@ names(ft_tbl_summary) <- c( if( args$tables ){ + tbl_num <- ifelse(nrow(target_tbl) > 1, 6, 5) + ft_tbl_save <- ft_tbl_summary names(ft_tbl_save) <- c( @@ -503,7 +438,9 @@ if( args$tables ){ write.csv( x = ft_tbl_save, - file = file.path(tables_path, "tbl4.offtarget.summary.csv"), + file = file.path( + tables_path, paste0("tbl", tbl_num, ".offtarget.summary.csv") + ), quote = TRUE, row.names = FALSE ) } @@ -511,7 +448,7 @@ if( args$tables ){ kable( x = ft_tbl_summary, format = report_format, digits = 1, row.names = FALSE, format.args = list(big.mark = ","), booktabs = TRUE, longtable = TRUE, - escape = FALSE, caption = tbl_caps[4] + escape = FALSE, caption = tbl_caps[5] ) %>% kableExtra::add_header_above( c(" " = 2, "All", "Pileup", "Paired", "Matched") @@ -533,7 +470,7 @@ if( args$format == "pdf" ){ ## Off-target enrichment in cancer-associated genes -Flanking **Paired** loci and Target **Matched** loci are tested for enrichment against specific gene lists in **Table 5**. The cancer-associated and special gene lists (adjusted in the config file) included in this analysis were: `r paste(unique(c(sapply(configs, function(x) x$oncoGeneList$file), sapply(configs, function(x) x$specialGeneList$file))), collapse = " and ")`. Enrichment was tested by Fisher's Exact and p-values were adjusted for multiple comparisons using a Benjamani-Hochberg correction. Omitted specimens or conditions had insufficient data for this analysis (Total Gene Count = 0). +Flanking **Paired** loci and Target **Matched** loci are tested for enrichment against specific gene lists in **Table `r ifelse(nrow(target_tbl) > 1, 7, 6)`**. The cancer-associated and special gene lists included in this analysis were: `r paste(unique(c(sapply(configs, function(x) x$oncoGeneList$file), sapply(configs, function(x) x$specialGeneList$file))), collapse = " and ")`. 
Enrichment was tested by Fisher's Exact and p-values were adjusted for multiple comparisons using a Benjamini-Hochberg correction. Omitted specimens or conditions had insufficient data for this analysis (Total Gene Count = 0) or did not have enough data to support a powerful analysis (Estimated Power less than 80%). ```{r onco_enrichment} enrich_df <- enrich_df %>% @@ -544,6 +481,8 @@ enrich_df <- enrich_df %>% if( args$tables ){ + tbl_num <- ifelse(nrow(target_tbl) > 1, 7, 6) + enrich_save <- enrich_df names(enrich_save) <- c( @@ -553,44 +492,62 @@ if( args$tables ){ write.csv( x = enrich_save, - file = file.path(tables_path, "tbl5.gene.enrichment.csv"), + file = file.path( + tables_path, paste0("tbl", tbl_num, ".gene.enrichment.csv") + ), quote = TRUE, row.names = FALSE ) } -enrich_df <- enrich_df %>% +enrich_print <- enrich_df %>% dplyr::mutate( + onco.power = round(onco.power * 100, digits = 0), onco.is.sig = onco.p.value <= 0.05, - onco.p.value = sprintf("%.3f", onco.p.value), + onco.p.value = sprintf("%1$.3f (%2$i%%)", onco.p.value, onco.power), onco.p.value = kableExtra::cell_spec( onco.p.value, format = report_format, bold = onco.is.sig ), + special.power = round(special.power * 100, digits = 0), special.is.sig = special.p.value <= 0.05, - special.p.value = sprintf("%.3f", special.p.value), + special.p.value = sprintf( + "%1$.3f (%2$i%%)", special.p.value, special.power + ), special.p.value = kableExtra::cell_spec( special.p.value, format = report_format, bold = special.is.sig ) ) %>% - select(-onco.is.sig, -special.is.sig) + dplyr::filter( + origin == "Reference" | onco.power >= 80 | special.power >= 80 + ) %>% + dplyr::select(-onco.is.sig, -special.is.sig, -onco.power, -special.power) -names(enrich_df) <- c( - "Origin", "Condition", "genes", "genes", "p-value", "genes", "p-value" +names(enrich_print) <- c( + "Origin", "Condition", "genes", "genes", "p-value (pwr)", + "genes", "p-value (pwr)" ) -kable( - x = enrich_df, format = report_format, digits = 
4, row.names = FALSE, - format.args = list(big.mark = ","), booktabs = TRUE, longtable = TRUE, - escape = FALSE, caption = tbl_caps[5], - align = c("l", "l", rep("r", 5)) - ) %>% - kableExtra::add_header_above( - c(" " = 2, "Total", "Onco Enrich." = 2, "Special Enrich." = 2) - ) %>% - kableExtra::kable_styling( - bootstrap_options = bootstrap_options, - latex_options = latex_options - ) +enrich_print[1,c(5,7)] <- "" + +if( nrow(enrich_print) > 1 ){ + + kable( + x = enrich_print, format = report_format, row.names = FALSE, + format.args = list(big.mark = ","), booktabs = TRUE, longtable = TRUE, + escape = FALSE, caption = tbl_caps[6], + align = c("l", "l", rep("r", 5)) + ) %>% + kableExtra::add_header_above( + c(" " = 2, "Total", "Onco Enrich." = 2, "Special Enrich." = 2) + ) %>% + kableExtra::kable_styling( + bootstrap_options = bootstrap_options, + latex_options = latex_options + ) + +}else{ + cat("**All comparisons were omitted due to lack of sufficient data.**") +} ``` @@ -684,7 +641,7 @@ if( args$format == "pdf"){ ``` ```{r} -cat(fig_caps[2]) +cat(fig_caps[1]) ``` ```{r} @@ -697,7 +654,7 @@ if( args$format == "pdf" ){ ## Off-target sequence comparison -Off-target sites can be identified by sequence similarity within `r format(upstream_dist, big.mark = ",")` bp upstream of incorporation sites. The sequences of the target matched sites are displayed below in Figure 3 along with the number of mismatches to the targeting sequence (`Mismatch`), an indication if the site is associated with an on- or off-target location (`Target`), the total number of unique alignments associated with the site (`Abund.`), the maximum edit site likelyhood (`MESL`), and an identifier denoted by the nearest gene (`Gene_ID`). MESL is a score for the percentage likelyhood the off-target site is associated with directed nuclease editing, based solely on the respective On-target incorporation distribution. The gene name within the `Gene_ID` is the nearest gene to the genomic location. 
Further, symbols after the gene name indicate: `*` that the site is within the transcription unit of the gene, `~` the gene appears on the cancer-association list, `!` and that the gene appears on the special gene list. For this report, gene lists used were: `r paste(unique(c(sapply(configs, function(x) x$oncoGeneList$file), sapply(configs, function(x) x$specialGeneList$file))), collapse = " and ")`. +Off-target sites can be identified by sequence similarity within `r format(upstream_dist, big.mark = ",")` bp upstream of incorporation sites. The sequences of the top target matched sites are displayed below in Figure 3 for each condition and target along with the number of mismatches to the targeting sequence (`Mismatch`), an indication if the site is associated with an On- or Off-target location (`Target`), the total number of unique alignments associated with the site (`Abund.`), the maximum edit site likelihood (`MESL`), and an identifier denoted by the nearest gene (`Gene_ID`). MESL is a score for the percentage likelihood the off-target site is associated with directed nuclease editing, based solely on the respective On-target incorporation distribution. The gene name within the `Gene_ID` is the nearest gene to the genomic location. Further, symbols after the gene name indicate: `*` that the site is within the transcription unit of the gene, `~` the gene appears on the cancer-association list, `!` and that the gene appears on the special gene list. For this report, gene lists used were: `r paste(unique(c(sapply(configs, function(x) x$oncoGeneList$file), sapply(configs, function(x) x$specialGeneList$file))), collapse = " and ")`. For a complete list of predicted Off-target sites identified by the analysis, please see the Supporting material section, Off-target site tables. 
```{r off_target_seqs} ft_seq_plots <- lapply( @@ -816,6 +773,127 @@ if( args$format == "pdf"){ ``` +```{r} +cat(fig_caps[2]) +``` + +```{r} +if( args$format == "pdf" ){ + cat("\\newpage") +}else{ + cat('
') +} +``` + +# Supporting material + +## Editing distributions at On-target loci + +**Figure 3** displays the distribution of dsODN incorporations around On-target site(s). Incorporations in different orientations are shown on the positive (red) and negative (blue) y-axis. The percentage in the bottom right corner of each plot is an estimate of the number of incorporations associated with the on-target site (based on pileups) captured within the allowed window of `r unique(sapply(configs, "[[", "upstreamDist"))` bps. These data can be used to fine tune the processing analyses, specifically the `upstreamDist` parameter which modifies the distance upstream of incorporation sites to search for nuclease edited sequences. + +```{r} +incorp_len <- ifelse( + nrow(eval_data$spec_info$supp_data) > 0, + length(unique(paste(on_tar_dists$condition, on_tar_dists$target))), + length(unique(on_tar_dists$target)) +) +``` + +```{r incorp_dist} +incorp_plot <- ggplot(on_tar_dists, aes(x = edit.site.dist, y = strand.cnt)) + + geom_vline(xintercept = 0, color = "black", linetype = "dotted") + + geom_col(aes(fill = factor(strand)), width = 1) + + geom_text( + data = sites_included, + aes(x = x_pos, y = y_pos, label = prop), + hjust = 1, fontface = "bold", size = 5) + + coord_cartesian(xlim = c(-upstream_dist, upstream_dist)) + + scale_y_continuous(breaks = pretty_breaks(), labels = pNums) + + scale_fill_brewer(type = "qual", palette = "Set1") + + guides(fill = FALSE) + + labs( + x = "Distance to Edit Site (bp, res = 1)", + y = "Log Alignment Count") + + custom_theme + + theme( + strip.text.y = element_text(angle = 0), + aspect.ratio = 7/12 + ) + + +null <- lapply( + unique(ceiling(seq_len(incorp_len) / 2)), + function(i, ic_plot, fp, supp_present){ + + if( supp_present ){ + + p <- ic_plot + + facet_wrap_paginate( + condition ~ target, ncol = 2, nrow = 1, page = i, scales = "free" + ) + + }else{ + + p <- ic_plot + + facet_wrap_paginate( + ~ target, ncol = 2, nrow = 1, page = i, scales 
= "free" + ) + + } + + + file_pdf <- sprintf("incorp_dist-%s.pdf", i) + file_png <- sprintf("incorp_dist-%s.png", i) + + ggsave( + filename = file_pdf, + plot = p, + device = "pdf", + path = fp, + width = figure_width, + height = 3, + units = "in" + ) + + ggsave( + filename = file_png, + plot = p, + device = "png", + path = fp, + width = figure_width, + height = 3, + units = "in", + dpi = knitr::opts_chunk$get("dpi") + ) + + }, + ic_plot = incorp_plot, + fp = figure_path, + supp_present = nrow(eval_data$spec_info$supp_data) > 0 +) + +if( args$format == "pdf"){ + + knitr::include_graphics( + path = sprintf( + file.path(figure_path, "incorp_dist-%s.pdf"), + unique(ceiling(seq_len(incorp_len) / 2)) + ) + ) + +}else{ + + knitr::include_graphics( + path = sprintf( + file.path(figure_path, "incorp_dist-%s.png"), + unique(ceiling(seq_len(incorp_len) / 2)) + ) + ) + +} + +``` + ```{r} cat(fig_caps[3]) ``` @@ -830,20 +908,51 @@ if( args$format == "pdf" ){ ## Off-target site tables +Below is a complete list of predicted Off-target sites identified for each condition and target. The additional column `Edit Site` (compared to Figure 3) indicates the predicted genomic editing location, and is broken into three parts delimited by ":". The first part indicates the chromosome, the second the orientation with respect to genomic numbering (sense "+" or anti-sense "-"), and the third is the location of the targeted base pair. 
+ ```{r off_target_tbls} +full_target_seqs <- structure( + sapply(seq_len(nrow(target_tbl)), function(i){ + + nuc <- target_tbl$Nuclease[i] + sequence <- target_tbl$Sequence[i] + + ifelse( + nuc_profiles[[nuc]]$PAM_Loc == "3p", + paste0(sequence, nuc_profiles[[nuc]]$PAM), + ifelse( + nuc_profiles[[nuc]]$PAM_Loc == "5p", + paste0(nuc_profiles[[nuc]]$PAM, sequence), + sequence + ) + ) + + }), + names = target_tbl$`Target Name` +) + +last_tbl_num <- ifelse(nrow(target_tbl) > 1, 7, 6) + for( i in seq_along(ft_seqs_list) ){ cap <- paste0("Off-Target Loci: ", names(ft_seqs_list)[i]) - if( report_format == "html" ) cap <- paste0("Table ", i+5, ". ", cap) - + if( report_format == "html" ){ + cap <- paste0("Table ", i + last_tbl_num, ". ", cap) + } + + target_ref_seq <- full_target_seqs[unique(ft_seqs_list[[i]]$target.seq)] + df <- dplyr::select( ft_seqs_list[[i]], target, gene_id, edit.site, aligns, MESL, aligned.sequence, mismatch ) %>% dplyr::mutate( - aligned.sequence = divSeq(aligned.sequence, aligned.sequence[1]), + aligned.sequence = divSeq( + seqs = aligned.sequence, + ref = target_ref_seq + ), MESL = round(MESL, digits = 1) ) @@ -856,7 +965,9 @@ for( i in seq_along(ft_seqs_list) ){ "Mismatch" = mismatch )%>% write.csv( - file = file.path(tables_path, paste0("tbl", i+5, ".offtargets.csv")), + file = file.path( + tables_path, paste0("tbl", i + last_tbl_num, ".offtargets.csv") + ), quote = TRUE, row.names = FALSE ) diff --git a/tools/rscripts/report_templates/target_specific_eff_template.Rmd b/tools/rscripts/report_templates/target_specific_eff_template.Rmd new file mode 100644 index 00000000..3a84764c --- /dev/null +++ b/tools/rscripts/report_templates/target_specific_eff_template.Rmd @@ -0,0 +1,46 @@ +## Target-specific On-target editing efficiency + +The table below displays the estimated On-target editing efficiency for each target across the specimens tested using all target **Matched** observations. 
+ +```{r on_target_efficiency} +names(ot_eff_summary)[c(1,2)] <- c("Specimen", "Condition") + +ot_eff_data <- ot_eff_summary %>% + dplyr::select(-Specimen, -Condition) %>% + dplyr::mutate_all( + pNums, + na.replace = ifelse(report_format == "latex", "-", "\\-"), + digits = 3 + ) + +ot_eff_print <- dplyr::bind_cols(ot_eff_summary[,c(1,2)], ot_eff_data) + +if( args$tables ){ + + ot_eff_save <- ot_eff_summary + + write.csv( + x = ot_eff_save, + file = file.path(tables_path, "tbl5.ontarget.efficiency.csv"), + quote = TRUE, row.names = FALSE + ) + +} + +tbl_caption <- "Estimate of On-target editing efficiency (percent) for each target by specimen." + +if( args$format == "html" ){ + tbl_caption <- paste0("Table 5. ", tbl_caption) +} + +kable( + x = ot_eff_print, format = report_format, row.names = FALSE, + booktabs = TRUE, longtable = TRUE, escape = TRUE, caption = tbl_caption, + align = c("l", "l", rep("r", ncol(ot_eff_print)-2)) +) %>% + kableExtra::kable_styling( + bootstrap_options = bootstrap_options, + latex_options = latex_options + ) + +``` diff --git a/tools/rscripts/supporting_scripts/processBLATData.R b/tools/rscripts/supporting_scripts/processBLATData.R index 645e3bf2..d215aba5 100644 --- a/tools/rscripts/supporting_scripts/processBLATData.R +++ b/tools/rscripts/supporting_scripts/processBLATData.R @@ -17,12 +17,12 @@ processBLATData <- function(algns, from, ref.genome){ algns$qtStart <- ifelse( algns$strand == "+", ( algns$tStart - (algns$qStart) ), - ( algns$tStart - (algns$qSize - algns$qEnd - 1) ) + ( algns$tStart - (algns$qSize - algns$qEnd) ) ) algns$qtEnd <- ifelse( algns$strand == "+", - ( algns$tEnd + (algns$qSize - algns$qEnd - 1) ), + ( algns$tEnd + (algns$qSize - algns$qEnd) ), ( algns$tEnd + (algns$qStart) ) )