diff --git a/Dockerfile b/Dockerfile index f5166e3..c0ec625 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,10 +1,3 @@ -################################################################################ -##################### Add Container Labels ##################################### -LABEL "Regtools_License"="MIT" -LABEL "Description"="Software package which integrate DNA-seq and RNA-seq data\ - to help interpret mutations in a regulatory and splicing\ - context." - ################################################################################ ##################### Set Inital Image to work from ############################ @@ -34,6 +27,13 @@ RUN apt-get update -y && apt-get install -y \ cmake \ python3 +################################################################################ +##################### Add Container Labels ##################################### +LABEL "Regtools_License"="MIT" +LABEL "Description"="Software package which integrate DNA-seq and RNA-seq data\ + to help interpret mutations in a regulatory and splicing\ + context." + ################################################################################ ####################### Install R ############################################## @@ -55,18 +55,21 @@ RUN R --vanilla -e 'install.packages(c("data.table", "plyr", "tidyverse"), repos ##################### Install Regtools ######################################### # clone git repository -RUN git clone https://github.com/griffithlab/regtools.git +RUN cd / && git clone https://github.com/griffithlab/regtools.git # make a build directory for regtools WORKDIR /regtools/ -RUN mkdir build + # compile from source -RUN cd /regtools/build && cmake .. -RUN cd /regtools/build && make +RUN mkdir build && cd build && cmake .. && make ################################################################################ ###################### set environment path ################################# +# make a build directory for regtools +WORKDIR /regtools/scripts/ + # add regtools executable to path -ENV PATH="/regtools/build:${PATH}" +ENV PATH="/regtools/build:/usr/local/bin/R-${r_version}:${PATH}" + diff --git a/docs/workflow.md b/docs/workflow.md index 7cca4d5..abb23c7 100644 --- a/docs/workflow.md +++ b/docs/workflow.md @@ -2,9 +2,10 @@ This is an example workflow for running RegTools on a cohort of samples. This analysis requires that there be a vcf and RNA bam/cram file for each samples. The outline described below was used to run our own analysis on TCGA data. -By the end of the analysis, the directory structure should look like so: +By the end of the analysis, the directory structure should look like the example below. The `*` in the example below refers to the tag/parameter used to run `regtools cis-splice-effects identify` with. -- Project/ (SCLC/) +```bash +- Project/ - all_splicing_variants*.bed - paths.tsv - make_vcfs.sh @@ -23,64 +24,82 @@ By the end of the analysis, the directory structure should look like so: - cse_identify_filtered_* - cse_identify_filtered_compare_* - variants*.bed + - Sample_2/ + - tumor_rna_alignments.bam + - tumor_rna_alignments.bam.bai + - variants.per_gene.vep.vcf.gz + - variants.per_gene.vep.vcf.gz.tbi + - variants.ensembl + - logs/ + - output/ + - cse_identify_filtered_* + - cse_identify_filtered_compare_* + - variants*.bed - compare_junctions/ - hist/ - junction_pvalues_*.tsv +``` + +### Set tag and parameter shell arguments + +```bash +tag= +param= +# (e.g. tag=default param=""; tag=E param="-E"; tag=i20e5 param="-i 20 -e 5") +``` -## Set tag and parameter shell arguments +### Run `regtools cis-splice-effects identify` with desired options for selecting variant and window size -tag= param= (e.g. tag=default param=""; tag=E param="-E"; tag=i20e5 param="-i 20 -e 5") +```bash +for i in samples/*/; do regtools cis-splice-effects identify $param -o ${i}/output/cse_identify_filtered_$tag.tsv -j ${i}/output/cse_identify_filtered_$tag.bed -v ${i}/output/cse_identify_filtered_$tag.vcf ${i}/variants.per_gene.vep.vcf.gz ${i}/tumor_rna_alignments.bam /reference.fa reference.gtf; done +``` -# run regtools cse identify with desired options for selecting variant and window size -for i in samples/*/; do bsub -oo $i/logs/regtools_actual_$tag.lsf regtools cis-splice-effects identify $param -o ${i}/output/cse_identify_filtered_$tag.tsv -j ${i}/output/cse_identify_filtered_$tag.bed -v ${i}/output/cse_identify_filtered_$tag.vcf ${i}/variants.per_gene.vep.vcf.gz ${i}/tumor_rna_alignments.bam /gscmnt/gc2602/griffithlab/regtools/yafeng/GRCh37.fa /gscmnt/gc2602/griffithlab/regtools/GRCh37.87.exons.sorted.gtf; done +### Make `variant.bed` for each sample -# make variant.bed -for i in samples/*/; do bsub -oo $i/logs/make_variant_bed_$tag.lsf bash /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/variants.sh ${i}/output/cse_identify_filtered_$tag.tsv ${i}/output/variants_$tag.bed; done +```bash +for i in samples/*/; do bash variants.sh ${i}/output/cse_identify_filtered_$tag.tsv ${i}/output/variants_$tag.bed; done +``` -# make bed (really just tsv with columns: chrom start end samples) with all variants that were deemed significant to splicing across all samples +### Combine each sample's `variant.bed` file per tag to get all variants that were deemed significant to splicing across all samples for a given tag + +```bash echo -e 'chrom\tstart\tend\tsamples' > all_splicing_variants_$tag.bed for i in samples/*/; do j=${i##samples/}; uniq ${i}output/variants_$tag.bed | awk -v var=${j%%/} '{print $0 "\t" var}' >> all_splicing_variants_$tag.bed; done +``` + +### Make vcf of all variants across all samples (from each sample's variants.vcf). Then, compress it and index it + +```bash +vcf-concat samples/*/variants.vcf.gz | vcf-sort > all_variants_sorted.vcf -# make vcf of all variants across all samples (from variants.vcf) -vcf-concat samples/*/variants.per_gene.vep.vcf.gz | vcf-sort > all_variants_sorted.vcf -bgzip all_variants_sorted.vcf -tabix all_variants_sorted.vcf.gz +###### optional ###### +bgzip all_variants_sorted.vcf -# run cis-splice effects identify on all samples with all variants (with $tag options as example) -for i in samples/*/; do bsub -oo $i/logs/regtools_compare_$tag.lsf regtools cis-splice-effects identify $param -o ${i}/output/cse_identify_filtered_compare_$tag.tsv -j ${i}/output/cse_identify_filtered_compare_$tag.bed -v ${i}/output/cse_identify_filtered_compare_$tag.vcf variants_all_sorted.vcf.gz ${i}/tumor_rna_alignments.bam /gscmnt/gc2602/griffithlab/regtools/yafeng/GRCh37.fa /gscmnt/gc2602/griffithlab/regtools/GRCh37.87.exons.sorted.gtf; done +tabix all_variants_sorted.vcf.gz +``` -<===== JUNCTION COMPARISON ANALYSIS =====> +### Run `regtools cis-splice effects identify` on all samples with all variants (with `$tag` options as example) -# make directories -mkdir compare_junctions/ -mkdir compare_junctions/outlier -mkdir compare_junctions/ratio +```bash +for i in samples/*/; do bsub -oo $i/logs/regtools_compare_$tag.lsf regtools cis-splice-effects identify $param -o ${i}/output/cse_identify_filtered_compare_$tag.tsv -j ${i}/output/cse_identify_filtered_compare_$tag.bed -v ${i}/output/cse_identify_filtered_compare_$tag.vcf all_variants_sorted.vcf.gz ${i}/tumor_rna_alignments.bam reference.fa reference.gtf; done +``` -# outlier analysis -bsub -M 650000000 -R 'select[mem>65000] span[hosts=1] rusage[mem=65000]' /gscuser/zskidmor/R-3.3.0/bin/Rscript --vanilla /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/compare_junctions_outlier.R $tag +## Beginning of compare junctions analysis -# ratio analysis -bsub -M 650000000 -R 'select[mem>65000] span[hosts=1] rusage[mem=65000]' /gscuser/zskidmor/R-3.3.0/bin/Rscript --vanilla /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/compare_junctions_ratio.R $tag +### Make directory to store comparison results -# vep comparison (outlier) -bsub /gscuser/zskidmor/R-3.3.0/bin/Rscript --vanilla /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/vep_compare.R outlier $tag +```bash +mkdir -p compare_junctions/hist +``` -# vep comparison (ratio) -bsub /gscuser/zskidmor/R-3.3.0/bin/Rscript --vanilla /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/vep_compare.R ratio $tag +### Run `compare_junctions_hist.R` on sample data -NOTE: in the vep comparison, since regtools doesn't really have a concept of "variants" per se but rather "variant positions" (doesn't care about the actual alleles), we are really counting (positions of variants found splicing-significant by vep AND found significant by regtools) / (positions of variants found significant by found significant by regtools) +```bash +Rscript --vanilla compare_junctions_hist.R +``` -To count: +### Run `filter_and_BH.R` to adjust p values and filter results -echo -e "anchor\tvep_significant_vars\ttotal_significant_vars\tpercent_vep_significant" > comparison_counts.tsv -for i in default i20e5 i50e5 E I; do - echo -n $i >> comparison_counts.tsv; - echo -en "\t" >> comparison_counts.tsv; - vep=$(cut -f 1,2,7 vep_comparison_$i.tsv | sort | uniq | grep "TRUE" | wc -l) - echo -n $vep >> comparison_counts.tsv; - echo -en "\t" >> comparison_counts.tsv; - total=$(($(cut -f 1,2,7 vep_comparison_$i.tsv | sort | uniq | wc -l)-1)) - echo -n $total >> comparison_counts.tsv; - echo -en "\t" >> comparison_counts.tsv; - echo $(bc -l <<< "$vep/$total") >> comparison_counts.tsv; -done +```bash +Rscript --vanilla filter_and_BH.R +``` diff --git a/scripts/compare_junctions_hist_v2.R b/scripts/compare_junctions_hist_v2.R index 0f9342a..4a2e27b 100644 --- a/scripts/compare_junctions_hist_v2.R +++ b/scripts/compare_junctions_hist_v2.R @@ -9,21 +9,21 @@ library(tidyverse) debug = F -system.time({ -if (debug){ - tag = paste("_", "default", sep="") -} else { - # get options tag - args = commandArgs(trailingOnly = TRUE) - tag = args[1] - input_file = args[2] - if ( substr(tag, 2, 3) == "--"){ - stop("Please specify an option tag (e.g. \"default\", \"i20e5\")") - } -} +# system.time({ +# if (debug){ +# tag = paste("_", "default", sep="") +# } else { +# # get options tag +# args = commandArgs(trailingOnly = TRUE) +# tag = args[1] +# input_file = args[2] +# if ( substr(tag, 2, 3) == "--"){ +# stop("Please specify an option tag (e.g. \"default\", \"i20e5\")") +# } +# } -# tag = 'I' -# input_file = '~/Desktop/CHOL/all_splicing_variants_I.bed' +tag = 'E' +input_file = 'all_splicing_variants_E.bed' # All splicing relevant variants (union of rows from variants.bed files; add column with comma-separated list of sample names) all_splicing_variants = unique(data.table::fread(input_file), sep = '\t', header = T, stringsAsFactors = FALSE) @@ -150,21 +150,14 @@ regtools_data = subset(regtools_data, select=columns_to_keep) # zeroes need to be added in for some samples -a <- function(x, y, z){ +a <- function(x, y){ toAdd <- y - length(x) - 1 # browser() toAdd <- rep(0.0000000, toAdd) x <- c(x, toAdd) return(x) } -x <- mapply(a, regtools_data$norm_scores_non, length(all_samples), regtools_data$samples) - - -# if (typeof(x) == 'list') { -# x <- matrix(pad(unlist(x), ncols),nrow = rows, byrow = TRUE, ncol = cols) -# x <- t(x) -# } -# browser() +x <- mapply(a, regtools_data$norm_scores_non, length(all_samples)) get_num_zeros_to_rm <- function(z){ num_zeroes_to_rm = str_count(z, ',') @@ -188,6 +181,47 @@ if (max(num_zeroes_to_rm > 0)) { x <- mapply(rm_zeroes, regtools_data$norm_scores_non, regtools_data$zeroes_to_rm) regtools_data$norm_scores_non = x } + +get_mean <- function(x){ + x <- mean(as.numeric(x)) + return(x) +} + +x <- mapply(get_mean, regtools_data$norm_scores_non) +regtools_data$mean_norm_score_non <- x + +get_sd <- function(x){ + x <- sd(as.numeric(x)) + return(x) +} + +x <- mapply(get_sd, regtools_data$norm_scores_non) +regtools_data$sd_norm_score_non <- x + +a <- function(x, y){ + # if(y == "TCGA-ZH-A8Y2-01A,TCGA-ZH-A8Y5-01A"){ + # browser() + # } + toAdd <- (str_count(y, ',') + 1) - (str_count(x, ',') + 1) + # browser() + if (toAdd > 0) { + toAdd <- rep(0.0000000, toAdd) + x <- c(x, toAdd) + } else { + x <- unlist(strsplit(x, ",")) + } + x <- list(x) + return(x) +} +x <- mapply(a, regtools_data$norm_scores_variant, regtools_data$samples) +regtools_data$norm_scores_variant = x + +x <- mapply(get_mean, regtools_data$norm_scores_variant) +regtools_data$mean_norm_score_variant <- x + +x <- mapply(get_sd, regtools_data$norm_scores_variant) +regtools_data$sd_norm_score_variant <- x + print("test7") ################ calculate p-values ############################################ @@ -207,9 +241,6 @@ a <- function(x){ breaks = seq(0.5, max(non_variant_norm_scores_ranked)+1.5, by=1), plot=F) mids = histinfo$mids cd = cumsum(histinfo$density) - # if(x$info == "chr1_729955_735423_D_chr1:809966-809967"){ - # browser() - # } underestimate = max(which(mids <= variant_norm_score_ranked)) pvalue = 1-cd[underestimate] return(pvalue) @@ -243,5 +274,5 @@ regtools_data = regtools_data %>% distinct() write.table(regtools_data, file=paste(input_file, "_out.tsv", sep=""), quote=FALSE, sep='\t', row.names = F) - -}) +# +# }) diff --git a/scripts/stats_wrapper.py b/scripts/stats_wrapper.py index d4c740b..cd7529c 100644 --- a/scripts/stats_wrapper.py +++ b/scripts/stats_wrapper.py @@ -36,7 +36,7 @@ files.sort() number_of_in_files = len(files) for file in files: - subprocess.run(f'Rscript --vanilla /home/ec2-user/workspace/regtools/scripts/compare_junctions_hist_v2.R {tag} {file}', shell=True, check=True) + subprocess.run(f'Rscript --vanilla compare_junctions_hist_v2.R {tag} {file}', shell=True, check=True) output_files = glob.glob("*_out.tsv") output_files.sort()# glob lacks reliable ordering, so impose your own if output order matters number_of_out_files = len(output_files)