diff --git a/README.md b/README.md index baff5e3..498d8c2 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,22 @@ -
-

ScanNeo2 (working title)

+-
+

ScanNeo2

+ Workflow status badge
- ## What is ScanNeo2 -`Scanneo2` is a snakemake workflow for the prediction of neoantigens from -multiple sources. In its current state, this includes +`Scanneo2` is a snakemake workflow for the prediction of neoantigens from multiple sources. In its current state, +this includes canonical-splicing, exitron-splicing, gene fusion, indels and snvs. ## Getting Started -In principle, Scanneo2 aims to - +In principle, Scanneo2 aims to resolve its dependencies automatically and requires only snakemake and snakedeploy. ## Quickstart Install `snakemake` and `snakedeploy` ``` -mamba env create --file https:// +mamba env create --file https://github.com/ylab-hi/ScanNeo2/blob/devel/environment.yml mamba activate scanneo2 ``` Deploy Scanneo2 @@ -26,17 +25,19 @@ mkdir -p /path/to/working/directory/ cd /path/to/working/directory/ snakedeploy deploy-workflow https://github.com/ylab-hi/scanneo2 . --tag v0.1.0 ``` +Configure ScanNeo2 by modifying `config/config.yaml` + Run the workflow ``` +cd scanneo snakemake --cores all --use-conda ``` - -Please consult the wiki for detailed instruction and explanations. - +Please consult the [wiki](https://github.com/ylab-hi/ScanNeo2/wiki) for detailed instructions and explanations. ### Docker -We also provide a ready to use Docker Container that can be used +We also provide a ready-to-use [Docker Container](https://hub.docker.com/r/yanglabinfo/scanneo2) +that can be used to use Scanneo2. 
diff --git a/config/config.yaml b/config/config.yaml index 65e1682..da9d3ad 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,11 +1,30 @@ -refgen: testdata/GRCh38.p13.genome.fa -annotation: testdata/GRCh38_chr1.gtf +data: + name: patient2 + dnaseq: + rep1: TESLA_testdata/patient2/WES/TESLA_9_1.fastq.gz TESLA_testdata/patient2/WES/TESLA_9_2.fastq.gz + rep2: TESLA_testdata/patient2/WES/TESLA_10_1.fastq.gz TESLA_testdata/patient2/WES/TESLA_10_2.fastq.gz + rnaseq: + rep1: TESLA_testdata/patient2/RNA/TESLA_11_1.fastq.gz TESLA_testdata/patient2/RNA/TESLA_11_2.fastq.gz + + + +sample: patient2 # naming of the sample (results/sample) dnaseq: rnaseq: LNCAP_bam/G28033.LNCaP_clone_FGC.1.bam -preproc: true +### pre-processing (only applied on fastq reads) +preproc: + activate: true # whether (=true) or not (=false) to include pre-processing + minlen: 10 # discard reads shorter than this many bases + leading: 3 # remove leading low quality or N bases + trailing: 3 # remove trailing low quality or N bases + slidingwindow: + activate: true # whether (=true) or not (=false) to apply sliding-window trimming + windowsize: 3 # number of bases to average across + quality: 20 # the average quality (Phred) required + adapters: TruSeq2-PE.fa # path to fasta file containing adapters to be trimmed # if no readgroups file are provided diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 799764a..6ebeb62 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -1,61 +1,251 @@ import os -import re import glob +from icecream import ic from pathlib import Path +def data_structure(data): + config['data']['dnaseq'], filetype, readtype = handle_seqfiles(config['data']['dnaseq']) + config['data']['dnaseq_filetype'] = filetype + config['data']['dnaseq_readtype'] = readtype + config['data']['rnaseq'], filetype, readtype = handle_seqfiles(config['data']['rnaseq']) + config['data']['rnaseq_filetype'] = filetype + config['data']['rnaseq_readtype'] = readtype + return 
config['data'] + + +def handle_seqfiles(seqdata): + readtype = [] + filetype = [] + + # iterate over replicates + for rpl in seqdata.keys(): + files = [Path(file) for file in seqdata[rpl].split(' ')] + ic(files) + if len(files) == 1: # SE + f1_ext = get_file_extension(files[0]) + if f1_ext in ['.fq', '.fastq', '.bam']: + seqdata[rpl] = files[0] + filetype.append(f1_ext) + readtype.append('SE') + else: + print('{} is not a valid file'.format(files[0])) + elif len(files) == 2: # PE + f1_ext = get_file_extension(files[0]) + f2_ext = get_file_extension(files[1]) + # check if file extensions are the same + if f1_ext == f2_ext: + if(valid_paired_end(files[0], files[1])): + seqdata[rpl] = files + filetype.append(f1_ext) + readtype.append('PE') + else: + print('files not in valid PE format') + else: + print('files do not have the same extension') + + # check if filetype and readtype are the same + if all_identical(filetype) and all_identical(readtype): + return seqdata, filetype[0], readtype[0] + else: + print('filetypes are not the same') + return seqdata, None, None + +# pre-processing for RNAseq data +def get_raw_reads(wildcards): + if config['data']['rnaseq_readtype'] == 'SE': + return config['data']['rnaseq'][wildcards.sample] + else: # PE + return dict( + zip( + ["r1", "r2"], + [config['data'][wildcards.seqtype][wildcards.replicate][0], + config['data'][wildcards.seqtype][wildcards.replicate][1]], + ) + ) + +# returns the reads (raw/preprocessed) for a given sample +def get_reads(wildcards): + if config['preproc']['activate']: + if config['data'][wildcards.seqtype+'_readtype'] == 'SE': + return config['data'][wildcards.seqtype][wildcards.replicate] + elif config['data'][wildcards.seqtype+'_readtype'] == 'PE': + print("yes") + return {"r1": "results/{sample}/{seqtype}/reads/{replicate}_preproc_r1.fq.gz", + "r2": "results/{sample}/{seqtype}/reads/{replicate}_preproc_r2.fq.gz"} + + +# determines the file extension for a given file - excludes .gz +def 
get_file_extension(path): + filename = path.name + extpat = r'\.(fastq|fq|bam)(\.gz)?$' + res = re.search(extpat, filename) + file_ext = '' + if res is not None: + if res.group(0).endswith('.gz'): + file_ext = filename[res.start():-3] + else: + file_ext = filename[res.start():] + return file_ext + # check if files are a valid paired-end pair -def valid_paired_end(file1, file2): - # check if both files are in FASTA format - if Path(file1).suffix not in ['.fastq', '.fq'] and Path(file2).suffix not in ['.fastq', '.fq']: - return False - - # check if first file contains _R1 or _fwd - if not ("_R1" in file1 or "_fwd" in file1): - return False - # check if second file contains _R2 or _rev - if not ("_R2" in file2 or "_rev" in file2): - return False - - # check if the substrings until occurrence of either _R1/2 or _fwd/rev are equal - file1_indicator = "_R1" if "_R1" in file1 else "_fwd" - file2_indicator = "_R2" if "_R2" in file2 else "_rev" - file1_idx = file1.find(file1_indicator) - file2_idx = file2.find(file2_indicator) - - if file1_idx != file2_idx: - return False - - if file1[:file1_idx] != file2[:file2_idx]: - return False +def valid_paired_end(path1, path2): + valid = False + + # only consider filename + file1 = path1.name + file2 = path2.name - # check if first file contains _R1 and the secon file contaif "_R1" in string1 and "_R2" not in string2: - return False - - # check if first file contains _fwd and the secon file contains _rev - if "_fwd" in string1 and "_rev" not in string2: - return False + # check if first file contains _R1, _1 or _fwd + pattern = r'\_(R1|R2|1|2|fwd|rev)\.(fastq|fq){1}(\.gz)?$' + f1_se = re.search(pattern, file1) + f2_se = re.search(pattern, file2) + + # patterns needs to be found in both files + if f1_se is not None and f2_se is not None: + if file1[:f1_se.start()] == file2[:f2_se.start()]: + valid = True + else: + print('{} and {} have different filestem '.format(file1, file2)) + else: + print('{} and {} are not valid PE 
files'.format(file1, file2)) + + return valid +# check if files in list are identical +def all_identical(l): + if l.count(l[0]) == len(l): return True + else: + return False -rnaseq_input = {} -rnaseq_filetype = None -rnaseq_files = config['rnaseq'].split(' ') -if len(rnaseq_files) == 1: - if Path(rnaseq_files[0]).suffix in ['.fq', '.fastq', '.bam']: - rnaseq_filetype = Path(rnaseq_files[0]).suffix # store file extension - rnaseq_input[Path(rnaseq_files[0]).stem] = rnaseq_files[0] - else: - print("no rnaseq files found") -elif len(rnaseq_files) == 2: - # check if both files are in valid paired-end format - if valid_paired_end(rnaseq_files[0], rnaseq_files[1]): - rnaseq_filetype = Path(rnaseq_files[0]).suffix # store file extension - rnaseq_input.append((rnaseq_files[0], rnaseq_files[1])) - else: - print("no valid paired-end files found") +config['data'] = data_structure(config['data']) +ic(config['data']) + + +rnaseq_filetype = ".bam" + +def get_splitfastq_input(wildcards): + if config['preproc']['activate']: + if config['data'][wildcards.seqtype+'_readtype'] == 'SE': + return expand("results/{sample}/{seqtype}/preproc/reads.fq.gz", **wildcards) + elif config['data'][wildcards.seqtype+'_readtype'] == 'PE': # PE + return expand("results/{sample}/reads/{seqtype}/{replicate}_preproc_{readtype}.fq.gz", + readtype=["r1", "r2"], + seqtype=wildcards.seqtype, + replicate = wildcards.replicate, + sample = wildcards.sample) + + else: # no pre-processing has been performed + return rnaseq_input[wildcards.sample] + + +def get_splitfastq_input_PE(wildcards): + if config['preproc']['activate']: + return expand("results/{sample}/rnaseq/reads/inputreads_{readtype}.fq.gz", + readtype=["r1", "r2"], + **wildcards + ) + else: # no pre-processing + return rnaseq_input[wildcards.sample] + + + +# input for alignment w/ DNAseq data +def get_align_input_dnaseq(wildcards): + if config['preproc']['activate']: + + + + + if config['data']['dnaseq_readtype'] == 'SE': + return 
expand("results/{sample}/dnaseq/reads/inputreads.fq.gz", **wildcards) + else: # PE + return dict( + zip( + ["fq1", "fq2"], + expand("results/{sample}/dnaseq/reads/inputreads_{readtype}.fq.gz", + readtype=["r1", "r2"], + **wildcards + ) + ) + ) + else: # no pre-processing + if config['data']['dnaseq_readtype'] == 'SE': + return dnaseq_input[wildcards.sample] + else: # PE + return dict( + zip( + ["fq1", "fq2"], + dnaseq_input[wildcards.sample] + ) + ) + + + +def get_align_input(wildcards): + if config['preproc']['activate']: + if rnaseq_readtype == "se": + return expand("results/{sample}/rnaseq/reads/inputreads.fq.gz", **wildcards) + else: # pe + ic("in here") + return dict( + zip( + ["fq1", "fq2"], + expand("results/{sample}/rnaseq/reads/inputreads_{readtype}.fq.gz", + readtype=["r1", "r2"], + **wildcards + ) + ) + ) + else: # no pre-processing + if rnaseq_readtype == "se": + return rnaseq_input[wildcards.sample] + else: # pe + return dict( + zip( + ["fq1", "fq2"], + rnaseq_input[wildcards.sample] + ) + ) + +# determine the bamfiles that contains readgroups +#def get_bams_readsgroups(wildcards): + +# fq1 = "results/{sample}/rnaseq/reads/{replicate}/r1/reads_{i}.fq.gz", +# aln = "results/{sample}/rnaseq/align/{replicate}/reads_{i}.bam", + + +def get_readgroups_input(wildcards): + # return only bam from STAR align + if config['data']['rnaseq_filetype'] in ['.fq','.fastq']: + return ["results/{sample}/rnaseq/align/{replicate}_ready.bam".format(**wildcards)] + elif config['data']['rnaseq_filetype'] in ['.bam']: + val = [] + val.append(config['data']['rnaseq'][wildcards.replicate][wildcards.sample]) + val.extend(expand("results/{sample}/rnaseq/align/{replicate}/ready.bam", + sample=wildcards.sample, + replicate=wildcards.replicate)) + return val + + +def aggregate_alignments_fastq(wildcards): + # make sure that all samples are processed in checkpoint - split fastq file + checkpoint_output = checkpoints.splitfastq.get(**wildcards).output[0] + return 
expand("results/{sample}/rnaseq/align/{replicate}/splt/reads_{i}.bam", + sample=wildcards.sample, + replicate=wildcards.replicate, + i=glob_wildcards(os.path.join(checkpoint_output, "r1/reads_{i}.fq.gz")).i) + +def aggregate_alignments_pe(wildcards): + # make sure that all samples are processed in checkpoint - split fastq file + checkpoint_output = checkpoints.splitfastq_pe.get(**wildcards).output[0] + return expand("results/{sample}/rnaseq/align/bamfiles/inputreads_{i}.bam", + sample=wildcards.sample, + i=glob_wildcards(os.path.join(checkpoint_output, "inputreads_{i}.fq.gz")).i) + # aggregate results from STAR alignment def aggregate_alignments(wildcards): # make sure that all samples are processed in checkpoint - split fastq file @@ -65,19 +255,16 @@ def aggregate_alignments(wildcards): readgroup=glob_wildcards(os.path.join(checkpoint_output, "{readgroup}.bam")).readgroup) -def aggregate_bwa_alignments(wildcards): - split_bamfiles_output = checkpoints.bamfile_split.get(**wildcards).output[0] - return expand("results/{sample}/rnaseq/align/bamfiles/{file}.bam", - sample=wildcards.sample, - file=glob_wildcards(os.path.join(split_bamfiles_output, "{file}.bam")).file) - - +# getting input starting from align def get_rnaseq_data(wildcards): + print(rnaseq_input) if rnaseq_filetype == ".bam": - print(wildcards) return rnaseq_input[wildcards.sample] elif rnaseq_filetype == ".fastq" or rnaseq_filetype == ".fq": - return rnaseq_input[wildcards.sample] + if config['preproc']['activate']: # preproc activated? + if len(rnaseq_input[wildcards.sample]) == 2: # PE? 
+ return expand("results/{sample}/preproc/trimmed/trm.fq.gz", + **wildcards) else: print("no rnaseq data found") diff --git a/workflow/rules/germline.smk b/workflow/rules/germline.smk index 8ad79e9..01b4129 100644 --- a/workflow/rules/germline.smk +++ b/workflow/rules/germline.smk @@ -1,25 +1,6 @@ -rule picard_create_dict: - input: - "resources/refs/genome.fasta" - output: - "resources/refs/genome.dict" - log: - "logs/picard/create_dict.log" - params: - extra="", # optional: extra arguments for picard. - # optional specification of memory usage of the JVM that snakemake will respect with global - # resource restrictions (https://snakemake.readthedocs.io/en/latest/snakefiles/rules.html#resources) - # and which can be used to request RAM during cluster job submission as `{resources.mem_mb}`: - # https://snakemake.readthedocs.io/en/latest/executing/cluster.html#job-properties - resources: - mem_mb=1024, - wrapper: - "v1.31.1/bio/picard/createsequencedictionary" - - # download training sets for calling high confidence variants # see https://gatk.broadinstitute.org/hc/en-us/articles/4402736812443-Which-training-sets-arguments-should-I-use-for-running-VQSR- -rule gatk_vqsr_training_sets: +rule get_gatk_vqsr_training_sets: output: snp_hapmap="resources/vqsr/hapmap_3.3.hg38.vcf.gz", snp_hapmap_idx="resources/vqsr/hapmap_3.3.hg38.vcf.gz.tbi", @@ -31,7 +12,8 @@ rule gatk_vqsr_training_sets: snp_dbSNP_idx="resources/vqsr/dbSNP_b150.vcf.gz.tbi", indel_mills="resources/vqsr/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", indel_mills_idx="resources/vqsr/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi" - + message: + "Downloading training sets for calling high confidence variants" shell: """ curl -L https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/hapmap_3.3.hg38.vcf.gz -o resources/vqsr/hapmap_3.3.hg38.vcf.gz @@ -47,16 +29,18 @@ rule gatk_vqsr_training_sets: """ # do a first round of variant calling on original, unrecalibrated data -rule 
htc_first: +rule detect_indels_htc_1rd: input: # single or list of bam files - bam="results/{sample}/rnaseq/align/realigned.bam", + bam="results/{sample}/rnaseq/align/{replicate}_realigned.bam", ref="resources/refs/genome.fasta", known="resources/vqsr/dbSNP_b150.vcf.gz" # optional output: - vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.vcf" + vcf="results/{sample}/variants/indel/htcaller/{replicate}_variants.1rd.vcf" + message: + "First round of variant calling on original, unrecalibrated data on sample:{wildcards.sample} with replicate:{wildcards.replicate}" log: - "logs/{sample}/gatk/haplotypecaller/1rd.log", + "logs/{sample}/gatk/haplotypecaller/{replicate}_1rd.log", params: extra="", java_opts="", @@ -68,216 +52,223 @@ rule htc_first: # recalibrate variants (SNP) -rule htc_first_snp_recal: - input: - vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.vcf", - ref="resources/refs/genome.fasta", - hapmap="resources/vqsr/hapmap_3.3.hg38.vcf.gz", - omni="resources/vqsr/1000G_omni2.5.hg38.vcf.gz", - g1k="resources/vqsr/1000G_phase1.snps.high_confidence.hg38.vcf.gz", - dbsnp="resources/vqsr/dbSNP_b150.vcf.gz", - output: - vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.recal.vcf", - idx="results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.recal.vcf.idx", - tranches="results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.all.tranches" - log: - "logs/{sample}/gatk/vqsr/recal.first.snp" +#rule recalibrate_variants_first_round: + #input: + #vcf="results/{sample}/rnaseq/indel/htcaller/{replicate}_variants.1rd.vcf", + #ref="resources/refs/genome.fasta", + #hapmap="resources/vqsr/hapmap_3.3.hg38.vcf.gz", + #omni="resources/vqsr/1000G_omni2.5.hg38.vcf.gz", + #g1k="resources/vqsr/1000G_phase1.snps.high_confidence.hg38.vcf.gz", + #dbsnp="resources/vqsr/dbSNP_b150.vcf.gz", + #output: + #vcf="results/{sample}/rnaseq/indel/htcaller/{replicate}_snv.1rd.recal.vcf", + #idx="results/{sample}/rnaseq/indel/htcaller/{replicate}_snv.1rd.recal.vcf.idx", + 
#tranches="results/{sample}/rnaseq/indel/htcaller/{replicate}_snv.1rd.tranches" + #message: + #"Recalibrate variants (SNP) on sample:{wildcards.sample} with replicate:{wildcards.replicate}" + #log: + #"logs/{sample}/gatk/vqsr/{replicate}_1rd_snv_recal.log" - params: - mode="SNP", - resources={ - "hapmap": {"known": False, "training": True, "truth": True, "prior": 15.0}, - "omni": {"known": False, "training": True, "truth": True, "prior": 12.0}, - "g1k": {"known": False, "training": True, "truth": True, "prior": 10.0}, - "dbsnp": {"known": False, "training": False, "truth": False, "prior": 2.0}, - }, - annotation=["MQ", "QD", "MQRankSum", "ReadPosRankSum", "FS", "SOR", "DP"], - extra="" - threads: config['threads'] - resources: - mem_mb=1024, - wrapper: - "v1.31.1/bio/gatk/variantrecalibrator" + #params: + #mode="SNP", + #resources={ + #"hapmap": {"known": False, "training": True, "truth": True, "prior": 15.0}, + #"omni": {"known": False, "training": True, "truth": True, "prior": 12.0}, + #"g1k": {"known": False, "training": True, "truth": True, "prior": 10.0}, + #"dbsnp": {"known": False, "training": False, "truth": False, "prior": 2.0}, + #}, + #annotation=["MQ", "QD", "MQRankSum", "ReadPosRankSum", "FS", "SOR", "DP"], + #extra="" + #threads: config['threads'] + #resources: + #mem_mb=1024, + #wrapper: + #"v1.31.1/bio/gatk/variantrecalibrator" -rule htc_first_snp_apply_vqsr: - input: - vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.vcf", - recal="results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.recal.vcf", - tranches="results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.all.tranches", - ref="resources/refs/genome.fasta", - output: - vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.filt.vcf" - log: - "logs/{sample}/gatk/vqsr/apply.1rd.snp.log" - params: - mode="SNP", # set mode, must be either SNP, INDEL or BOTH - extra="--truth-sensitivity-filter-level 99.5", # optional - resources: - mem_mb=1024, - wrapper: - 
"v1.31.1/bio/gatk/applyvqsr" +#rule apply_VQSR_SNVs_1rd: + #input: + #vcf="results/{sample}/rnaseq/indel/htcaller/{replicate}_variants.1rd.vcf", + #recal="results/{sample}/rnaseq/indel/htcaller/{replicate}_snv.1rd.recal.vcf", + #tranches="results/{sample}/rnaseq/indel/htcaller/{replicate}_snv.1rd.tranches", + #ref="resources/refs/genome.fasta", + #output: + #vcf="results/{sample}/rnaseq/indel/htcaller/{replicate}_snv.1rd.flt.vcf" + #message: + #"Apply VQSR (SNP) on sample:{wildcards.sample} with replicate:{wildcards.replicate}" + #log: + #"logs/{sample}/gatk/vqsr/{replicate}_apply.1rd.snp.log" + #params: + #mode="SNP", # set mode, must be either SNP, INDEL or BOTH + #extra="--truth-sensitivity-filter-level 99.5", # optional + #resources: + #mem_mb=1024, + #wrapper: + #"v1.31.1/bio/gatk/applyvqsr" -# repeat Variant Quality Score Recalibration for indels -use rule htc_first_snp_recal as htc_first_indel_recal with: - output: - vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.recal.vcf", - idx="results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.recal.vcf.idx", - tranches="results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.all.tranches" - log: - "logs/{sample}/gatk/vqsr/recal_first_snp.log" - params: - mode="INDEL", - resources={ - "hapmap": {"known": False, "training": True, "truth": True, "prior": 15.0}, - "omni": {"known": False, "training": True, "truth": True, "prior": 12.0}, - "g1k": {"known": False, "training": True, "truth": True, "prior": 10.0}, - "dbsnp": {"known": False, "training": False, "truth": False, "prior": 2.0}, - }, - annotation=["MQ", "QD", "MQRankSum", "ReadPosRankSum", "FS", "SOR", "DP"], - extra="" +### repeat Variant Quality Score Recalibration for indels +#use rule recal_VQSE_SNVs_1rd as recal_VQSE_Indels_1rd with: + #output: + #vcf="results/{sample}/rnaseq/indel/htcaller/{replicate}_indel.1rd.recal.vcf", + #idx="results/{sample}/rnaseq/indel/htcaller/{replicate}_indel.1rd.recal.vcf.idx", + 
#tranches="results/{sample}/rnaseq/indel/htcaller/{replicate}_indel.1rd.tranches" + #message: + #"Recalibrate variants (INDEL) on sample:{wildcards.sample} with replicate:{wildcards.replicate}" + #log: + #"logs/{sample}/gatk/vqsr/{replicate}_1rd_indel_recal.log" + #params: + #mode="BOTH", + #resources={ + #"hapmap": {"known": False, "training": True, "truth": True, "prior": 15.0}, + #"omni": {"known": False, "training": True, "truth": True, "prior": 12.0}, + #"g1k": {"known": False, "training": True, "truth": True, "prior": 10.0}, + #"dbsnp": {"known": False, "training": False, "truth": False, "prior": 2.0}, + #}, + #annotation=["MQ", "QD", "MQRankSum", "ReadPosRankSum", "FS", "SOR", "DP"], + #extra="" + -use rule htc_first_snp_apply_vqsr as htc_first_indel_apply_vsqr with: - input: - vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.vcf", - recal="results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.recal.vcf", - tranches="results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.all.tranches", - ref="resources/refs/genome.fasta", - output: - vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.filt.vcf" - output: - log: - "logs/{sample}/gatk/vqsr/apply.final.indel.log" - params: - mode="INDEL", - extra="--truth-sensitivity-filter-level 99.0", # optional +#use rule htc_first_snp_apply_vqsr as htc_first_indel_apply_vsqr with: + #input: + #vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.vcf", + #recal="results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.recal.vcf", + #tranches="results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.all.tranches", + #ref="resources/refs/genome.fasta", + #output: + #vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.filt.vcf" + #output: + #log: + #"logs/{sample}/gatk/vqsr/apply.final.indel.log" + #params: + #mode="INDEL", + #extra="--truth-sensitivity-filter-level 99.0", # optional -rule gatk_baserecalibrator: - input: - bam="results/{sample}/rnaseq/align/realigned.bam", - 
ref="resources/refs/genome.fasta", - dict="resources/refs/genome.dict", - known=["results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.filt.vcf", - "results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.filt.vcf"] - output: - recal_table="results/{sample}/rnaseq/indel/htcaller/variants.1rd.baserecal.grp" - log: - "logs/gatk/baserecalibrator/{sample}.log", - params: - extra="", # optional - java_opts="", # optional - resources: - mem_mb=10240, - wrapper: - "v1.31.1/bio/gatk/baserecalibrator" +#rule gatk_baserecalibrator: + #input: + #bam="results/{sample}/rnaseq/align/realigned.bam", + #ref="resources/refs/genome.fasta", + #dict="resources/refs/genome.dict", + #known=["results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.filt.vcf", + #"results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.filt.vcf"] + #output: + #recal_table="results/{sample}/rnaseq/indel/htcaller/variants.1rd.baserecal.grp" + #log: + #"logs/gatk/baserecalibrator/{sample}.log", + #params: + #extra="", # optional + #java_opts="", # optional + #resources: + #mem_mb=10240, + #wrapper: + #"v1.31.1/bio/gatk/baserecalibrator" -rule gatk_applybqsr: - input: - bam="results/{sample}/rnaseq/align/realigned.bam", - ref="resources/refs/genome.fasta", - dict="resources/refs/genome.dict", - recal_table="results/{sample}/rnaseq/indel/htcaller/variants.1rd.baserecal.grp" - output: - bam="results/{sample}/rnaseq/indel/htcaller/variants.1rd.baserecal.bam" - log: - "logs/gatk/gatk_applybqsr/{sample}.log", - params: - extra="", # optional - java_opts="", # optional - embed_ref=True, # embed the reference in cram output - resources: - mem_mb=1024, - wrapper: - "v1.31.1/bio/gatk/applybqsr" +#rule gatk_applybqsr: + #input: + #bam="results/{sample}/rnaseq/align/realigned.bam", + #ref="resources/refs/genome.fasta", + #dict="resources/refs/genome.dict", + #recal_table="results/{sample}/rnaseq/indel/htcaller/variants.1rd.baserecal.grp" + #output: + 
#bam="results/{sample}/rnaseq/indel/htcaller/variants.1rd.baserecal.bam" + #log: + #"logs/gatk/gatk_applybqsr/{sample}.log", + #params: + #extra="", # optional + #java_opts="", # optional + #embed_ref=True, # embed the reference in cram output + #resources: + #mem_mb=1024, + #wrapper: + #"v1.31.1/bio/gatk/applybqsr" -rule htcaller_main: - input: - # single or list of bam files - bam="results/{sample}/rnaseq/indel/htcaller/variants.1rd.baserecal.bam", - ref="resources/refs/genome.fasta", - known="resources/vqsr/dbSNP_b150.vcf.gz" # optional - output: - vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.vcf" - log: - "logs/gatk/haplotypecaller/{sample}.1rd.log", - params: - extra="", # optional - java_opts="", # optional - threads: config['threads'] - resources: - mem_mb=1024, - wrapper: - "v1.31.1/bio/gatk/haplotypecaller" +#rule htcaller_main: + #input: + ## single or list of bam files + #bam="results/{sample}/rnaseq/indel/htcaller/variants.1rd.baserecal.bam", + #ref="resources/refs/genome.fasta", + #known="resources/vqsr/dbSNP_b150.vcf.gz" # optional + #output: + #vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.vcf" + #log: + #"logs/gatk/haplotypecaller/{sample}.1rd.log", + #params: + #extra="", # optional + #java_opts="", # optional + #threads: config['threads'] + #resources: + #mem_mb=1024, + #wrapper: + #"v1.31.1/bio/gatk/haplotypecaller" -use rule htc_first_snp_recal as htc_final_snp_recal with: - input: - vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.vcf", - ref="resources/refs/genome.fasta", - hapmap="resources/vqsr/hapmap_3.3.hg38.vcf.gz", - omni="resources/vqsr/1000G_omni2.5.hg38.vcf.gz", - g1k="resources/vqsr/1000G_phase1.snps.high_confidence.hg38.vcf.gz", - dbsnp="resources/vqsr/dbSNP_b150.vcf.gz", - output: - vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.snp.recal.vcf", - idx="results/{sample}/rnaseq/indel/htcaller/variants.final.snp.recal.vcf.idx", - 
tranches="results/{sample}/rnaseq/indel/htcaller/variants.final.snp.all.tranches" - log: - "logs/{sample}/gatk/vqsr/recal.final.snp.log" +#use rule htc_first_snp_recal as htc_final_snp_recal with: + #input: + #vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.vcf", + #ref="resources/refs/genome.fasta", + #hapmap="resources/vqsr/hapmap_3.3.hg38.vcf.gz", + #omni="resources/vqsr/1000G_omni2.5.hg38.vcf.gz", + #g1k="resources/vqsr/1000G_phase1.snps.high_confidence.hg38.vcf.gz", + #dbsnp="resources/vqsr/dbSNP_b150.vcf.gz", + #output: + #vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.snp.recal.vcf", + #idx="results/{sample}/rnaseq/indel/htcaller/variants.final.snp.recal.vcf.idx", + #tranches="results/{sample}/rnaseq/indel/htcaller/variants.final.snp.all.tranches" + #log: + #"logs/{sample}/gatk/vqsr/recal.final.snp.log" -use rule htc_first_snp_apply_vqsr as htc_final_snp_apply_vsqr with: - input: - vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.vcf", - recal="results/{sample}/rnaseq/indel/htcaller/variants.final.snp.recal.vcf", - tranches="results/{sample}/rnaseq/indel/htcaller/variants.final.snp.all.tranches", - ref="resources/refs/genome.fasta", - output: - vcf="results/{sample}/variants/germ.snvs.vcf" - log: - "logs/{sample}/gatk/vqsr/apply.final.snp.log" +#use rule htc_first_snp_apply_vqsr as htc_final_snp_apply_vsqr with: + #input: + #vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.vcf", + #recal="results/{sample}/rnaseq/indel/htcaller/variants.final.snp.recal.vcf", + #tranches="results/{sample}/rnaseq/indel/htcaller/variants.final.snp.all.tranches", + #ref="resources/refs/genome.fasta", + #output: + #vcf="results/{sample}/variants/germ.snvs.vcf" + #log: + #"logs/{sample}/gatk/vqsr/apply.final.snp.log" -use rule htc_first_snp_recal as htc_final_indel_recal with: - input: - vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.vcf", - ref="resources/refs/genome.fasta", - hapmap="resources/vqsr/hapmap_3.3.hg38.vcf.gz", - 
omni="resources/vqsr/1000G_omni2.5.hg38.vcf.gz", - g1k="resources/vqsr/1000G_phase1.snps.high_confidence.hg38.vcf.gz", - dbsnp="resources/vqsr/dbSNP_b150.vcf.gz", - output: - vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.indel.recal.vcf", - idx="results/{sample}/rnaseq/indel/htcaller/variants.final.indel.recal.vcf.idx", - tranches="results/{sample}/rnaseq/indel/htcaller/variants.final.indel.all.tranches" - log: - "logs/{sample}/gatk/vqsr/recal.final.indel.log" - params: - mode="INDEL", - resources={ - "hapmap": {"known": False, "training": True, "truth": True, "prior": 15.0}, - "omni": {"known": False, "training": True, "truth": True, "prior": 12.0}, - "g1k": {"known": False, "training": True, "truth": True, "prior": 10.0}, - "dbsnp": {"known": False, "training": False, "truth": False, "prior": 2.0}, - }, - annotation=["MQ", "QD", "MQRankSum", "ReadPosRankSum", "FS", "SOR", "DP"], - extra="" +#use rule htc_first_snp_recal as htc_final_indel_recal with: + #input: + #vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.vcf", + #ref="resources/refs/genome.fasta", + #hapmap="resources/vqsr/hapmap_3.3.hg38.vcf.gz", + #omni="resources/vqsr/1000G_omni2.5.hg38.vcf.gz", + #g1k="resources/vqsr/1000G_phase1.snps.high_confidence.hg38.vcf.gz", + #dbsnp="resources/vqsr/dbSNP_b150.vcf.gz", + #output: + #vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.indel.recal.vcf", + #idx="results/{sample}/rnaseq/indel/htcaller/variants.final.indel.recal.vcf.idx", + #tranches="results/{sample}/rnaseq/indel/htcaller/variants.final.indel.all.tranches" + #log: + #"logs/{sample}/gatk/vqsr/recal.final.indel.log" + #params: + #mode="INDEL", + #resources={ + #"hapmap": {"known": False, "training": True, "truth": True, "prior": 15.0}, + #"omni": {"known": False, "training": True, "truth": True, "prior": 12.0}, + #"g1k": {"known": False, "training": True, "truth": True, "prior": 10.0}, + #"dbsnp": {"known": False, "training": False, "truth": False, "prior": 2.0}, + #}, + 
#annotation=["MQ", "QD", "MQRankSum", "ReadPosRankSum", "FS", "SOR", "DP"], + #extra="" -use rule htc_first_snp_apply_vqsr as htc_final_indel_apply_vsqr with: - input: - vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.vcf", - recal="results/{sample}/rnaseq/indel/htcaller/variants.final.indel.recal.vcf", - tranches="results/{sample}/rnaseq/indel/htcaller/variants.final.indel.all.tranches", - ref="resources/refs/genome.fasta", - output: - vcf="results/{sample}/variants/germ.indel.vcf" - log: - "logs/{sample}/gatk/vqsr/apply.final.indel.log" - params: - mode="INDEL", - extra="--truth-sensitivity-filter-level 99.0", # optional +#use rule htc_first_snp_apply_vqsr as htc_final_indel_apply_vsqr with: + #input: + #vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.vcf", + #recal="results/{sample}/rnaseq/indel/htcaller/variants.final.indel.recal.vcf", + #tranches="results/{sample}/rnaseq/indel/htcaller/variants.final.indel.all.tranches", + #ref="resources/refs/genome.fasta", + #output: + #vcf="results/{sample}/variants/germ.indel.vcf" + #log: + #"logs/{sample}/gatk/vqsr/apply.final.indel.log" + #params: + #mode="INDEL", + #extra="--truth-sensitivity-filter-level 99.0", # optional diff --git a/workflow/rules/hlatyping.smk b/workflow/rules/hlatyping.smk index b73b824..6de2e1c 100644 --- a/workflow/rules/hlatyping.smk +++ b/workflow/rules/hlatyping.smk @@ -1,7 +1,11 @@ rule get_hla_panel: output: - dna="misc/hla/hla_reference_dna.fasta", - rna="misc/hla/hla_reference_rna.fasta" + dna="resources/hla/hla_ref_dna.fasta", + rna="resources/hla/hla_ref_rna.fasta" + conda: + "../envs/basic.yml" + log: + "logs/hla_panel.log" shell: """ curl -o {output.dna} https://raw.githubusercontent.com/FRED-2/OptiType/v1.3.5/data/hla_reference_dna.fasta @@ -10,19 +14,27 @@ rule get_hla_panel: rule index_hla_panel: input: - dna="misc/hla/hla_reference_dna.fasta", - rna="misc/hla/hla_reference_rna.fasta", + dna="resources/hla/hla_ref_dna.fasta", + 
rna="resources/hla/hla_ref_rna.fasta" output: - "misc/hla/hla_dna.index.lf.drp", - "misc/hla/hla_rna.index.lf.drp", + dna=multiext("resources/hla/yara/idx/dna", + ".lf.drp", ".lf.drs", ".lf.drv", + ".lf.pst", ".rid.concat", ".rid.limits", + ".sa.ind", ".sa.len", ".sa.val", + ".txt.concat", ".txt.limits", ".txt.size"), + rna=multiext("resources/hla/yara/idx/rna", + ".lf.drp", ".lf.drs", ".lf.drv", + ".lf.pst", ".rid.concat", ".rid.limits", + ".sa.ind", ".sa.len", ".sa.val", + ".txt.concat", ".txt.limits", ".txt.size") log: "logs/yara_indexer.log" conda: "../envs/yara.yml" shell: """ - yara_indexer -o misc/hla/hla.index {input.dna} > {log} - yara_indexer -o misc/hla/hla.index {input.rna} >> {log} + yara_indexer -o resources/hla/yara/idx/dna {input.dna} > {log} + yara_indexer -o resources/hla/yara/idx/rna {input.dna} >> {log} """ rule prepare_bams: @@ -37,6 +49,7 @@ rule prepare_bams: shell: "samtools merge -f - {input} | samtools fastq - > {output}" + rule filter_hla: input: reads="results/hla/all.fastq", diff --git a/workflow/rules/indel.smk b/workflow/rules/indel.smk index 10aa24f..87a7a90 100644 --- a/workflow/rules/indel.smk +++ b/workflow/rules/indel.smk @@ -1,15 +1,17 @@ import os from snakemake.remote import HTTP -rule transindel_build: +rule detect_long_indel_ti_build: input: - bam = "results/{sample}/rnaseq/align/realigned.bam", - idx = "results/{sample}/rnaseq/align/realigned.bam.bai" + bam = "results/{sample}/rnaseq/align/{replicate}_realigned.bam", + idx = "results/{sample}/rnaseq/align/{replicate}_realigned.bam.bai" output: - bam="results/{sample}/rnaseq/indel/transindel/build.bam", - idx="results/{sample}/rnaseq/indel/transindel/build.bam.bai" + bam="results/{sample}/rnaseq/indel/transindel/{replicate}_build.bam", + idx="results/{sample}/rnaseq/indel/transindel/{replicate}_build.bam.bai" + message: + "Building new BAM file with redefined CIGAR string using transindel build on sample:{wildcards.sample} with replicate:{wildcards.replicate}" log: - 
"logs/transindel/build/{sample}.log" + "logs/{sample}/transindel/{replicate}_build.log" conda: "../envs/transindel.yml" shell: @@ -18,18 +20,20 @@ rule transindel_build: -i {input.bam} \ -o {output.bam} \ -r resources/refs/genome.fasta \ - -g resources/refs/genome.gtf > {log} - samtools index {output.bam} -o {output.idx} >> {log} + -g resources/refs/genome.gtf > {log} 2>&1 + samtools index {output.bam} -o {output.idx} >> {log} 2>&1 """ -rule transindel_call: +rule detect_long_indel_ti_call: input: - bam = "results/{sample}/rnaseq/indel/transindel/build.bam", - bai = "results/{sample}/rnaseq/indel/transindel/build.bam.bai" + bam = "results/{sample}/rnaseq/indel/transindel/{replicate}_build.bam", + bai = "results/{sample}/rnaseq/indel/transindel/{replicate}_build.bam.bai" output: - "results/{sample}/rnaseq/indel/transindel/call.indel.vcf" + "results/{sample}/rnaseq/indel/transindel/{replicate}_call.indel.vcf" + message: + "Calling short indels using transindel on sample:{wildcards.sample} with replicate:{wildcards.replicate}" log: - "logs/transindel/call/{sample}.log" + "logs/{sample}/transindel/{replicate}_call.log" conda: "../envs/transindel.yml" params: @@ -39,59 +43,84 @@ rule transindel_call: python workflow/scripts/transIndel/transIndel_call.py \ -i {input.bam} \ -l 10 \ - -o results/{wildcards.sample}/rnaseq/indel/transindel/call \ - -m {params} + -o results/{wildcards.sample}/rnaseq/indel/transindel/{wildcards.replicate}_call \ + -m {params} > {log} 2>&1 """ # resove alleles and remove PCR slippage -rule slippage_removal: +rule long_indel_slippage_removal: input: - "results/{sample}/rnaseq/indel/transindel/call.indel.vcf" + "results/{sample}/rnaseq/indel/transindel/{replicate}_call.indel.vcf" output: - "results/{sample}/variants/long.indel.vcf" + "results/{sample}/rnaseq/indel/transindel/{replicate}_sliprem.vcf" + message: + "Resolving alleles and removing PCR slippage using transindel on sample:{wildcards.sample} with replicate:{wildcards.replicate}" log: - 
"logs/indel/sliprem{sample}.log" + "logs/{sample}/transindel/{replicate}_sliprem.log" conda: "../envs/transindel.yml" shell: """ python3 workflow/scripts/slippage_removal.py \ - resources/refs/genome.fasta {input} {output} > {log} + resources/refs/genome.fasta {input} {output} > {log} 2>&1 """ -rule gatk_mutect2: +# combines the replicates into one vcf +rule combine_longindels: + input: + expand("results/{sample}/rnaseq/indel/transindel/{replicate}_sliprem.vcf", + sample=config['data']['name'], + replicate=config['data']['rnaseq'].keys()) + output: + "results/{sample}/rnaseq/indel/long.indel.vcf" + message: + "Combining long indels from replicates on sample:{wildcards.sample}" + log: + "logs/{sample}/transindel/combine_replicates.log" + conda: + "../envs/manipulate_vcf.yml" + shell: + """ + python workflow/scripts/combine_vcf.py '{input}' {output} > {log} 2>&1 + """ + +# detects short somatic variants (SNVs and indels) using mutect2 +rule detect_short_indels_m2: input: fasta="resources/refs/genome.fasta", - map="results/{sample}/rnaseq/align/realigned.bam" + map="results/{sample}/rnaseq/align/{replicate}_realigned.bam" output: - vcf="results/{sample}/rnaseq/indel/mutect2/variants.vcf", - bam="results/{sample}/rnaseq/indel/mutect2/variants.bam", + vcf="results/{sample}/rnaseq/indel/mutect2/{replicate}_variants.vcf", + bam="results/{sample}/rnaseq/indel/mutect2/{replicate}_variants.bam", message: - "Detection of somatic SNVs/Indels with Mutect2 on sample {wildcards.sample}" + "Detection of somatic SNVs/Indels with Mutect2 on sample:{wildcards.sample} with replicate:{wildcards.replicate}" threads: config['threads'] resources: mem_mb=10024, params: extra="", log: - "logs/gatk/mutect2/{sample}.log", + "logs/{sample}/gatk/mutect2/{replicate}.log", wrapper: "v1.31.1/bio/gatk/mutect" -rule gatk_filtermutectcalls: +# filters short somatic variants (SNVs and indels) using FilterMutectCalls +rule filter_short_indels: input: - 
vcf="results/{sample}/rnaseq/indel/mutect2/variants.vcf", - bam="results/{sample}/rnaseq/indel/mutect2/variants.bam", + vcf="results/{sample}/rnaseq/indel/mutect2/{replicate}_variants.vcf", + bam="results/{sample}/rnaseq/indel/mutect2/{replicate}_variants.bam", ref="resources/refs/genome.fasta", # intervals="intervals.bed", # contamination="", # from gatk CalculateContamination # segmentation="", # from gatk CalculateContamination # f1r2="", # from gatk LearnReadOrientationBias output: - vcf="results/{sample}/rnaseq/indel/mutect2/variants_flt.vcf" + vcf="results/{sample}/rnaseq/indel/mutect2/{replicate}_variants.flt.vcf" + message: + "Filtering somatic SNVs/Indels with FilterMutectCalls on sample:{wildcards.sample} and replicate:{wildcards.replicate}" log: - "logs/gatk/filtermutect/{sample}.log", + "logs/{sample}/gatk/filtermutect/{replicate}.log", params: extra="--max-alt-allele-count 3", java_opts="", # optional @@ -100,15 +129,34 @@ rule gatk_filtermutectcalls: wrapper: "v1.31.1/bio/gatk/filtermutectcalls" - -rule gatk_select_SNPs: +rule combine_short_indels_m2: + input: + expand("results/{sample}/rnaseq/indel/mutect2/{replicate}_variants.flt.vcf", + sample=config['data']['name'], + replicate=config['data']['rnaseq'].keys()) + output: + "results/{sample}/rnaseq/indel/mutect2/variants.vcf" + message: + "Combining somatic SNVs/Indels with Mutect2 on sample:{wildcards.sample}" + log: + "logs/{sample}/transindel/combine_replicates.log" + conda: + "../envs/manipulate_vcf.yml" + shell: + """ + python workflow/scripts/combine_vcf.py '{input}' {output} > {log} 2>&1 + """ + +rule select_SNVs_m2: input: - vcf="results/{sample}/rnaseq/indel/mutect2/variants_flt.vcf", + vcf="results/{sample}/rnaseq/indel/mutect2/variants.vcf", ref="resources/refs/genome.fasta", output: - vcf="results/{sample}/variants/snvs.vcf" + vcf="results/{sample}/rnaseq/indel/snvs.vcf" + message: + "Selecting somatic SNVs with SelectVariants on sample:{wildcards.sample}" log: - 
"logs/gatk/select/{sample}.snvs.log", + "logs/{sample}/gatk/select/somatic_snvs.log", params: extra="--select-type-to-include SNP", # optional filter arguments, see GATK docs java_opts="", # optional @@ -117,13 +165,14 @@ rule gatk_select_SNPs: wrapper: "v1.31.1/bio/gatk/selectvariants" - -rule gatk_select_Indels: +rule select_short_indels_m2: input: - vcf="results/{sample}/rnaseq/indel/mutect2/variants_flt.vcf", + vcf="results/{sample}/rnaseq/indel/mutect2/variants.vcf", ref="resources/refs/genome.fasta", output: - vcf="results/{sample}/variants/short.indel.vcf" + vcf="results/{sample}/rnaseq/indel/short.indel.vcf" + message: + "Selecting short somatic indels with SelectVariants on sample:{wildcards.sample}" log: "logs/gatk/select/{sample}.indel.log", params: @@ -134,85 +183,3 @@ rule gatk_select_Indels: wrapper: "v1.31.1/bio/gatk/selectvariants" - - - -# recalibrate variants (SNP) -#rule gatk_variant_recal_snp2: -# input: -# vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.vcf", -# ref="resources/refs/genome.fasta", -# hapmap="resources/vqsr/hapmap_3.3.hg38.vcf.gz", -# omni="resources/vqsr/1000G_omni2.5.hg38.vcf.gz", -# g1k="resources/vqsr/1000G_phase1.snps.high_confidence.hg38.vcf.gz", -# dbsnp="resources/vqsr/dbSNP_b150.vcf.gz", - -# output: -# vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.recal.vcf", -# idx="results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.recal.vcf.idx", -# tranches="results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.all.tranches" -# log: -# "logs/gatk/variantrecalibrator/{sample}.log", - -# params: -# mode="SNP", # set mode, must be either SNP, INDEL or BOTH -# resources={ -# "hapmap": {"known": False, "training": True, "truth": True, "prior": 15.0}, -# "omni": {"known": False, "training": True, "truth": True, "prior": 12.0}, -# "g1k": {"known": False, "training": True, "truth": True, "prior": 10.0}, -# "dbsnp": {"known": False, "training": False, "truth": False, "prior": 2.0}, -# }, -# annotation=["MQ", 
"QD", "MQRankSum", "ReadPosRankSum", "FS", "SOR", "DP"], -# extra="" -# threads: config['threads'] -# resources: -# mem_mb=1024, -# wrapper: -# "v1.31.1/bio/gatk/variantrecalibrator" - - - - -#rule gatk_applybqsr: -# input: -# bam="results/indel/realign/{sample}.bam", -# ref="resources/refs/genome.fasta", -# dict="resources/refs/genome.dict", -# recal_table="recal/{sample}.grp", -# output: -# bam="recal/{sample}.bam", -# log: -# "logs/gatk/gatk_applybqsr/{sample}.log", -# params: -# extra="", # optional -# java_opts="", # optional -# embed_ref=True, # embed the reference in cram output -# resources: -# mem_mb=1024, -# wrapper: -# "v1.31.1/bio/gatk/applybqsr" - - - -#rule picard_split_vcfs: -# input: -# "results/indel/haplotypecaller/{sample}.vcf" -# output: -# snp="results/indel/{sample}.snp.vcf", -# indel="results/indel/{sample}.indel.vcf" -# log: -# "logs/spltvcfs{sample}.log" -# conda: -# "../envs/picard.yml" -# shell: -# """ -# picard SplitVcfs I={input} \ -# SNP_OUTPUT={output.snp} \ -# INDEL_OUTPUT={output.indel} -# STRICT=false -# """ - - - - - diff --git a/workflow/rules/preproc.smk b/workflow/rules/preproc.smk index 7efb4c8..5dbace5 100644 --- a/workflow/rules/preproc.smk +++ b/workflow/rules/preproc.smk @@ -1,3 +1,160 @@ +rule trimmomatic_SE: + input: + unpack(get_raw_reads) + output: + "results/{sample}/rnaseq/preproc/reads.fq.gz" + log: + "logs/{sample}/trimmomatic.log" + params: + trimmer=["TRAILING:3"], + extra="", + compression_level="-9" + threads: config['threads'] + resources: + mem_mb=1024 + wrapper: + "v2.1.1/bio/trimmomatic/se" + +# if rnaseq_readtype == "PE": +rule trimmomatic_PE: + input: + unpack(get_raw_reads) + output: + r1="results/{sample}/{seqtype}/reads/{replicate}_preproc_r1.fq.gz", + r2="results/{sample}/{seqtype}/reads/{replicate}_preproc_r2.fq.gz", + r1_unpaired="results/{sample}/{seqtype}/reads/{replicate}_preproc_r1_unpaired.fq.gz", + r2_unpaired="results/{sample}/{seqtype}/reads/{replicate}_preproc_r2_unpaired.fq.gz" + params: + 
trimmer=[f"MINLEN:{config['preproc']['minlen']}"] + + [f"TRAILING:{config['preproc']['trailing']}" if config['preproc']['trailing'] is not None else ""] + + [f"LEADING:{config['preproc']['trailing']}" if config['preproc']['leading'] is not None else ""] + + [f"SLIDINGWINDOW:{config['preproc']['slidingwindow']['windowsize']}:{config['preproc']['slidingwindow']['quality']}" if config['preproc']['slidingwindow']['activate'] else ""] + + [f"ILLUMINACLIP:{config['preproc']['adapters']}:2:30:10" if config['preproc']['adapters'] is not None else ""], + extra="" + log: + "logs/{sample}/trimmomatic/{replicate}_{seqtype}.log" + threads: config['threads'] + resources: + mem_mb=1024 + wrapper: + "v2.1.1/bio/trimmomatic/pe" + +rule add_rg_fastq_PE: + input: +# r1="results/{sample}/{seqtype}/reads/{replicate}_preproc_r1.fq.gz", +# r2="results/{sample}/{seqtype}/reads/{replicate}_preproc_r2.fq.gz", + unpack(get_reads), + output: + r1="results/{sample}/{seqtype}/reads/{replicate}_preproc_RG_r1.fq.gz", + r2="results/{sample}/{seqtype}/reads/{replicate}_preproc_RG_r2.fq.gz" + message: + "Adding read group information to fastq files" + log: + "logs/{sample}/add_rg/{replicate}_{seqtype}.log" + conda: + "../envs/basic.yml" + shell: + """ + bash workflow/scripts/addrgfq.sh {input.r1} > gzip -c - > {output.r1} 2> {log} + bash workflow/scripts/addrgfq.sh {input.r2} > gzip -c - > {output.r2} 2> {log} + """ + + + +checkpoint splitfastq: + input: + unpack(get_splitfastq_input) + output: + directory("results/{sample}/reads/rnaseq/{replicate}/") + log: + "logs/{sample}/splitfastq/{replicate}.log" + conda: + "../envs/splitfastq.yml" + threads: 0 + shell: + """ + python workflow/scripts/splitfastq.py '{input}' {output} 20000000 + """ + +rule star_align_fastq: + input: + fq1 = "results/{sample}/reads/rnaseq/{replicate}/r1/reads_{i}.fq.gz", + fq2 = "results/{sample}/reads/rnaseq/{replicate}/r2/reads_{i}.fq.gz", + idx = "resources/refs/star/", + output: + aln = 
"results/{sample}/rnaseq/align/{replicate}/splt/reads_{i}.bam", + log = "results/{sample}/rnaseq/align/{replicate}/splt/reads_{i}.log", + sj = "results/{sample}/rnaseq/align/{replicate}/splt/reads_{i}.tab" + log: + "logs/star_align/{sample}_{replicate}_{i}.log" + params: + extra="--outSAMtype BAM SortedByCoordinate --chimSegmentMin 10 --chimOutType WithinBAM HardClip --genomeSAindexNbases 10 --outSAMattributes RG --outSAMattrRGline ID:noRG" + threads: config['threads'] + wrapper: + "v1.26.0/bio/star/align" + +rule merge_alignment_results_fastq: + input: + aggregate_alignments_fastq + output: + "results/{sample}/rnaseq/align/{replicate}_aligned.bam", + log: + "logs/samtools/merge/{sample}_{replicate}.log", + params: + extra="", # optional additional parameters as string + threads: config['threads'] + wrapper: + "v1.32.1/bio/samtools/merge" + +rule star_align_pe: + input: + fq1 = "results/{sample}/rnaseq/reads/fastqfiles/r1/inputreads_{i}.fq.gz", + fq2 = "results/{sample}/rnaseq/reads/fastqfiles/r2/inputreads_{i}.fq.gz", + idx = "resources/refs/star/", + output: + aln = "results/{sample}/rnaseq/align/bamfiles/inputreads_{i}.bam", + log = "results/{sample}/rnaseq/align/bamfiles/inputreads_{i}.log", + sj = "results/{sample}/rnaseq/align/bamfiles/inputreads_{i}.tab" + log: + "logs/star_align/{sample}_{i}.log" + params: + extra="--outSAMtype BAM SortedByCoordinate --chimSegmentMin 10 --chimOutType WithinBAM HardClip --genomeSAindexNbases 10 --outSAMattributes RG --outSAMattrRGline ID:noRG" + threads: config['threads'] + wrapper: + "v1.26.0/bio/star/align" + +rule merge_alignment_results_pe: + input: + aggregate_alignments_pe + output: + "results/{sample}/rnaseq/align/aligned.bam", + log: + "logs/samtools/merge/{sample}.log", + params: + extra="", # optional additional parameters as string + threads: config['threads'] + wrapper: + "v1.32.1/bio/samtools/merge" + + + #rule align_with_star_fq: + #input: + #unpack(get_align_input), + #idx="resources/refs/star/" + #output: + 
## see STAR manual for additional output files + #aln="results/{sample}/rnaseq/align/aligned.bam", + #log="logs/{sample}/star/Log1.out", + #sj="results/{sample}/rnaseq/align/sj.out.tab" + #log: + #"logs/{sample}/star/Log.out" + #params: + #extra="--outSAMtype BAM SortedByCoordinate --chimSegmentMin 10 --chimOutType WithinBAM HardClip --genomeSAindexNbases 10 --outSAMattributes RG --outSAMattrRGline ID:xxx" + #threads: config['threads'] + #wrapper: + #"v2.1.1/bio/star/align" + + if rnaseq_filetype == ".bam": checkpoint split_bamfile_RG: input: @@ -17,7 +174,6 @@ if rnaseq_filetype == ".bam": -h {input} -f {output}/%!.%. {input} """ -if rnaseq_filetype == ".bam": rule bam_to_fastq: input: "results/{sample}/rnaseq/reads/bamfiles/{readgroup}.bam" @@ -34,7 +190,6 @@ if rnaseq_filetype == ".bam": | samtools fastq -OT RG -@ {threads} - | gzip -c - > {output} """ -if rnaseq_filetype == ".bam": rule align_with_star: input: fq1 = "results/{sample}/rnaseq/reads/fastqfiles/{readgroup}.fq.gz", @@ -51,7 +206,6 @@ if rnaseq_filetype == ".bam": wrapper: "v1.26.0/bio/star/align" - if rnaseq_filetype == ".bam": rule merge_alignment_results: input: @@ -69,13 +223,13 @@ if rnaseq_filetype == ".bam": rule samtools_postproc: input: - "results/{sample}/rnaseq/align/aligned.bam" + "results/{sample}/rnaseq/align/{replicate}_aligned.bam" output: - "results/{sample}/rnaseq/align/ready.bam" + "results/{sample}/rnaseq/align/{replicate}_ready.bam" conda: "../envs/samtools.yml" log: - "logs/samtools/postproc/{sample}.log" + "logs/samtools/postproc/{sample}_{replicate}.log" threads: 6 # more threads brings no significant increase shell: """ @@ -89,11 +243,11 @@ rule samtools_postproc: rule samtools_postproc_index: input: - "results/{sample}/rnaseq/align/ready.bam" + "results/{sample}/rnaseq/align/{replicate}_ready.bam" output: - "results/{sample}/rnaseq/align/ready.bam.bai" + "results/{sample}/rnaseq/align/{replicate}_ready.bam.bai" log: - "logs/samtools/index/{sample}.log" + 
"logs/samtools/index/postproc_{sample}_{replicate}.log" params: extra="", # optional additional parameters as string threads: config['threads'] @@ -102,31 +256,37 @@ rule samtools_postproc_index: # retrieve readgroups from bam file -if rnaseq_filetype == ".bam": - rule determine_readgroups: - input: - get_rnaseq_data - output: - "results/{sample}/rnaseq/reads/readgroups.txt" - log: - "logs/readgroups/{sample}.log" - shell: - """ - python workflow/scripts/get_readgroups.py {input} \ - {output} > {log} 2>&1 - """ +rule get_readgroups: + input: + get_readgroups_input + #"results/{sample}/rnaseq/align/{replicate}_ready.bam" + output: + "results/{sample}/rnaseq/reads/{replicate}_readgroups.txt" + conda: + "../envs/basic.yml" + log: + "logs/{sample}/get_readgroups/{replicate}.log" + shell: + """ + python workflow/scripts/get_readgroups.py '{input}' \ + {output} > {log} 2>&1 + """ + + + + rule realign: input: - bam="results/{sample}/rnaseq/align/ready.bam", - rg="results/{sample}/rnaseq/reads/readgroups.txt" + bam="results/{sample}/rnaseq/align/{replicate}_ready.bam", + rg="results/{sample}/rnaseq/reads/{replicate}_readgroups.txt" output: - "results/{sample}/rnaseq/align/realigned.bam" + "results/{sample}/rnaseq/align/{replicate}_realigned.bam" threads: config['threads'] shell: """ - samtools collate -Oun128 {input.bam} \ - | samtools fastq -OT RG,BC - \ + samtools collate -Oun128 {input.bam} \ + | samtools fastq -OT RG -@ {threads} - \ | bwa mem -pt{threads} -CH <(cat {input.rg}) resources/refs/bwa/genome - \ | samtools sort -@6 -m1g - > {output} """ @@ -134,11 +294,11 @@ rule realign: rule realign_index: input: - "results/{sample}/rnaseq/align/realigned.bam" + "results/{sample}/rnaseq/align/{replicate}_realigned.bam" output: - "results/{sample}/rnaseq/align/realigned.bam.bai" + "results/{sample}/rnaseq/align/{replicate}_realigned.bam.bai" log: - "logs/samtools/index/{sample}.log" + "logs/{sample}/realign_index/{sample}_{replicate}.log" params: extra="", # optional 
additional parameters as string threads: config['threads'] diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk index 4b7dff2..4c8c120 100644 --- a/workflow/rules/ref.smk +++ b/workflow/rules/ref.smk @@ -81,3 +81,20 @@ rule bwa_index: algorithm="bwtsw", wrapper: "v1.26.0/bio/bwa/index" + + +rule create_sequence_dictionary: + input: + "resources/refs/genome.fasta" + output: + "resources/refs/genome.dict" + message: + "Create sequence dictionary of reference genome" + log: + "logs/picard/create_dict.log" + params: + extra="", # optional: extra arguments for picard. + resources: + mem_mb=1024, + wrapper: + "v1.31.1/bio/picard/createsequencedictionary"