diff --git a/README.md b/README.md
index baff5e3..498d8c2 100644
--- a/README.md
+++ b/README.md
@@ -1,23 +1,22 @@
-
-
ScanNeo2 (working title)
+-
+
ScanNeo2
+
-
## What is ScanNeo2
-`Scanneo2` is a snakemake workflow for the prediction of neoantigens from
-multiple sources. In its current state, this includes
+`Scanneo2` is a snakemake workflow for the prediction of neoantigens from multiple sources. In its current state,
+this includes canonical-splicing, exitron-splicing, gene fusion, indels and snvs.
## Getting Started
-In principle, Scanneo2 aims to
-
+In principle, Scanneo2 aims to resolve its dependencies automatically and requires only snakemake and snakedeploy.
## Quickstart
Install `snakemake` and `snakedeploy`
```
-mamba env create --file https://
+mamba env create --file https://raw.githubusercontent.com/ylab-hi/ScanNeo2/devel/environment.yml
mamba activate scanneo2
```
Deploy Scanneo2
@@ -26,17 +25,19 @@ mkdir -p /path/to/working/directory/
cd /path/to/working/directory/
snakedeploy deploy-workflow https://github.com/ylab-hi/scanneo2 . --tag v0.1.0
```
+Configure ScanNeo2 by modifying `config/config.yaml`
+
Run the workflow
```
+cd scanneo
snakemake --cores all --use-conda
```
-
-Please consult the wiki for detailed instruction and explanations.
-
+Please consult the [wiki](https://github.com/ylab-hi/ScanNeo2/wiki) for detailed instructions and explanations.
### Docker
-We also provide a ready to use Docker Container that can be used
+We also provide a ready-to-use [Docker Container](https://hub.docker.com/r/yanglabinfo/scanneo2)
+that can be used to run Scanneo2.
diff --git a/config/config.yaml b/config/config.yaml
index 65e1682..da9d3ad 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -1,11 +1,30 @@
-refgen: testdata/GRCh38.p13.genome.fa
-annotation: testdata/GRCh38_chr1.gtf
+data:
+ name: patient2
+ dnaseq:
+ rep1: TESLA_testdata/patient2/WES/TESLA_9_1.fastq.gz TESLA_testdata/patient2/WES/TESLA_9_2.fastq.gz
+ rep2: TESLA_testdata/patient2/WES/TESLA_10_1.fastq.gz TESLA_testdata/patient2/WES/TESLA_10_2.fastq.gz
+ rnaseq:
+ rep1: TESLA_testdata/patient2/RNA/TESLA_11_1.fastq.gz TESLA_testdata/patient2/RNA/TESLA_11_2.fastq.gz
+
+
+
+sample: patient2 # naming of the sample (results/sample)
dnaseq:
rnaseq: LNCAP_bam/G28033.LNCaP_clone_FGC.1.bam
-preproc: true
+### pre-processing (only applied on fastq reads)
+preproc:
+ activate: true # whether (=true) or not (=false) to include pre-processing
+ minlen: 10 # discard reads which are shorter than this number of bases
+ leading: 3 # remove leading low quality or N bases
+ trailing: 3 # remove trailing low quality or N bases
+ slidingwindow:
+ activate: true # whether (=true) or not (=false) to apply sliding-window trimming
+ windowsize: 3 # number of bases to average across
+ quality: 20 # the average quality (Phred) required
+ adapters: TruSeq2-PE.fa # path to fasta file containing adapters to be trimmed
# if no readgroups file are provided
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 799764a..6ebeb62 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -1,61 +1,251 @@
import os
import re
import glob
+from icecream import ic
from pathlib import Path
+def data_structure(data):
+    """Normalise the sequencing-data section of the configuration.
+
+    For 'dnaseq' and 'rnaseq' the replicate entries are parsed by
+    handle_seqfiles() and the detected file type / read type are stored
+    under '<seqtype>_filetype' and '<seqtype>_readtype'.
+
+    data: the mapping stored under config['data']; it is modified in place.
+    Returns the same mapping.
+    """
+    # operate on the argument rather than on the global config so that the
+    # function does what its signature promises
+    for seqtype in ('dnaseq', 'rnaseq'):
+        data[seqtype], filetype, readtype = handle_seqfiles(data[seqtype])
+        data[seqtype + '_filetype'] = filetype
+        data[seqtype + '_readtype'] = readtype
+    return data
+
+
+def handle_seqfiles(seqdata):
+    """Parse the replicate entries of one sequencing type.
+
+    seqdata maps a replicate name to a whitespace-separated string holding
+    one (single-end) or two (paired-end) file paths. Valid entries are
+    replaced in place by a Path (SE) or a list of two Paths (PE); invalid
+    entries are reported on stdout and left untouched.
+
+    Returns (seqdata, filetype, readtype). filetype and readtype are None
+    when nothing valid was found or when the replicates disagree.
+    """
+    readtype = []
+    filetype = []
+
+    # nothing configured for this sequencing type (e.g. an empty YAML entry)
+    if not seqdata:
+        return seqdata, None, None
+
+    # iterate over replicates
+    for rpl in seqdata:
+        # split() without argument also copes with repeated blanks;
+        # an empty replicate entry yields no files at all
+        files = [Path(file) for file in (seqdata[rpl] or '').split()]
+        if len(files) == 1:  # SE
+            f1_ext = get_file_extension(files[0])
+            if f1_ext in ['.fq', '.fastq', '.bam']:
+                seqdata[rpl] = files[0]
+                filetype.append(f1_ext)
+                readtype.append('SE')
+            else:
+                print('{} is not a valid file'.format(files[0]))
+        elif len(files) == 2:  # PE
+            f1_ext = get_file_extension(files[0])
+            f2_ext = get_file_extension(files[1])
+            # check if file extensions are the same
+            if f1_ext != f2_ext:
+                print('files do not have the same extension')
+            elif valid_paired_end(files[0], files[1]):
+                seqdata[rpl] = files
+                filetype.append(f1_ext)
+                readtype.append('PE')
+            else:
+                print('files not in valid PE format')
+        else:
+            print('{}: expected one (SE) or two (PE) files, found {}'.format(rpl, len(files)))
+
+    # no replicate passed validation - there is nothing to compare
+    if not filetype:
+        print('no valid sequencing files found')
+        return seqdata, None, None
+
+    # check if filetype and readtype are the same
+    if all_identical(filetype) and all_identical(readtype):
+        return seqdata, filetype[0], readtype[0]
+
+    print('filetypes are not the same')
+    return seqdata, None, None
+
+# raw (not pre-processed) reads of one replicate
+def get_raw_reads(wildcards):
+    """Return the raw input reads for wildcards.seqtype / wildcards.replicate.
+
+    Single-end data yields the configured entry itself, everything else a
+    dict with the keys 'r1' and 'r2' holding the two mates.
+    """
+    # the parsed data is keyed by sequencing type and replicate, so both
+    # branches have to look up the same entry
+    reads = config['data'][wildcards.seqtype][wildcards.replicate]
+    if config['data'][wildcards.seqtype + '_readtype'] == 'SE':
+        return reads
+    # PE
+    return {"r1": reads[0], "r2": reads[1]}
+
+# returns the reads (raw/preprocessed) for a given sample
+def get_reads(wildcards):
+    """Return the reads for wildcards.sample/seqtype/replicate.
+
+    With pre-processing switched on, paired-end data points to the
+    '<replicate>_preproc_r1/r2.fq.gz' files; otherwise the configured raw
+    reads are returned. Paired-end results are dicts with keys 'r1', 'r2'.
+    Returns None when the read type is neither 'SE' nor 'PE'.
+    """
+    readtype = config['data'][wildcards.seqtype + '_readtype']
+    raw = config['data'][wildcards.seqtype][wildcards.replicate]
+
+    if config['preproc']['activate']:
+        if readtype == 'SE':
+            # NOTE(review): single-end data is handed back unprocessed even
+            # though pre-processing is active - confirm this is intended
+            return raw
+        if readtype == 'PE':
+            # fill in the wildcards here so that concrete paths are returned
+            return {"r1": "results/{sample}/{seqtype}/reads/{replicate}_preproc_r1.fq.gz".format(**wildcards),
+                    "r2": "results/{sample}/{seqtype}/reads/{replicate}_preproc_r2.fq.gz".format(**wildcards)}
+    else:  # no pre-processing - fall back to the raw reads
+        if readtype == 'SE':
+            return raw
+        if readtype == 'PE':
+            return {"r1": raw[0], "r2": raw[1]}
+    return None
+
+
+# determines the file extension for a given file - excludes .gz
+def get_file_extension(path):
+    """Return '.fastq', '.fq' or '.bam' for a matching file name, else ''.
+
+    Only path.name is inspected; a trailing '.gz' is not part of the result.
+    """
+    name = path.name
+    hit = re.search(r'\.(fastq|fq|bam)(\.gz)?$', name)
+    if hit is None:
+        return ''
+    # cut off the three characters of '.gz' when the match ends with it
+    gzipped = hit.group(0).endswith('.gz')
+    return name[hit.start():-3] if gzipped else name[hit.start():]
+
# check if files are a valid paired-end pair
-def valid_paired_end(file1, file2):
- # check if both files are in FASTA format
- if Path(file1).suffix not in ['.fastq', '.fq'] and Path(file2).suffix not in ['.fastq', '.fq']:
- return False
-
- # check if first file contains _R1 or _fwd
- if not ("_R1" in file1 or "_fwd" in file1):
- return False
- # check if second file contains _R2 or _rev
- if not ("_R2" in file2 or "_rev" in file2):
- return False
-
- # check if the substrings until occurrence of either _R1/2 or _fwd/rev are equal
- file1_indicator = "_R1" if "_R1" in file1 else "_fwd"
- file2_indicator = "_R2" if "_R2" in file2 else "_rev"
- file1_idx = file1.find(file1_indicator)
- file2_idx = file2.find(file2_indicator)
-
- if file1_idx != file2_idx:
- return False
-
- if file1[:file1_idx] != file2[:file2_idx]:
- return False
+def valid_paired_end(path1, path2):
+    """Check whether two files form a forward/reverse paired-end pair.
+
+    Both file names must end in '_<mate>.fastq|fq[.gz]', path1 must carry the
+    forward indicator (R1, 1, fwd), path2 the matching reverse indicator
+    (R2, 2, rev), and the part in front of the indicator must be equal.
+
+    path1, path2: pathlib.Path objects; only their .name is inspected.
+    Returns True for a valid pair, otherwise prints the reason and returns False.
+    """
+    # only consider filename
+    file1 = path1.name
+    file2 = path2.name
- # check if first file contains _R1 and the secon file contaif "_R1" in string1 and "_R2" not in string2:
- return False
-
- # check if first file contains _fwd and the secon file contains _rev
- if "_fwd" in string1 and "_rev" not in string2:
- return False
+
+    # mate indicator directly in front of the fastq extension
+    pattern = r'_(R1|R2|1|2|fwd|rev)\.(fastq|fq)(\.gz)?$'
+    f1_se = re.search(pattern, file1)
+    f2_se = re.search(pattern, file2)  # search the second file, not the first one again
+
+    # patterns needs to be found in both files
+    if f1_se is None or f2_se is None:
+        print('{} and {} are not valid PE files'.format(file1, file2))
+        return False
+
+    # the first file has to be the forward mate, the second the matching reverse mate
+    mates = {'R1': 'R2', '1': '2', 'fwd': 'rev'}
+    if mates.get(f1_se.group(1)) != f2_se.group(1):
+        print('{} and {} are not valid PE files'.format(file1, file2))
+        return False
+
+    # everything in front of the mate indicator has to agree
+    if file1[:f1_se.start()] != file2[:f2_se.start()]:
+        print('{} and {} have different filestem '.format(file1, file2))
+        return False
+
+    return True
+# check if files in list are identical
+def all_identical(l):
+    """Return True when every element of l equals the first element.
+
+    An empty list counts as identical (nothing differs); this avoids the
+    IndexError that looking at l[0] would raise.
+    """
-return True
+    return all(item == l[0] for item in l)
-rnaseq_input = {}
-rnaseq_filetype = None
-rnaseq_files = config['rnaseq'].split(' ')
-if len(rnaseq_files) == 1:
- if Path(rnaseq_files[0]).suffix in ['.fq', '.fastq', '.bam']:
- rnaseq_filetype = Path(rnaseq_files[0]).suffix # store file extension
- rnaseq_input[Path(rnaseq_files[0]).stem] = rnaseq_files[0]
- else:
- print("no rnaseq files found")
-elif len(rnaseq_files) == 2:
- # check if both files are in valid paired-end format
- if valid_paired_end(rnaseq_files[0], rnaseq_files[1]):
- rnaseq_filetype = Path(rnaseq_files[0]).suffix # store file extension
- rnaseq_input.append((rnaseq_files[0], rnaseq_files[1]))
- else:
- print("no valid paired-end files found")
+# replace the replicate strings in config['data'] by parsed paths and add the
+# detected '<seqtype>_filetype' / '<seqtype>_readtype' entries
+config['data'] = data_structure(config['data'])
+# debug output of the parsed data section (icecream)
+ic(config['data'])
+
+
+# NOTE(review): hard-coded value; the detected type is stored in
+# config['data']['rnaseq_filetype'] - confirm which one get_rnaseq_data should use
+rnaseq_filetype = ".bam"
+
+def get_splitfastq_input(wildcards):
+    """Return the reads that are handed to the fastq splitting step.
+
+    With pre-processing active the pre-processed reads are used (one file for
+    SE, the r1/r2 pair for PE); otherwise the configured raw reads of the
+    replicate are returned. Returns None for an unknown read type.
+    """
+    if config['preproc']['activate']:
+        # the read type is stored under '<seqtype>_readtype';
+        # config['data'][seqtype] itself is the replicate mapping
+        readtype = config['data'][wildcards.seqtype + '_readtype']
+        if readtype == 'SE':
+            return expand("results/{sample}/{seqtype}/preproc/reads.fq.gz", **wildcards)
+        elif readtype == 'PE':  # PE
+            # NOTE(review): get_reads uses '{seqtype}/reads/' while this path
+            # uses 'reads/{seqtype}/' - confirm which layout is intended
+            return expand("results/{sample}/reads/{seqtype}/{replicate}_preproc_{readtype}.fq.gz",
+                          readtype=["r1", "r2"],
+                          seqtype=wildcards.seqtype,
+                          replicate=wildcards.replicate,
+                          sample=wildcards.sample)
+        return None
+
+    # no pre-processing has been performed: use the raw reads of the replicate
+    # (the former rnaseq_input lookup is no longer defined in this file)
+    return config['data'][wildcards.seqtype][wildcards.replicate]
+
+
+def get_splitfastq_input_PE(wildcards):
+ """Return the paired-end RNAseq reads used as input for splitting.
+
+ With pre-processing active this expands to the inputreads_r1/r2 files of
+ the sample, otherwise the entry of rnaseq_input for the sample is returned.
+ """
+ if config['preproc']['activate']:
+ return expand("results/{sample}/rnaseq/reads/inputreads_{readtype}.fq.gz",
+ readtype=["r1", "r2"],
+ **wildcards
+ )
+ else: # no pre-processing
+ # NOTE(review): no definition of rnaseq_input is visible in this diff
+ # (the old one is removed) - confirm where it comes from
+ return rnaseq_input[wildcards.sample]
+
+
+
+# input for alignment w/ DNAseq data
+def get_align_input_dnaseq(wildcards):
+ """Return the DNAseq reads that are passed to the alignment.
+
+ Single-end data yields the result of expand() / the dnaseq_input entry,
+ paired-end data a dict with the keys 'fq1' and 'fq2'.
+ """
+ if config['preproc']['activate']:
+
+
+
+
+ # pre-processed reads of the sample
+ if config['data']['dnaseq_readtype'] == 'SE':
+ return expand("results/{sample}/dnaseq/reads/inputreads.fq.gz", **wildcards)
+ else: # PE
+ return dict(
+ zip(
+ ["fq1", "fq2"],
+ expand("results/{sample}/dnaseq/reads/inputreads_{readtype}.fq.gz",
+ readtype=["r1", "r2"],
+ **wildcards
+ )
+ )
+ )
+ else: # no pre-processing
+ # NOTE(review): dnaseq_input is not defined in the part of the file
+ # visible here - confirm where it comes from
+ if config['data']['dnaseq_readtype'] == 'SE':
+ return dnaseq_input[wildcards.sample]
+ else: # PE
+ return dict(
+ zip(
+ ["fq1", "fq2"],
+ dnaseq_input[wildcards.sample]
+ )
+ )
+
+
+
+def get_align_input(wildcards):
+    """Return the RNAseq reads that are passed to the alignment.
+
+    Single-end data yields the result of expand() / the rnaseq_input entry,
+    paired-end data a dict with the keys 'fq1' and 'fq2'.
+    """
+    # use the read type detected while parsing the config ('SE' / 'PE'), in
+    # line with get_align_input_dnaseq; a bare rnaseq_readtype is not defined
+    single_end = config['data']['rnaseq_readtype'] == 'SE'
+
+    if config['preproc']['activate']:
+        if single_end:
+            return expand("results/{sample}/rnaseq/reads/inputreads.fq.gz", **wildcards)
+        # PE
+        return dict(
+            zip(
+                ["fq1", "fq2"],
+                expand("results/{sample}/rnaseq/reads/inputreads_{readtype}.fq.gz",
+                       readtype=["r1", "r2"],
+                       **wildcards),
+            )
+        )
+
+    # no pre-processing
+    # NOTE(review): no definition of rnaseq_input is visible in this diff
+    # (the old one is removed) - confirm where it comes from
+    if single_end:
+        return rnaseq_input[wildcards.sample]
+    # PE
+    return dict(zip(["fq1", "fq2"], rnaseq_input[wildcards.sample]))
+
+# determine the bamfiles that contains readgroups
+#def get_bams_readsgroups(wildcards):
+
+# fq1 = "results/{sample}/rnaseq/reads/{replicate}/r1/reads_{i}.fq.gz",
+# aln = "results/{sample}/rnaseq/align/{replicate}/reads_{i}.bam",
+
+
+def get_readgroups_input(wildcards):
+    """Return the BAM files that are inspected for read groups.
+
+    fastq input: only the BAM produced by the alignment of the replicate.
+    BAM input: the configured BAM of the replicate plus the aligned BAM.
+    Any other file type yields an empty list.
+    """
+    filetype = config['data']['rnaseq_filetype']
+
+    # return only bam from STAR align
+    if filetype in ['.fq', '.fastq']:
+        return ["results/{sample}/rnaseq/align/{replicate}_ready.bam".format(**wildcards)]
+
+    # '.bam' is a file type, so it has to be compared against the file type
+    # entry and not against the read type
+    if filetype in ['.bam']:
+        return [
+            # the parsed config holds one entry per replicate
+            config['data']['rnaseq'][wildcards.replicate],
+            "results/{sample}/rnaseq/align/{replicate}/ready.bam".format(
+                sample=wildcards.sample,
+                replicate=wildcards.replicate),
+        ]
+
+    return []
+
+
+
+def aggregate_alignments_fastq(wildcards):
+    """List the per-chunk BAM files expected after the splitfastq checkpoint."""
+    # asking the checkpoint for its output makes sure that all samples are
+    # processed in checkpoint - split fastq file
+    split_dir = checkpoints.splitfastq.get(**wildcards).output[0]
+    chunk_ids = glob_wildcards(os.path.join(split_dir, "r1/reads_{i}.fq.gz")).i
+    return expand("results/{sample}/rnaseq/align/{replicate}/splt/reads_{i}.bam",
+                  sample=wildcards.sample,
+                  replicate=wildcards.replicate,
+                  i=chunk_ids)
+
+def aggregate_alignments_pe(wildcards):
+    """List the per-chunk BAM files expected after the splitfastq_pe checkpoint."""
+    # asking the checkpoint for its output makes sure that all samples are
+    # processed in checkpoint - split fastq file
+    split_dir = checkpoints.splitfastq_pe.get(**wildcards).output[0]
+    chunk_ids = glob_wildcards(os.path.join(split_dir, "inputreads_{i}.fq.gz")).i
+    return expand("results/{sample}/rnaseq/align/bamfiles/inputreads_{i}.bam",
+                  sample=wildcards.sample,
+                  i=chunk_ids)
+
# aggregate results from STAR alignment
def aggregate_alignments(wildcards):
# make sure that all samples are processed in checkpoint - split fastq file
@@ -65,19 +255,16 @@ def aggregate_alignments(wildcards):
readgroup=glob_wildcards(os.path.join(checkpoint_output, "{readgroup}.bam")).readgroup)
-def aggregate_bwa_alignments(wildcards):
- split_bamfiles_output = checkpoints.bamfile_split.get(**wildcards).output[0]
- return expand("results/{sample}/rnaseq/align/bamfiles/{file}.bam",
- sample=wildcards.sample,
- file=glob_wildcards(os.path.join(split_bamfiles_output, "{file}.bam")).file)
-
-
+# getting input starting from align
def get_rnaseq_data(wildcards):
+    # Returns the RNAseq input of the sample: the BAM entry for '.bam' input,
+    # the trimmed reads for pre-processed paired-end fastq input and the
+    # configured reads for every other fastq case.
if rnaseq_filetype == ".bam":
- print(wildcards)
return rnaseq_input[wildcards.sample]
elif rnaseq_filetype == ".fastq" or rnaseq_filetype == ".fq":
- return rnaseq_input[wildcards.sample]
+        if config['preproc']['activate']:  # preproc activated?
+            # len() of the entry has to be compared, not len() of the comparison
+            if len(rnaseq_input[wildcards.sample]) == 2:  # PE?
+                return expand("results/{sample}/preproc/trimmed/trm.fq.gz",
+                              **wildcards)
+        # nothing was trimmed for this sample: hand back the configured reads
+        return rnaseq_input[wildcards.sample]
else:
print("no rnaseq data found")
diff --git a/workflow/rules/germline.smk b/workflow/rules/germline.smk
index 8ad79e9..01b4129 100644
--- a/workflow/rules/germline.smk
+++ b/workflow/rules/germline.smk
@@ -1,25 +1,6 @@
-rule picard_create_dict:
- input:
- "resources/refs/genome.fasta"
- output:
- "resources/refs/genome.dict"
- log:
- "logs/picard/create_dict.log"
- params:
- extra="", # optional: extra arguments for picard.
- # optional specification of memory usage of the JVM that snakemake will respect with global
- # resource restrictions (https://snakemake.readthedocs.io/en/latest/snakefiles/rules.html#resources)
- # and which can be used to request RAM during cluster job submission as `{resources.mem_mb}`:
- # https://snakemake.readthedocs.io/en/latest/executing/cluster.html#job-properties
- resources:
- mem_mb=1024,
- wrapper:
- "v1.31.1/bio/picard/createsequencedictionary"
-
-
# download training sets for calling high confidence variants
# see https://gatk.broadinstitute.org/hc/en-us/articles/4402736812443-Which-training-sets-arguments-should-I-use-for-running-VQSR-
-rule gatk_vqsr_training_sets:
+rule get_gatk_vqsr_training_sets:
output:
snp_hapmap="resources/vqsr/hapmap_3.3.hg38.vcf.gz",
snp_hapmap_idx="resources/vqsr/hapmap_3.3.hg38.vcf.gz.tbi",
@@ -31,7 +12,8 @@ rule gatk_vqsr_training_sets:
snp_dbSNP_idx="resources/vqsr/dbSNP_b150.vcf.gz.tbi",
indel_mills="resources/vqsr/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz",
indel_mills_idx="resources/vqsr/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi"
-
+ message:
+ "Downloading training sets for calling high confidence variants"
shell:
"""
curl -L https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/hapmap_3.3.hg38.vcf.gz -o resources/vqsr/hapmap_3.3.hg38.vcf.gz
@@ -47,16 +29,18 @@ rule gatk_vqsr_training_sets:
"""
# do a first round of variant calling on original, unrecalibrated data
-rule htc_first:
+rule detect_indels_htc_1rd:
input:
# single or list of bam files
- bam="results/{sample}/rnaseq/align/realigned.bam",
+ bam="results/{sample}/rnaseq/align/{replicate}_realigned.bam",
ref="resources/refs/genome.fasta",
known="resources/vqsr/dbSNP_b150.vcf.gz" # optional
output:
- vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.vcf"
+ vcf="results/{sample}/variants/indel/htcaller/{replicate}_variants.1rd.vcf"
+ message:
+ "First round of variant calling on original, unrecalibrated data on sample:{wildcards.sample} with replicate:{wildcards.replicate}"
log:
- "logs/{sample}/gatk/haplotypecaller/1rd.log",
+ "logs/{sample}/gatk/haplotypecaller/{replicate}_1rd.log",
params:
extra="",
java_opts="",
@@ -68,216 +52,223 @@ rule htc_first:
# recalibrate variants (SNP)
-rule htc_first_snp_recal:
- input:
- vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.vcf",
- ref="resources/refs/genome.fasta",
- hapmap="resources/vqsr/hapmap_3.3.hg38.vcf.gz",
- omni="resources/vqsr/1000G_omni2.5.hg38.vcf.gz",
- g1k="resources/vqsr/1000G_phase1.snps.high_confidence.hg38.vcf.gz",
- dbsnp="resources/vqsr/dbSNP_b150.vcf.gz",
- output:
- vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.recal.vcf",
- idx="results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.recal.vcf.idx",
- tranches="results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.all.tranches"
- log:
- "logs/{sample}/gatk/vqsr/recal.first.snp"
+#rule recalibrate_variants_first_round:
+ #input:
+ #vcf="results/{sample}/rnaseq/indel/htcaller/{replicate}_variants.1rd.vcf",
+ #ref="resources/refs/genome.fasta",
+ #hapmap="resources/vqsr/hapmap_3.3.hg38.vcf.gz",
+ #omni="resources/vqsr/1000G_omni2.5.hg38.vcf.gz",
+ #g1k="resources/vqsr/1000G_phase1.snps.high_confidence.hg38.vcf.gz",
+ #dbsnp="resources/vqsr/dbSNP_b150.vcf.gz",
+ #output:
+ #vcf="results/{sample}/rnaseq/indel/htcaller/{replicate}_snv.1rd.recal.vcf",
+ #idx="results/{sample}/rnaseq/indel/htcaller/{replicate}_snv.1rd.recal.vcf.idx",
+ #tranches="results/{sample}/rnaseq/indel/htcaller/{replicate}_snv.1rd.tranches"
+ #message:
+ #"Recalibrate variants (SNP) on sample:{wildcards.sample} with replicate:{wildcards.replicate}"
+ #log:
+ #"logs/{sample}/gatk/vqsr/{replicate}_1rd_snv_recal.log"
- params:
- mode="SNP",
- resources={
- "hapmap": {"known": False, "training": True, "truth": True, "prior": 15.0},
- "omni": {"known": False, "training": True, "truth": True, "prior": 12.0},
- "g1k": {"known": False, "training": True, "truth": True, "prior": 10.0},
- "dbsnp": {"known": False, "training": False, "truth": False, "prior": 2.0},
- },
- annotation=["MQ", "QD", "MQRankSum", "ReadPosRankSum", "FS", "SOR", "DP"],
- extra=""
- threads: config['threads']
- resources:
- mem_mb=1024,
- wrapper:
- "v1.31.1/bio/gatk/variantrecalibrator"
+ #params:
+ #mode="SNP",
+ #resources={
+ #"hapmap": {"known": False, "training": True, "truth": True, "prior": 15.0},
+ #"omni": {"known": False, "training": True, "truth": True, "prior": 12.0},
+ #"g1k": {"known": False, "training": True, "truth": True, "prior": 10.0},
+ #"dbsnp": {"known": False, "training": False, "truth": False, "prior": 2.0},
+ #},
+ #annotation=["MQ", "QD", "MQRankSum", "ReadPosRankSum", "FS", "SOR", "DP"],
+ #extra=""
+ #threads: config['threads']
+ #resources:
+ #mem_mb=1024,
+ #wrapper:
+ #"v1.31.1/bio/gatk/variantrecalibrator"
-rule htc_first_snp_apply_vqsr:
- input:
- vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.vcf",
- recal="results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.recal.vcf",
- tranches="results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.all.tranches",
- ref="resources/refs/genome.fasta",
- output:
- vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.filt.vcf"
- log:
- "logs/{sample}/gatk/vqsr/apply.1rd.snp.log"
- params:
- mode="SNP", # set mode, must be either SNP, INDEL or BOTH
- extra="--truth-sensitivity-filter-level 99.5", # optional
- resources:
- mem_mb=1024,
- wrapper:
- "v1.31.1/bio/gatk/applyvqsr"
+#rule apply_VQSR_SNVs_1rd:
+ #input:
+ #vcf="results/{sample}/rnaseq/indel/htcaller/{replicate}_variants.1rd.vcf",
+ #recal="results/{sample}/rnaseq/indel/htcaller/{replicate}_snv.1rd.recal.vcf",
+ #tranches="results/{sample}/rnaseq/indel/htcaller/{replicate}_snv.1rd.tranches",
+ #ref="resources/refs/genome.fasta",
+ #output:
+ #vcf="results/{sample}/rnaseq/indel/htcaller/{replicate}_snv.1rd.flt.vcf"
+ #message:
+ #"Apply VQSR (SNP) on sample:{wildcards.sample} with replicate:{wildcards.replicate}"
+ #log:
+ #"logs/{sample}/gatk/vqsr/{replicate}_apply.1rd.snp.log"
+ #params:
+ #mode="SNP", # set mode, must be either SNP, INDEL or BOTH
+ #extra="--truth-sensitivity-filter-level 99.5", # optional
+ #resources:
+ #mem_mb=1024,
+ #wrapper:
+ #"v1.31.1/bio/gatk/applyvqsr"
-# repeat Variant Quality Score Recalibration for indels
-use rule htc_first_snp_recal as htc_first_indel_recal with:
- output:
- vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.recal.vcf",
- idx="results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.recal.vcf.idx",
- tranches="results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.all.tranches"
- log:
- "logs/{sample}/gatk/vqsr/recal_first_snp.log"
- params:
- mode="INDEL",
- resources={
- "hapmap": {"known": False, "training": True, "truth": True, "prior": 15.0},
- "omni": {"known": False, "training": True, "truth": True, "prior": 12.0},
- "g1k": {"known": False, "training": True, "truth": True, "prior": 10.0},
- "dbsnp": {"known": False, "training": False, "truth": False, "prior": 2.0},
- },
- annotation=["MQ", "QD", "MQRankSum", "ReadPosRankSum", "FS", "SOR", "DP"],
- extra=""
+### repeat Variant Quality Score Recalibration for indels
+#use rule recal_VQSE_SNVs_1rd as recal_VQSE_Indels_1rd with:
+ #output:
+ #vcf="results/{sample}/rnaseq/indel/htcaller/{replicate}_indel.1rd.recal.vcf",
+ #idx="results/{sample}/rnaseq/indel/htcaller/{replicate}_indel.1rd.recal.vcf.idx",
+ #tranches="results/{sample}/rnaseq/indel/htcaller/{replicate}_indel.1rd.tranches"
+ #message:
+ #"Recalibrate variants (INDEL) on sample:{wildcards.sample} with replicate:{wildcards.replicate}"
+ #log:
+ #"logs/{sample}/gatk/vqsr/{replicate}_1rd_indel_recal.log"
+ #params:
+ #mode="BOTH",
+ #resources={
+ #"hapmap": {"known": False, "training": True, "truth": True, "prior": 15.0},
+ #"omni": {"known": False, "training": True, "truth": True, "prior": 12.0},
+ #"g1k": {"known": False, "training": True, "truth": True, "prior": 10.0},
+ #"dbsnp": {"known": False, "training": False, "truth": False, "prior": 2.0},
+ #},
+ #annotation=["MQ", "QD", "MQRankSum", "ReadPosRankSum", "FS", "SOR", "DP"],
+ #extra=""
+
-use rule htc_first_snp_apply_vqsr as htc_first_indel_apply_vsqr with:
- input:
- vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.vcf",
- recal="results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.recal.vcf",
- tranches="results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.all.tranches",
- ref="resources/refs/genome.fasta",
- output:
- vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.filt.vcf"
- output:
- log:
- "logs/{sample}/gatk/vqsr/apply.final.indel.log"
- params:
- mode="INDEL",
- extra="--truth-sensitivity-filter-level 99.0", # optional
+#use rule htc_first_snp_apply_vqsr as htc_first_indel_apply_vsqr with:
+ #input:
+ #vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.vcf",
+ #recal="results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.recal.vcf",
+ #tranches="results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.all.tranches",
+ #ref="resources/refs/genome.fasta",
+ #output:
+ #vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.filt.vcf"
+ #output:
+ #log:
+ #"logs/{sample}/gatk/vqsr/apply.final.indel.log"
+ #params:
+ #mode="INDEL",
+ #extra="--truth-sensitivity-filter-level 99.0", # optional
-rule gatk_baserecalibrator:
- input:
- bam="results/{sample}/rnaseq/align/realigned.bam",
- ref="resources/refs/genome.fasta",
- dict="resources/refs/genome.dict",
- known=["results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.filt.vcf",
- "results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.filt.vcf"]
- output:
- recal_table="results/{sample}/rnaseq/indel/htcaller/variants.1rd.baserecal.grp"
- log:
- "logs/gatk/baserecalibrator/{sample}.log",
- params:
- extra="", # optional
- java_opts="", # optional
- resources:
- mem_mb=10240,
- wrapper:
- "v1.31.1/bio/gatk/baserecalibrator"
+#rule gatk_baserecalibrator:
+ #input:
+ #bam="results/{sample}/rnaseq/align/realigned.bam",
+ #ref="resources/refs/genome.fasta",
+ #dict="resources/refs/genome.dict",
+ #known=["results/{sample}/rnaseq/indel/htcaller/variants.1rd.indel.filt.vcf",
+ #"results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.filt.vcf"]
+ #output:
+ #recal_table="results/{sample}/rnaseq/indel/htcaller/variants.1rd.baserecal.grp"
+ #log:
+ #"logs/gatk/baserecalibrator/{sample}.log",
+ #params:
+ #extra="", # optional
+ #java_opts="", # optional
+ #resources:
+ #mem_mb=10240,
+ #wrapper:
+ #"v1.31.1/bio/gatk/baserecalibrator"
-rule gatk_applybqsr:
- input:
- bam="results/{sample}/rnaseq/align/realigned.bam",
- ref="resources/refs/genome.fasta",
- dict="resources/refs/genome.dict",
- recal_table="results/{sample}/rnaseq/indel/htcaller/variants.1rd.baserecal.grp"
- output:
- bam="results/{sample}/rnaseq/indel/htcaller/variants.1rd.baserecal.bam"
- log:
- "logs/gatk/gatk_applybqsr/{sample}.log",
- params:
- extra="", # optional
- java_opts="", # optional
- embed_ref=True, # embed the reference in cram output
- resources:
- mem_mb=1024,
- wrapper:
- "v1.31.1/bio/gatk/applybqsr"
+#rule gatk_applybqsr:
+ #input:
+ #bam="results/{sample}/rnaseq/align/realigned.bam",
+ #ref="resources/refs/genome.fasta",
+ #dict="resources/refs/genome.dict",
+ #recal_table="results/{sample}/rnaseq/indel/htcaller/variants.1rd.baserecal.grp"
+ #output:
+ #bam="results/{sample}/rnaseq/indel/htcaller/variants.1rd.baserecal.bam"
+ #log:
+ #"logs/gatk/gatk_applybqsr/{sample}.log",
+ #params:
+ #extra="", # optional
+ #java_opts="", # optional
+ #embed_ref=True, # embed the reference in cram output
+ #resources:
+ #mem_mb=1024,
+ #wrapper:
+ #"v1.31.1/bio/gatk/applybqsr"
-rule htcaller_main:
- input:
- # single or list of bam files
- bam="results/{sample}/rnaseq/indel/htcaller/variants.1rd.baserecal.bam",
- ref="resources/refs/genome.fasta",
- known="resources/vqsr/dbSNP_b150.vcf.gz" # optional
- output:
- vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.vcf"
- log:
- "logs/gatk/haplotypecaller/{sample}.1rd.log",
- params:
- extra="", # optional
- java_opts="", # optional
- threads: config['threads']
- resources:
- mem_mb=1024,
- wrapper:
- "v1.31.1/bio/gatk/haplotypecaller"
+#rule htcaller_main:
+ #input:
+ ## single or list of bam files
+ #bam="results/{sample}/rnaseq/indel/htcaller/variants.1rd.baserecal.bam",
+ #ref="resources/refs/genome.fasta",
+ #known="resources/vqsr/dbSNP_b150.vcf.gz" # optional
+ #output:
+ #vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.vcf"
+ #log:
+ #"logs/gatk/haplotypecaller/{sample}.1rd.log",
+ #params:
+ #extra="", # optional
+ #java_opts="", # optional
+ #threads: config['threads']
+ #resources:
+ #mem_mb=1024,
+ #wrapper:
+ #"v1.31.1/bio/gatk/haplotypecaller"
-use rule htc_first_snp_recal as htc_final_snp_recal with:
- input:
- vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.vcf",
- ref="resources/refs/genome.fasta",
- hapmap="resources/vqsr/hapmap_3.3.hg38.vcf.gz",
- omni="resources/vqsr/1000G_omni2.5.hg38.vcf.gz",
- g1k="resources/vqsr/1000G_phase1.snps.high_confidence.hg38.vcf.gz",
- dbsnp="resources/vqsr/dbSNP_b150.vcf.gz",
- output:
- vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.snp.recal.vcf",
- idx="results/{sample}/rnaseq/indel/htcaller/variants.final.snp.recal.vcf.idx",
- tranches="results/{sample}/rnaseq/indel/htcaller/variants.final.snp.all.tranches"
- log:
- "logs/{sample}/gatk/vqsr/recal.final.snp.log"
+#use rule htc_first_snp_recal as htc_final_snp_recal with:
+ #input:
+ #vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.vcf",
+ #ref="resources/refs/genome.fasta",
+ #hapmap="resources/vqsr/hapmap_3.3.hg38.vcf.gz",
+ #omni="resources/vqsr/1000G_omni2.5.hg38.vcf.gz",
+ #g1k="resources/vqsr/1000G_phase1.snps.high_confidence.hg38.vcf.gz",
+ #dbsnp="resources/vqsr/dbSNP_b150.vcf.gz",
+ #output:
+ #vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.snp.recal.vcf",
+ #idx="results/{sample}/rnaseq/indel/htcaller/variants.final.snp.recal.vcf.idx",
+ #tranches="results/{sample}/rnaseq/indel/htcaller/variants.final.snp.all.tranches"
+ #log:
+ #"logs/{sample}/gatk/vqsr/recal.final.snp.log"
-use rule htc_first_snp_apply_vqsr as htc_final_snp_apply_vsqr with:
- input:
- vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.vcf",
- recal="results/{sample}/rnaseq/indel/htcaller/variants.final.snp.recal.vcf",
- tranches="results/{sample}/rnaseq/indel/htcaller/variants.final.snp.all.tranches",
- ref="resources/refs/genome.fasta",
- output:
- vcf="results/{sample}/variants/germ.snvs.vcf"
- log:
- "logs/{sample}/gatk/vqsr/apply.final.snp.log"
+#use rule htc_first_snp_apply_vqsr as htc_final_snp_apply_vsqr with:
+ #input:
+ #vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.vcf",
+ #recal="results/{sample}/rnaseq/indel/htcaller/variants.final.snp.recal.vcf",
+ #tranches="results/{sample}/rnaseq/indel/htcaller/variants.final.snp.all.tranches",
+ #ref="resources/refs/genome.fasta",
+ #output:
+ #vcf="results/{sample}/variants/germ.snvs.vcf"
+ #log:
+ #"logs/{sample}/gatk/vqsr/apply.final.snp.log"
-use rule htc_first_snp_recal as htc_final_indel_recal with:
- input:
- vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.vcf",
- ref="resources/refs/genome.fasta",
- hapmap="resources/vqsr/hapmap_3.3.hg38.vcf.gz",
- omni="resources/vqsr/1000G_omni2.5.hg38.vcf.gz",
- g1k="resources/vqsr/1000G_phase1.snps.high_confidence.hg38.vcf.gz",
- dbsnp="resources/vqsr/dbSNP_b150.vcf.gz",
- output:
- vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.indel.recal.vcf",
- idx="results/{sample}/rnaseq/indel/htcaller/variants.final.indel.recal.vcf.idx",
- tranches="results/{sample}/rnaseq/indel/htcaller/variants.final.indel.all.tranches"
- log:
- "logs/{sample}/gatk/vqsr/recal.final.indel.log"
- params:
- mode="INDEL",
- resources={
- "hapmap": {"known": False, "training": True, "truth": True, "prior": 15.0},
- "omni": {"known": False, "training": True, "truth": True, "prior": 12.0},
- "g1k": {"known": False, "training": True, "truth": True, "prior": 10.0},
- "dbsnp": {"known": False, "training": False, "truth": False, "prior": 2.0},
- },
- annotation=["MQ", "QD", "MQRankSum", "ReadPosRankSum", "FS", "SOR", "DP"],
- extra=""
+#use rule htc_first_snp_recal as htc_final_indel_recal with:
+ #input:
+ #vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.vcf",
+ #ref="resources/refs/genome.fasta",
+ #hapmap="resources/vqsr/hapmap_3.3.hg38.vcf.gz",
+ #omni="resources/vqsr/1000G_omni2.5.hg38.vcf.gz",
+ #g1k="resources/vqsr/1000G_phase1.snps.high_confidence.hg38.vcf.gz",
+ #dbsnp="resources/vqsr/dbSNP_b150.vcf.gz",
+ #output:
+ #vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.indel.recal.vcf",
+ #idx="results/{sample}/rnaseq/indel/htcaller/variants.final.indel.recal.vcf.idx",
+ #tranches="results/{sample}/rnaseq/indel/htcaller/variants.final.indel.all.tranches"
+ #log:
+ #"logs/{sample}/gatk/vqsr/recal.final.indel.log"
+ #params:
+ #mode="INDEL",
+ #resources={
+ #"hapmap": {"known": False, "training": True, "truth": True, "prior": 15.0},
+ #"omni": {"known": False, "training": True, "truth": True, "prior": 12.0},
+ #"g1k": {"known": False, "training": True, "truth": True, "prior": 10.0},
+ #"dbsnp": {"known": False, "training": False, "truth": False, "prior": 2.0},
+ #},
+ #annotation=["MQ", "QD", "MQRankSum", "ReadPosRankSum", "FS", "SOR", "DP"],
+ #extra=""
-use rule htc_first_snp_apply_vqsr as htc_final_indel_apply_vsqr with:
- input:
- vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.vcf",
- recal="results/{sample}/rnaseq/indel/htcaller/variants.final.indel.recal.vcf",
- tranches="results/{sample}/rnaseq/indel/htcaller/variants.final.indel.all.tranches",
- ref="resources/refs/genome.fasta",
- output:
- vcf="results/{sample}/variants/germ.indel.vcf"
- log:
- "logs/{sample}/gatk/vqsr/apply.final.indel.log"
- params:
- mode="INDEL",
- extra="--truth-sensitivity-filter-level 99.0", # optional
+#use rule htc_first_snp_apply_vqsr as htc_final_indel_apply_vsqr with:
+ #input:
+ #vcf="results/{sample}/rnaseq/indel/htcaller/variants.final.vcf",
+ #recal="results/{sample}/rnaseq/indel/htcaller/variants.final.indel.recal.vcf",
+ #tranches="results/{sample}/rnaseq/indel/htcaller/variants.final.indel.all.tranches",
+ #ref="resources/refs/genome.fasta",
+ #output:
+ #vcf="results/{sample}/variants/germ.indel.vcf"
+ #log:
+ #"logs/{sample}/gatk/vqsr/apply.final.indel.log"
+ #params:
+ #mode="INDEL",
+ #extra="--truth-sensitivity-filter-level 99.0", # optional
diff --git a/workflow/rules/hlatyping.smk b/workflow/rules/hlatyping.smk
index b73b824..6de2e1c 100644
--- a/workflow/rules/hlatyping.smk
+++ b/workflow/rules/hlatyping.smk
@@ -1,7 +1,11 @@
rule get_hla_panel:
output:
- dna="misc/hla/hla_reference_dna.fasta",
- rna="misc/hla/hla_reference_rna.fasta"
+ dna="resources/hla/hla_ref_dna.fasta",
+ rna="resources/hla/hla_ref_rna.fasta"
+ conda:
+ "../envs/basic.yml"
+ log:
+ "logs/hla_panel.log"
shell:
"""
curl -o {output.dna} https://raw.githubusercontent.com/FRED-2/OptiType/v1.3.5/data/hla_reference_dna.fasta
@@ -10,19 +14,27 @@ rule get_hla_panel:
rule index_hla_panel:
input:
- dna="misc/hla/hla_reference_dna.fasta",
- rna="misc/hla/hla_reference_rna.fasta",
+ dna="resources/hla/hla_ref_dna.fasta",
+ rna="resources/hla/hla_ref_rna.fasta"
output:
- "misc/hla/hla_dna.index.lf.drp",
- "misc/hla/hla_rna.index.lf.drp",
+ dna=multiext("resources/hla/yara/idx/dna",
+ ".lf.drp", ".lf.drs", ".lf.drv",
+ ".lf.pst", ".rid.concat", ".rid.limits",
+ ".sa.ind", ".sa.len", ".sa.val",
+ ".txt.concat", ".txt.limits", ".txt.size"),
+ rna=multiext("resources/hla/yara/idx/rna",
+ ".lf.drp", ".lf.drs", ".lf.drv",
+ ".lf.pst", ".rid.concat", ".rid.limits",
+ ".sa.ind", ".sa.len", ".sa.val",
+ ".txt.concat", ".txt.limits", ".txt.size")
log:
"logs/yara_indexer.log"
conda:
"../envs/yara.yml"
shell:
"""
- yara_indexer -o misc/hla/hla.index {input.dna} > {log}
- yara_indexer -o misc/hla/hla.index {input.rna} >> {log}
+ yara_indexer -o resources/hla/yara/idx/dna {input.dna} > {log}
+ yara_indexer -o resources/hla/yara/idx/rna {input.rna} >> {log}
"""
rule prepare_bams:
@@ -37,6 +49,7 @@ rule prepare_bams:
shell:
"samtools merge -f - {input} | samtools fastq - > {output}"
+
rule filter_hla:
input:
reads="results/hla/all.fastq",
diff --git a/workflow/rules/indel.smk b/workflow/rules/indel.smk
index 10aa24f..87a7a90 100644
--- a/workflow/rules/indel.smk
+++ b/workflow/rules/indel.smk
@@ -1,15 +1,17 @@
import os
from snakemake.remote import HTTP
-rule transindel_build:
+rule detect_long_indel_ti_build:
input:
- bam = "results/{sample}/rnaseq/align/realigned.bam",
- idx = "results/{sample}/rnaseq/align/realigned.bam.bai"
+ bam = "results/{sample}/rnaseq/align/{replicate}_realigned.bam",
+ idx = "results/{sample}/rnaseq/align/{replicate}_realigned.bam.bai"
output:
- bam="results/{sample}/rnaseq/indel/transindel/build.bam",
- idx="results/{sample}/rnaseq/indel/transindel/build.bam.bai"
+ bam="results/{sample}/rnaseq/indel/transindel/{replicate}_build.bam",
+ idx="results/{sample}/rnaseq/indel/transindel/{replicate}_build.bam.bai"
+ message:
+ "Building new BAM file with redefined CIGAR string using transindel build on sample:{wildcards.sample} with replicate:{wildcards.replicate}"
log:
- "logs/transindel/build/{sample}.log"
+ "logs/{sample}/transindel/{replicate}_build.log"
conda:
"../envs/transindel.yml"
shell:
@@ -18,18 +20,20 @@ rule transindel_build:
-i {input.bam} \
-o {output.bam} \
-r resources/refs/genome.fasta \
- -g resources/refs/genome.gtf > {log}
- samtools index {output.bam} -o {output.idx} >> {log}
+ -g resources/refs/genome.gtf > {log} 2>&1
+ samtools index {output.bam} -o {output.idx} >> {log} 2>&1
"""
-rule transindel_call:
+rule detect_long_indel_ti_call:
input:
- bam = "results/{sample}/rnaseq/indel/transindel/build.bam",
- bai = "results/{sample}/rnaseq/indel/transindel/build.bam.bai"
+ bam = "results/{sample}/rnaseq/indel/transindel/{replicate}_build.bam",
+ bai = "results/{sample}/rnaseq/indel/transindel/{replicate}_build.bam.bai"
output:
- "results/{sample}/rnaseq/indel/transindel/call.indel.vcf"
+ "results/{sample}/rnaseq/indel/transindel/{replicate}_call.indel.vcf"
+ message:
+ "Calling long indels using transindel on sample:{wildcards.sample} with replicate:{wildcards.replicate}"
log:
- "logs/transindel/call/{sample}.log"
+ "logs/{sample}/transindel/{replicate}_call.log"
conda:
"../envs/transindel.yml"
params:
@@ -39,59 +43,84 @@ rule transindel_call:
python workflow/scripts/transIndel/transIndel_call.py \
-i {input.bam} \
-l 10 \
- -o results/{wildcards.sample}/rnaseq/indel/transindel/call \
- -m {params}
+ -o results/{wildcards.sample}/rnaseq/indel/transindel/{wildcards.replicate}_call \
+ -m {params} > {log} 2>&1
"""
# resove alleles and remove PCR slippage
-rule slippage_removal:
+rule long_indel_slippage_removal:
input:
- "results/{sample}/rnaseq/indel/transindel/call.indel.vcf"
+ "results/{sample}/rnaseq/indel/transindel/{replicate}_call.indel.vcf"
output:
- "results/{sample}/variants/long.indel.vcf"
+ "results/{sample}/rnaseq/indel/transindel/{replicate}_sliprem.vcf"
+ message:
+ "Resolving alleles and removing PCR slippage using transindel on sample:{wildcards.sample} with replicate:{wildcards.replicate}"
log:
- "logs/indel/sliprem{sample}.log"
+ "logs/{sample}/transindel/{replicate}_sliprem.log"
conda:
"../envs/transindel.yml"
shell:
"""
python3 workflow/scripts/slippage_removal.py \
- resources/refs/genome.fasta {input} {output} > {log}
+ resources/refs/genome.fasta {input} {output} > {log} 2>&1
"""
-rule gatk_mutect2:
+# combines the replicates into one vcf
+rule combine_longindels:
+ input:
+ expand("results/{sample}/rnaseq/indel/transindel/{replicate}_sliprem.vcf",
+ sample=config['data']['name'],
+ replicate=config['data']['rnaseq'].keys())
+ output:
+ "results/{sample}/rnaseq/indel/long.indel.vcf"
+ message:
+ "Combining long indels from replicates on sample:{wildcards.sample}"
+ log:
+ "logs/{sample}/transindel/combine_replicates.log"
+ conda:
+ "../envs/manipulate_vcf.yml"
+ shell:
+ """
+ python workflow/scripts/combine_vcf.py '{input}' {output} > {log} 2>&1
+ """
+
+# detects short somatic variants (SNVs and indels) using mutect2
+rule detect_short_indels_m2:
input:
fasta="resources/refs/genome.fasta",
- map="results/{sample}/rnaseq/align/realigned.bam"
+ map="results/{sample}/rnaseq/align/{replicate}_realigned.bam"
output:
- vcf="results/{sample}/rnaseq/indel/mutect2/variants.vcf",
- bam="results/{sample}/rnaseq/indel/mutect2/variants.bam",
+ vcf="results/{sample}/rnaseq/indel/mutect2/{replicate}_variants.vcf",
+ bam="results/{sample}/rnaseq/indel/mutect2/{replicate}_variants.bam",
message:
- "Detection of somatic SNVs/Indels with Mutect2 on sample {wildcards.sample}"
+ "Detection of somatic SNVs/Indels with Mutect2 on sample:{wildcards.sample} with replicate:{wildcards.replicate}"
threads: config['threads']
resources:
mem_mb=10024,
params:
extra="",
log:
- "logs/gatk/mutect2/{sample}.log",
+ "logs/{sample}/gatk/mutect2/{replicate}.log",
wrapper:
"v1.31.1/bio/gatk/mutect"
-rule gatk_filtermutectcalls:
+# filters short somatic variants (SNVs and indels) using FilterMutectCalls
+rule filter_short_indels:
input:
- vcf="results/{sample}/rnaseq/indel/mutect2/variants.vcf",
- bam="results/{sample}/rnaseq/indel/mutect2/variants.bam",
+ vcf="results/{sample}/rnaseq/indel/mutect2/{replicate}_variants.vcf",
+ bam="results/{sample}/rnaseq/indel/mutect2/{replicate}_variants.bam",
ref="resources/refs/genome.fasta",
# intervals="intervals.bed",
# contamination="", # from gatk CalculateContamination
# segmentation="", # from gatk CalculateContamination
# f1r2="", # from gatk LearnReadOrientationBias
output:
- vcf="results/{sample}/rnaseq/indel/mutect2/variants_flt.vcf"
+ vcf="results/{sample}/rnaseq/indel/mutect2/{replicate}_variants.flt.vcf"
+ message:
+ "Filtering somatic SNVs/Indels with FilterMutectCalls on sample:{wildcards.sample} and replicate:{wildcards.replicate}"
log:
- "logs/gatk/filtermutect/{sample}.log",
+ "logs/{sample}/gatk/filtermutect/{replicate}.log",
params:
extra="--max-alt-allele-count 3",
java_opts="", # optional
@@ -100,15 +129,34 @@ rule gatk_filtermutectcalls:
wrapper:
"v1.31.1/bio/gatk/filtermutectcalls"
-
-rule gatk_select_SNPs:
+rule combine_short_indels_m2:
+ input:
+ expand("results/{sample}/rnaseq/indel/mutect2/{replicate}_variants.flt.vcf",
+ sample=config['data']['name'],
+ replicate=config['data']['rnaseq'].keys())
+ output:
+ "results/{sample}/rnaseq/indel/mutect2/variants.vcf"
+ message:
+ "Combining somatic SNVs/Indels with Mutect2 on sample:{wildcards.sample}"
+ log:
+ "logs/{sample}/gatk/mutect2/combine_replicates.log"
+ conda:
+ "../envs/manipulate_vcf.yml"
+ shell:
+ """
+ python workflow/scripts/combine_vcf.py '{input}' {output} > {log} 2>&1
+ """
+
+rule select_SNVs_m2:
input:
- vcf="results/{sample}/rnaseq/indel/mutect2/variants_flt.vcf",
+ vcf="results/{sample}/rnaseq/indel/mutect2/variants.vcf",
ref="resources/refs/genome.fasta",
output:
- vcf="results/{sample}/variants/snvs.vcf"
+ vcf="results/{sample}/rnaseq/indel/snvs.vcf"
+ message:
+ "Selecting somatic SNVs with SelectVariants on sample:{wildcards.sample}"
log:
- "logs/gatk/select/{sample}.snvs.log",
+ "logs/{sample}/gatk/select/somatic_snvs.log",
params:
extra="--select-type-to-include SNP", # optional filter arguments, see GATK docs
java_opts="", # optional
@@ -117,13 +165,14 @@ rule gatk_select_SNPs:
wrapper:
"v1.31.1/bio/gatk/selectvariants"
-
-rule gatk_select_Indels:
+rule select_short_indels_m2:
input:
- vcf="results/{sample}/rnaseq/indel/mutect2/variants_flt.vcf",
+ vcf="results/{sample}/rnaseq/indel/mutect2/variants.vcf",
ref="resources/refs/genome.fasta",
output:
- vcf="results/{sample}/variants/short.indel.vcf"
+ vcf="results/{sample}/rnaseq/indel/short.indel.vcf"
+ message:
+ "Selecting short somatic indels with SelectVariants on sample:{wildcards.sample}"
log:
"logs/gatk/select/{sample}.indel.log",
params:
@@ -134,85 +183,3 @@ rule gatk_select_Indels:
wrapper:
"v1.31.1/bio/gatk/selectvariants"
-
-
-
-# recalibrate variants (SNP)
-#rule gatk_variant_recal_snp2:
-# input:
-# vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.vcf",
-# ref="resources/refs/genome.fasta",
-# hapmap="resources/vqsr/hapmap_3.3.hg38.vcf.gz",
-# omni="resources/vqsr/1000G_omni2.5.hg38.vcf.gz",
-# g1k="resources/vqsr/1000G_phase1.snps.high_confidence.hg38.vcf.gz",
-# dbsnp="resources/vqsr/dbSNP_b150.vcf.gz",
-
-# output:
-# vcf="results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.recal.vcf",
-# idx="results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.recal.vcf.idx",
-# tranches="results/{sample}/rnaseq/indel/htcaller/variants.1rd.snp.all.tranches"
-# log:
-# "logs/gatk/variantrecalibrator/{sample}.log",
-
-# params:
-# mode="SNP", # set mode, must be either SNP, INDEL or BOTH
-# resources={
-# "hapmap": {"known": False, "training": True, "truth": True, "prior": 15.0},
-# "omni": {"known": False, "training": True, "truth": True, "prior": 12.0},
-# "g1k": {"known": False, "training": True, "truth": True, "prior": 10.0},
-# "dbsnp": {"known": False, "training": False, "truth": False, "prior": 2.0},
-# },
-# annotation=["MQ", "QD", "MQRankSum", "ReadPosRankSum", "FS", "SOR", "DP"],
-# extra=""
-# threads: config['threads']
-# resources:
-# mem_mb=1024,
-# wrapper:
-# "v1.31.1/bio/gatk/variantrecalibrator"
-
-
-
-
-#rule gatk_applybqsr:
-# input:
-# bam="results/indel/realign/{sample}.bam",
-# ref="resources/refs/genome.fasta",
-# dict="resources/refs/genome.dict",
-# recal_table="recal/{sample}.grp",
-# output:
-# bam="recal/{sample}.bam",
-# log:
-# "logs/gatk/gatk_applybqsr/{sample}.log",
-# params:
-# extra="", # optional
-# java_opts="", # optional
-# embed_ref=True, # embed the reference in cram output
-# resources:
-# mem_mb=1024,
-# wrapper:
-# "v1.31.1/bio/gatk/applybqsr"
-
-
-
-#rule picard_split_vcfs:
-# input:
-# "results/indel/haplotypecaller/{sample}.vcf"
-# output:
-# snp="results/indel/{sample}.snp.vcf",
-# indel="results/indel/{sample}.indel.vcf"
-# log:
-# "logs/spltvcfs{sample}.log"
-# conda:
-# "../envs/picard.yml"
-# shell:
-# """
-# picard SplitVcfs I={input} \
-# SNP_OUTPUT={output.snp} \
-# INDEL_OUTPUT={output.indel}
-# STRICT=false
-# """
-
-
-
-
-
diff --git a/workflow/rules/preproc.smk b/workflow/rules/preproc.smk
index 7efb4c8..5dbace5 100644
--- a/workflow/rules/preproc.smk
+++ b/workflow/rules/preproc.smk
@@ -1,3 +1,160 @@
+rule trimmomatic_SE:
+ input:
+ unpack(get_raw_reads)
+ output:
+ "results/{sample}/rnaseq/preproc/reads.fq.gz"
+ log:
+ "logs/{sample}/trimmomatic.log"
+ params:
+ trimmer=["TRAILING:3"],
+ extra="",
+ compression_level="-9"
+ threads: config['threads']
+ resources:
+ mem_mb=1024
+ wrapper:
+ "v2.1.1/bio/trimmomatic/se"
+
+# if rnaseq_readtype == "PE":
+rule trimmomatic_PE:
+ input:
+ unpack(get_raw_reads)
+ output:
+ r1="results/{sample}/{seqtype}/reads/{replicate}_preproc_r1.fq.gz",
+ r2="results/{sample}/{seqtype}/reads/{replicate}_preproc_r2.fq.gz",
+ r1_unpaired="results/{sample}/{seqtype}/reads/{replicate}_preproc_r1_unpaired.fq.gz",
+ r2_unpaired="results/{sample}/{seqtype}/reads/{replicate}_preproc_r2_unpaired.fq.gz"
+ params:
+ trimmer=[f"MINLEN:{config['preproc']['minlen']}"]
+ + [f"TRAILING:{config['preproc']['trailing']}" if config['preproc']['trailing'] is not None else ""]
+ + [f"LEADING:{config['preproc']['leading']}" if config['preproc']['leading'] is not None else ""]
+ + [f"SLIDINGWINDOW:{config['preproc']['slidingwindow']['windowsize']}:{config['preproc']['slidingwindow']['quality']}" if config['preproc']['slidingwindow']['activate'] else ""]
+ + [f"ILLUMINACLIP:{config['preproc']['adapters']}:2:30:10" if config['preproc']['adapters'] is not None else ""],
+ extra=""
+ log:
+ "logs/{sample}/trimmomatic/{replicate}_{seqtype}.log"
+ threads: config['threads']
+ resources:
+ mem_mb=1024
+ wrapper:
+ "v2.1.1/bio/trimmomatic/pe"
+
+rule add_rg_fastq_PE:
+ input:
+# r1="results/{sample}/{seqtype}/reads/{replicate}_preproc_r1.fq.gz",
+# r2="results/{sample}/{seqtype}/reads/{replicate}_preproc_r2.fq.gz",
+ unpack(get_reads),
+ output:
+ r1="results/{sample}/{seqtype}/reads/{replicate}_preproc_RG_r1.fq.gz",
+ r2="results/{sample}/{seqtype}/reads/{replicate}_preproc_RG_r2.fq.gz"
+ message:
+ "Adding read group information to fastq files"
+ log:
+ "logs/{sample}/add_rg/{replicate}_{seqtype}.log"
+ conda:
+ "../envs/basic.yml"
+ shell:
+ """
+ bash workflow/scripts/addrgfq.sh {input.r1} 2> {log} | gzip -c - > {output.r1}
+ bash workflow/scripts/addrgfq.sh {input.r2} 2>> {log} | gzip -c - > {output.r2}
+ """
+
+
+
+checkpoint splitfastq:
+ input:
+ unpack(get_splitfastq_input)
+ output:
+ directory("results/{sample}/reads/rnaseq/{replicate}/")
+ log:
+ "logs/{sample}/splitfastq/{replicate}.log"
+ conda:
+ "../envs/splitfastq.yml"
+ threads: 0
+ shell:
+ """
+ python workflow/scripts/splitfastq.py '{input}' {output} 20000000 > {log} 2>&1
+ """
+
+rule star_align_fastq:
+ input:
+ fq1 = "results/{sample}/reads/rnaseq/{replicate}/r1/reads_{i}.fq.gz",
+ fq2 = "results/{sample}/reads/rnaseq/{replicate}/r2/reads_{i}.fq.gz",
+ idx = "resources/refs/star/",
+ output:
+ aln = "results/{sample}/rnaseq/align/{replicate}/splt/reads_{i}.bam",
+ log = "results/{sample}/rnaseq/align/{replicate}/splt/reads_{i}.log",
+ sj = "results/{sample}/rnaseq/align/{replicate}/splt/reads_{i}.tab"
+ log:
+ "logs/star_align/{sample}_{replicate}_{i}.log"
+ params:
+ extra="--outSAMtype BAM SortedByCoordinate --chimSegmentMin 10 --chimOutType WithinBAM HardClip --genomeSAindexNbases 10 --outSAMattributes RG --outSAMattrRGline ID:noRG"
+ threads: config['threads']
+ wrapper:
+ "v1.26.0/bio/star/align"
+
+rule merge_alignment_results_fastq:
+ input:
+ aggregate_alignments_fastq
+ output:
+ "results/{sample}/rnaseq/align/{replicate}_aligned.bam",
+ log:
+ "logs/samtools/merge/{sample}_{replicate}.log",
+ params:
+ extra="", # optional additional parameters as string
+ threads: config['threads']
+ wrapper:
+ "v1.32.1/bio/samtools/merge"
+
+rule star_align_pe:
+ input:
+ fq1 = "results/{sample}/rnaseq/reads/fastqfiles/r1/inputreads_{i}.fq.gz",
+ fq2 = "results/{sample}/rnaseq/reads/fastqfiles/r2/inputreads_{i}.fq.gz",
+ idx = "resources/refs/star/",
+ output:
+ aln = "results/{sample}/rnaseq/align/bamfiles/inputreads_{i}.bam",
+ log = "results/{sample}/rnaseq/align/bamfiles/inputreads_{i}.log",
+ sj = "results/{sample}/rnaseq/align/bamfiles/inputreads_{i}.tab"
+ log:
+ "logs/star_align/{sample}_{i}.log"
+ params:
+ extra="--outSAMtype BAM SortedByCoordinate --chimSegmentMin 10 --chimOutType WithinBAM HardClip --genomeSAindexNbases 10 --outSAMattributes RG --outSAMattrRGline ID:noRG"
+ threads: config['threads']
+ wrapper:
+ "v1.26.0/bio/star/align"
+
+rule merge_alignment_results_pe:
+ input:
+ aggregate_alignments_pe
+ output:
+ "results/{sample}/rnaseq/align/aligned.bam",
+ log:
+ "logs/samtools/merge/{sample}.log",
+ params:
+ extra="", # optional additional parameters as string
+ threads: config['threads']
+ wrapper:
+ "v1.32.1/bio/samtools/merge"
+
+
+ #rule align_with_star_fq:
+ #input:
+ #unpack(get_align_input),
+ #idx="resources/refs/star/"
+ #output:
+ ## see STAR manual for additional output files
+ #aln="results/{sample}/rnaseq/align/aligned.bam",
+ #log="logs/{sample}/star/Log1.out",
+ #sj="results/{sample}/rnaseq/align/sj.out.tab"
+ #log:
+ #"logs/{sample}/star/Log.out"
+ #params:
+ #extra="--outSAMtype BAM SortedByCoordinate --chimSegmentMin 10 --chimOutType WithinBAM HardClip --genomeSAindexNbases 10 --outSAMattributes RG --outSAMattrRGline ID:xxx"
+ #threads: config['threads']
+ #wrapper:
+ #"v2.1.1/bio/star/align"
+
+
if rnaseq_filetype == ".bam":
checkpoint split_bamfile_RG:
input:
@@ -17,7 +174,6 @@ if rnaseq_filetype == ".bam":
-h {input} -f {output}/%!.%. {input}
"""
-if rnaseq_filetype == ".bam":
rule bam_to_fastq:
input:
"results/{sample}/rnaseq/reads/bamfiles/{readgroup}.bam"
@@ -34,7 +190,6 @@ if rnaseq_filetype == ".bam":
| samtools fastq -OT RG -@ {threads} - | gzip -c - > {output}
"""
-if rnaseq_filetype == ".bam":
rule align_with_star:
input:
fq1 = "results/{sample}/rnaseq/reads/fastqfiles/{readgroup}.fq.gz",
@@ -51,7 +206,6 @@ if rnaseq_filetype == ".bam":
wrapper:
"v1.26.0/bio/star/align"
-
if rnaseq_filetype == ".bam":
rule merge_alignment_results:
input:
@@ -69,13 +223,13 @@ if rnaseq_filetype == ".bam":
rule samtools_postproc:
input:
- "results/{sample}/rnaseq/align/aligned.bam"
+ "results/{sample}/rnaseq/align/{replicate}_aligned.bam"
output:
- "results/{sample}/rnaseq/align/ready.bam"
+ "results/{sample}/rnaseq/align/{replicate}_ready.bam"
conda:
"../envs/samtools.yml"
log:
- "logs/samtools/postproc/{sample}.log"
+ "logs/samtools/postproc/{sample}_{replicate}.log"
threads: 6 # more threads brings no significant increase
shell:
"""
@@ -89,11 +243,11 @@ rule samtools_postproc:
rule samtools_postproc_index:
input:
- "results/{sample}/rnaseq/align/ready.bam"
+ "results/{sample}/rnaseq/align/{replicate}_ready.bam"
output:
- "results/{sample}/rnaseq/align/ready.bam.bai"
+ "results/{sample}/rnaseq/align/{replicate}_ready.bam.bai"
log:
- "logs/samtools/index/{sample}.log"
+ "logs/samtools/index/postproc_{sample}_{replicate}.log"
params:
extra="", # optional additional parameters as string
threads: config['threads']
@@ -102,31 +256,37 @@ rule samtools_postproc_index:
# retrieve readgroups from bam file
-if rnaseq_filetype == ".bam":
- rule determine_readgroups:
- input:
- get_rnaseq_data
- output:
- "results/{sample}/rnaseq/reads/readgroups.txt"
- log:
- "logs/readgroups/{sample}.log"
- shell:
- """
- python workflow/scripts/get_readgroups.py {input} \
- {output} > {log} 2>&1
- """
+rule get_readgroups:
+ input:
+ get_readgroups_input
+ #"results/{sample}/rnaseq/align/{replicate}_ready.bam"
+ output:
+ "results/{sample}/rnaseq/reads/{replicate}_readgroups.txt"
+ conda:
+ "../envs/basic.yml"
+ log:
+ "logs/{sample}/get_readgroups/{replicate}.log"
+ shell:
+ """
+ python workflow/scripts/get_readgroups.py '{input}' \
+ {output} > {log} 2>&1
+ """
+
+
+
+
rule realign:
input:
- bam="results/{sample}/rnaseq/align/ready.bam",
- rg="results/{sample}/rnaseq/reads/readgroups.txt"
+ bam="results/{sample}/rnaseq/align/{replicate}_ready.bam",
+ rg="results/{sample}/rnaseq/reads/{replicate}_readgroups.txt"
output:
- "results/{sample}/rnaseq/align/realigned.bam"
+ "results/{sample}/rnaseq/align/{replicate}_realigned.bam"
threads: config['threads']
shell:
"""
- samtools collate -Oun128 {input.bam} \
- | samtools fastq -OT RG,BC - \
+ samtools collate -Oun128 {input.bam} \
+ | samtools fastq -OT RG -@ {threads} - \
| bwa mem -pt{threads} -CH <(cat {input.rg}) resources/refs/bwa/genome - \
| samtools sort -@6 -m1g - > {output}
"""
@@ -134,11 +294,11 @@ rule realign:
rule realign_index:
input:
- "results/{sample}/rnaseq/align/realigned.bam"
+ "results/{sample}/rnaseq/align/{replicate}_realigned.bam"
output:
- "results/{sample}/rnaseq/align/realigned.bam.bai"
+ "results/{sample}/rnaseq/align/{replicate}_realigned.bam.bai"
log:
- "logs/samtools/index/{sample}.log"
+ "logs/{sample}/realign_index/{sample}_{replicate}.log"
params:
extra="", # optional additional parameters as string
threads: config['threads']
diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
index 4b7dff2..4c8c120 100644
--- a/workflow/rules/ref.smk
+++ b/workflow/rules/ref.smk
@@ -81,3 +81,20 @@ rule bwa_index:
algorithm="bwtsw",
wrapper:
"v1.26.0/bio/bwa/index"
+
+
+rule create_sequence_dictionary:
+ input:
+ "resources/refs/genome.fasta"
+ output:
+ "resources/refs/genome.dict"
+ message:
+ "Create sequence dictionary of reference genome"
+ log:
+ "logs/picard/create_dict.log"
+ params:
+ extra="", # optional: extra arguments for picard.
+ resources:
+ mem_mb=1024,
+ wrapper:
+ "v1.31.1/bio/picard/createsequencedictionary"