From 33c6d83e988d39628b1a5ae3191cc5ccc58f4fce Mon Sep 17 00:00:00 2001 From: daler Date: Wed, 23 Oct 2019 13:50:50 -0400 Subject: [PATCH 1/4] initial draft of wes --- lib/patterns_targets.py | 25 + workflows/WES-somatic/Snakefile | 832 ++++++++++++++++++ .../WES-somatic/config/WES_patterns.yaml | 73 ++ workflows/WES-somatic/config/config.yaml | 25 + workflows/WES-somatic/config/sampletable.tsv | 30 + 5 files changed, 985 insertions(+) create mode 100644 workflows/WES-somatic/Snakefile create mode 100644 workflows/WES-somatic/config/WES_patterns.yaml create mode 100644 workflows/WES-somatic/config/config.yaml create mode 100644 workflows/WES-somatic/config/sampletable.tsv diff --git a/lib/patterns_targets.py b/lib/patterns_targets.py index 7d986b79..f8c34957 100644 --- a/lib/patterns_targets.py +++ b/lib/patterns_targets.py @@ -69,6 +69,31 @@ def __init__(self, config, patterns, workdir=None): self.n = [1] +class WESConfig(SeqConfig): + def __init__(self, config, patterns, workdir=None): + """ + Config object specific to WES workflows. 
+ + Fills in patterns to create targets + + Parameters + ---------- + + config : dict + + patterns : str + Path to patterns YAML file + + workdir : str + Config, patterns, and all paths in `config` should be interpreted + as relative to `workdir` + """ + SeqConfig.__init__(self, config, patterns, workdir) + + self.fill = dict(sample=self.samples, n=self.n) + self.targets = helpers.fill_patterns(self.patterns, self.fill, zip) + self.tumor_only = self.tumoronly = self.config['tumor_only'] + class RNASeqConfig(SeqConfig): def __init__(self, config, patterns, workdir=None): """ diff --git a/workflows/WES-somatic/Snakefile b/workflows/WES-somatic/Snakefile new file mode 100644 index 00000000..8530d337 --- /dev/null +++ b/workflows/WES-somatic/Snakefile @@ -0,0 +1,832 @@ +import sys + +sys.path.insert(0, srcdir('../..')) +import os +from textwrap import dedent +import yaml +import tempfile +import pandas as pd +from lib import common, cluster_specific, utils, helpers, aligners +from lib.patterns_targets import WESConfig + +if not workflow.overwrite_configfile: + configfile: 'config/config.yaml' +else: + configfile: workflow.overwrite_configfile + +include: '../references/Snakefile' +shell.prefix( + 'set -euo pipefail; export R_PROFILE_USER=; export TMPDIR={}' + .format(cluster_specific.tempdir_for_biowulf()) +) +shell.executable('/bin/bash') + +config = common.load_config(config) + +c = WESConfig(config, config.get('patterns', 'config/WES_patterns.yaml')) + +wildcard_constraints: + n = '[12]' + +#sampletable = pd.read_csv('config/sampletable.tsv', sep='\t') +#samples = sampletable.loc[:,'sample_id'] +#configfile: 'config/config.yaml' +known_sites_files = config['known_sites'] +#genotypes = ['tumor', 'normal'] +#if config['tumor_only']: +# genotypes=['tumor'] + +prefix = os.path.basename(config['fasta']) + +def render_r1_r2(pattern, r1_only=False): + return sorted(expand(pattern, sample='{sample}', genotype='{genotype}', n=c.n)) + +if 'filename' in ''.join(c.sampletable.columns): + # Convert the sampletable 
to be indexed by the first column, for + # convenience in generating the input/output filenames. + _st = c.sampletable.set_index(c.sampletable.columns[0]) + def filenames_for_sample(wc): + if c.is_paired: + if c.tumor_only: + return _st.loc[wc.sample, ['tumor_filename', 'tumor_R2_filename']] + else: + return _st.loc[wc.sample, ['normal_filename', 'normal_R2_filename', 'tumor_filename','tumor_R2_filename']] + else: + if c.tumor_only: + return _st.loc[wc.sample, ['tumor_filename']] + else: + return _st.loc[wc.sample, ['normal_filename', 'tumor_filename']] + + rule symlinks: + input: + filenames_for_sample + output: + render_r1_r2(c.patterns['fastq']['combined']) + run: + assert len(output) == len(input), (input, output) + for src, linkname in zip(input, output): + utils.make_relative_symlink(src, linkname) + +rule cutadapt: + input: + fastq = render_r1_r2(c.patterns['fastq']['combined']) + output: + fastq = render_r1_r2(c.patterns['cutadapt']) + log: + render_r1_r2(c.patterns['cutadapt'])[0] + '.log' + threads: 6 + run: + if c.is_paired: + shell( + 'cutadapt ' + '-o {output[0]} ' + '-p {output[1]} ' + '-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA ' + '-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT ' + '-q 20 ' + '-j {threads} ' + '--minimum-length 25 ' + '{input.fastq[0]} ' + '{input.fastq[1]} ' + '&> {log}' + ) + if not c.tumor_only: + shell( + 'cutadapt ' + '-o {output[2]} ' + '-p {output[3]} ' + '-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA ' + '-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT ' + '-q 20 ' + '-j {threads} ' + '--minimum-length 25 ' + '{input.fastq[2]} ' + '{input.fastq[3]} ' + '&>> {log}' + ) + else: + shell( + 'cutadapt ' + '-o {output[0]} ' + '-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA ' + '-q 20 ' + '--minimum-length 25 ' + '{input.fastq[0]} ' + '&> {log}' + ) + if not c.tumor_only: + shell( + 'cutadapt ' + '-o {output[1]} ' + '-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA ' + '-q 20 ' + '--minimum-length 25 ' + '{input.fastq[1]} ' + '&>> {log}' + ) + +final_targets = utils.flatten(( + 
utils.flatten(c.targets['baserecalibration']), + utils.flatten(c.targets['combinecallers']), + utils.flatten(c.targets['fastqc']), + [c.targets['loh']], + [c.targets['intersect']], + utils.flatten(c.targets['genedist']) +)) + +rule targets: + input: final_targets + +rule fastqc: + """ + Run FastQC + """ + input: + 'fastqs/{sample}.{genotype}.R1.cutadapt.fastq.gz' + output: + html='data/{sample}/fastqc/{sample}.{genotype}.cutadapt.fastqc.html', + zip='data/{sample}/fastqc/{sample}.{genotype}.cutadapt.fastqc.zip', + threads: 6 + shell: + 'fastqc ' + '--threads {threads} ' + '--noextract ' + '--quiet ' + '{input} ' + '&& mv fastqs/{wildcards.sample}.{wildcards.genotype}.R1.cutadapt_fastqc.html {output.html} ' + '&& mv fastqs/{wildcards.sample}.{wildcards.genotype}.R1.cutadapt_fastqc.zip {output.zip}' + + +rule bwa_index: + input: + fasta=config['fasta'], + output: + prefix + '.sa', + params: + algorithm='bwtsw', + prefix=prefix + shell: + 'bwa index -a {params.algorithm} ' + '-p {params.prefix} ' + '{input.fasta}' + + +rule bwa_alignment: + """ + bwa alignment + """ + input: + sa = prefix + '.sa', + R1 = 'fastqs/{sample}.{genotype}.R1.cutadapt.fastq.gz', + R2 = 'fastqs/{sample}.{genotype}.R2.cutadapt.fastq.gz' + output: + bam=temp(c.patterns['bam']) + threads: 6 + params: + index=prefix + run: + sam=output.bam.replace('.bam', '.sam') + gt = wildcards.genotype.upper() + platform=config['sequencing_platform'] + shell( + "bwa mem " + "-t {threads} " + "-R '@RG\\tID:{gt}" + "\\tPL:{platform}" + "\\tLB:{gt}" + "\\tSM:{gt}' " + "{params.index} " + "{input.R1} " + "{input.R2} " + "> {sam}") + + shell( + 'samtools view -Sb {sam} ' + '| samtools sort - -o {output.bam} -O BAM ' + '&& rm {sam}') + +rule index_bams: + input: + c.patterns['bam'] + output: + c.patterns['bam'] + '.bai' + shell: + 'samtools index {input}' + +rule mark_duplicates: + input: + bam=c.patterns['bam'], + index=c.patterns['bam'] + '.bai' + output: + bam=temp(c.patterns['markduplicates']['bam']), + 
metrics=c.patterns['markduplicates']['metrics'] + shell: + 'gatk MarkDuplicates ' + '-I {input.bam} ' + '-O {output.bam} ' + '--REMOVE_DUPLICATES false ' + '--METRICS_FILE {output.metrics} ' + '--CREATE_INDEX true ' + '--VALIDATION_STRINGENCY LENIENT ' + '--java-options "-Xmx8g -Djava.io.tmpdir=$TMPDIR"' + + +rule index_fasta: + input: + config['fasta'] + output: + config['fasta'] + '.fai' + shell: + 'samtools faidx {input}' + + +rule get_fasta_dict: + input: + config['fasta'] + output: + os.path.splitext(config['fasta'][:-3] if config['fasta'].endswith('.gz') else config['fasta'])[0] + '.dict' + shell: + 'picard CreateSequenceDictionary REFERENCE={input} OUTPUT={output}' + +rule index_population_variant_file: + input: + config['population_variant_file'] + output: + config['population_variant_file'] + '.tbi' + shell: + 'gatk IndexFeatureFile ' + '--feature-file {input} ' + '--output {output}' + + +rule index_known_snp_files: + # NOTE: the known snp files need to be bgzipped rather than gzipped for this rule to work + input: + '{known_sites_file}' + output: + '{known_sites_file}.tbi' + shell: + 'gatk IndexFeatureFile -F {input}' + +def get_known_sites_str(): + known_sites_string='' + for filename in config['known_sites']: + known_sites_string += '--known-sites ' + filename + ' ' + return known_sites_string + +rule base_recalibrator_model: + """ + Recalibrating the bases corrects systematic base scoring errors that the + sequencing instrument algorithms may generate in particular regions such as + homopolymer runs. 
GATK's model uses machine learning and works in two + steps: one to build the model and one to apply it + (apply_base_recalibration) + """ + + input: + fa_index=config['fasta'] + '.fai', + fa_dict=os.path.splitext(config['fasta'][:-3] if config['fasta'].endswith('.gz') else config['fasta'])[0] + '.dict', + bam=c.patterns['markduplicates']['bam'], + fasta=config['fasta'], + known_sites_files=config['known_sites'], + known_sites_index=[f + '.tbi' for f in config['known_sites']] + output: + c.patterns['baserecalibration']['bqsrtable'] + params: + known_sites_string=get_known_sites_str() + shell: + 'gatk BaseRecalibrator ' + '-I {input.bam} ' + '-O {output} ' + '-R {input.fasta} ' + '{params.known_sites_string}' + '--java-options "-Xmx8g -Djava.io.tmpdir=$TMPDIR"' + + +rule apply_base_recalibration: + input: + bam=c.patterns['markduplicates']['bam'], + fasta=config['fasta'], + bqsr_table=c.patterns['baserecalibration']['bqsrtable'] + output: + c.patterns['baserecalibration']['bam'] + shell: + 'gatk ApplyBQSR ' + '-I {input.bam} ' + '-O {output} ' + '-R {input.fasta} ' + '--bqsr-recal-file {input.bqsr_table} ' + '--java-options "-Xmx8g -Djava.io.tmpdir=$TMPDIR"' + +rule index_bams_2: + input: + bam=c.patterns['baserecalibration']['bam'], + old_index=c.patterns['baserecalibration']['bam'].replace('.bam', '.bai') + output: + c.patterns['baserecalibration']['bam'] + '.bai' + shell: + 'samtools index {input.bam} ' + '&& rm {input.old_index}' + +rule get_pileup_summaries: + """ + This is a precursor step to filtering mutect2 calls later on + """ + input: + bam=c.patterns['baserecalibration']['bam'], + index = c.patterns['baserecalibration']['bam'] + '.bai', + PV=config['population_variant_file'], + exome_capture=config['exome_capture'] + output: + c.patterns['pileupsummaries']['bam'] + shell: + 'gatk GetPileupSummaries ' + '-L {input.exome_capture} ' + '-I {input.bam} ' + '-O {output} ' + '--variant {input.PV} ' + '--java-options "-Xmx8g -Djava.io.tmpdir=$TMPDIR"' + +if config['tumor_only']: + rule 
tumoronly_mutect2: + input: + tumor=c.patterns['baserecalibration']['tumor'], + fasta=config["fasta"], + index=c.patterns['baserecalibration']['tumor'] + '.bai', + # PoN=PoN, + exome_capture=config['exome_capture'], + PV=config['population_variant_file'], + PVI=config['population_variant_file'] + '.tbi' + output: + temp(c.patterns['mutect2']['snp']) + run: + if config['tumor_only']: + shell( + 'gatk Mutect2 ' + '-R {input.fasta} ' + '-I {input.tumor} ' + '--intervals {input.exome_capture} ' + '-O {output} ' + '--germline-resource {input.PV} ' + # '--panel-of-normals {input.PoN} ' + '--java-options "-Xmx8g -Djava.io.tmpdir=$TMPDIR"' + ) + + rule tumoronly_calculate_contamination: + input: + tumor=c.patterns['pileupsummaries']['tumor'], + output: + c.patterns['mutect2']['contamination'] + shell: + 'gatk CalculateContamination ' + '-I {input.tumor} ' + '-O {output} ' + '--java-options "-Xmx8g -Djava.io.tmpdir=$TMPDIR"' + + +else: + rule samtools_mpileup: + input: + normal=c.patterns['baserecalibration']['normal'], + tumor=c.patterns['baserecalibration']['tumor'], + fasta=config['fasta'] + output: + temp(c.patterns['varscan']['samtoolsmpileup']) + shell: + 'samtools mpileup ' + '-q 1 ' + '-d 5000 ' + '-f {input.fasta} ' + '{input.normal} ' + '{input.tumor} ' + '> {output}' + + + rule varscan: + input: + pileup=c.patterns['varscan']['samtoolsmpileup'] + output: + snp=c.patterns['varscan']['snp'], + indel=c.patterns['varscan']['indel'] + shell: + 'varscan somatic ' + '{input.pileup} ' + 'data/{wildcards.sample}/{wildcards.sample}.varscan ' + '--mpileup 1 ' + '--output-vcf 1 ' + '--min-var-freq 0.10' + + + rule configure_strelka: + input: + normal=c.patterns['baserecalibration']['normal'], + tumor=c.patterns['baserecalibration']['tumor'], + fasta=config['fasta'] + output: + c.patterns['strelka']['script'] + conda: + 'envs/strelka.yml' + shell: + 'configureStrelkaSomaticWorkflow.py ' + '--normalBam={input.normal} ' + '--tumorBam={input.tumor} ' + 
'--referenceFasta={input.fasta} ' + '--runDir data/{wildcards.sample}/' + + + rule run_strelka: + input: + c.patterns['strelka']['script'] + output: + snp=c.patterns['strelka']['snp'], + indel=c.patterns['strelka']['indel'] + conda: + 'envs/strelka.yml' + shell: + '{input} ' + '-m local ' + '&& mv data/{wildcards.sample}/results/variants/somatic.snvs.vcf.gz {output.snp} ' + '&& mv data/{wildcards.sample}/results/variants/somatic.indels.vcf.gz {output.indel}' + + rule get_strelka_passedonly: + input: + snp=c.patterns['strelka']['snp'], + indel=c.patterns['strelka']['indel'] + output: + snp=c.patterns['strelka']['passedonlysnp'], + indel=c.patterns['strelka']['passedonlyindel'], + run: + shell( + 'bcftools view -i "FILTER=\'PASS\'" ' + '{input.snp} ' + '> {output.snp}') + shell( + 'bcftools view -i "FILTER=\'PASS\'" ' + '{input.indel} ' + '> {output.indel}') + + + rule somatic_sniper: + input: + normal=c.patterns['baserecalibration']['normal'], + tumor=c.patterns['baserecalibration']['tumor'], + fasta=config['fasta'] + output: + c.patterns['somaticsniper']['snp'] + shell: + 'bam-somaticsniper ' + '-F vcf ' + '-f {input.fasta} ' + '{input.tumor} ' + '{input.normal} ' + '{output}' + + rule mutect2: + input: + tumor=c.patterns['baserecalibration']['tumor'], + tumor_index=c.patterns['baserecalibration']['tumor'] + '.bai', + normal=c.patterns['baserecalibration']['normal'], + normal_index=c.patterns['baserecalibration']['normal'] + '.bai', + fasta=config["fasta"], + # PoN=PoN, + exome_capture=config['exome_capture'], + PV=config['population_variant_file'], + PVI=config['population_variant_file'] + '.tbi' + output: + c.patterns['mutect2']['snp'] + shell: + 'gatk Mutect2 ' + '-R {input.fasta} ' + '-I {input.tumor} ' + '-I {input.normal} ' + '--intervals {input.exome_capture} ' + '-O {output} ' + '--germline-resource {input.PV} ' + # '--panel-of-normals {input.PoN} ' + '--java-options "-Xmx8g -Djava.io.tmpdir=$TMPDIR"' + + + rule calculate_contamination: + """ + Estimates 
the proportion of reads originating from other samples + """ + input: + tumor=c.patterns['pileupsummaries']['tumor'], + normal=c.patterns['pileupsummaries']['normal'] + output: + c.patterns['mutect2']['contamination'] + shell: + 'gatk CalculateContamination ' + '-I {input.tumor} ' + '-O {output} ' + '--matched-normal {input.normal} ' + '--java-options "-Xmx8g -Djava.io.tmpdir=$TMPDIR"' + +rule filter_out_contaminants: + """ + Filters out reads estimated to have originated from other samples + """ + input: + vcf=c.patterns['mutect2']['snp'], + contaminants=c.patterns['mutect2']['contamination'], + reference=config['fasta'] + output: + temp(c.patterns['mutect2']['filtered']) + shell: + 'gatk FilterMutectCalls ' + '--variant {input.vcf} ' + '--reference {input.reference} ' + '-O {output} ' + '--contamination-table {input.contaminants} ' + '--java-options "-Xmx8g -Djava.io.tmpdir=$TMPDIR"' + + +rule collect_sequencing_artifact_metrics: + """ + Measures pre-adapter errors to allow for filtering by orientation bias + """ + input: + bam=c.patterns['baserecalibration']['tumor'], + fasta=config['fasta'], + exome_capture=config['exome_capture'] + output: + c.patterns['mutect2']['sequencingmetrics'] + shell: + 'gatk CollectSequencingArtifactMetrics ' + '--INTERVALS {input.exome_capture} ' + '-I {input.bam} ' + '-O data/{wildcards.sample}/{wildcards.sample}.seqartifactmetrics ' + '-R {input.fasta} ' + '--java-options "-Xmx8g -Djava.io.tmpdir=$TMPDIR"' + + +rule filter_by_orientation_bias: + """ + Filter by orientation bias, which catches artifacts formed by chemical + changes that occur in the DNA on a single strand during sample prep. 
+ """ + input: + vcf=c.patterns['mutect2']['filtered'], + detail_file=c.patterns['mutect2']['sequencingmetrics'] + output: + c.patterns['mutect2']['twicefiltered'] + shell: + 'gatk FilterByOrientationBias ' + '--variant {input.vcf} ' + '--pre-adapter-detail-file {input.detail_file} ' + '-O {output} ' + '--java-options "-Xmx8g -Djava.io.tmpdir=$TMPDIR"' + + +rule get_mutect2_passed_only_variants: + """ + Calls that didn't pass all of the filters will still be in the output vcf, + they will just be flagged with the filters they didn't pass in the + 'FILTER' field, so this rule pulls out all of the calls that passed the + filters + """ + input: + c.patterns['mutect2']['twicefiltered'] + output: + c.patterns['mutect2']['passedonly'] + run: + shell('bcftools view -i "FILTER=\'PASS\'" ' + '{input} ' + '> {output}') + + +if not config['tumor_only']: + # the varscan and somatic sniper vcfs have a common header line in slightly + # different formats which is messing with the merging, so this rule creates + # a varscan vcf with that header line replaced with the somatic sniper version. + rule adjust_varscan_heading: + input: + varscan=c.patterns['varscan']['snp'], + somaticsniper=c.patterns['somaticsniper']['snp'] + output: + temp(c.patterns['varscan']['headingadjust']) + shell: + 'DP4_line=$(grep ' + '\"FORMAT= {output} ' + + rule remove_ambiguous_calls: + # varscan and somatic sniper will include ambiguous calls (e.g. ref + # column entry is 'W'). 
This rule removes those calls, as they cause + # issues in the 'combine_variants' rule + input: + varscan=c.patterns['varscan']['headingadjust'], + somaticsniper=c.patterns['somaticsniper']['snp'] + output: + varscan=c.patterns['varscan']['ambigremoved'], + somaticsniper=c.patterns['somaticsniper']['ambigremoved'] + shell: + 'awk "\$1 ~ /^#/ {{print \$0;next}} ' + '{{if (\$4 ~ /A|C|T|G/ ' + '&& \$5 ~ /A|C|T|G/) ' + 'print \$0}}" ' + '{input.varscan} ' + '> {output.varscan} ' + '&& awk "\$1 ~ /^#/ {{print \$0;next}} ' + '{{if (\$4 ~ /A|C|T|G/ ' + '&& \$5 ~ /A|C|T|G/) ' + 'print \$0}}" ' + '{input.somaticsniper} ' + '> {output.somaticsniper} ' + + + rule bgzip_vcfs: + input: + mutect2=c.patterns['mutect2']['passedonly'], + varscan=c.patterns['varscan']['ambigremoved'], + strelka=c.patterns['strelka']['passedonlysnp'], + somaticsniper= c.patterns['somaticsniper']['ambigremoved'] + output: + mutect2=c.patterns['mutect2']['bgzipped'], + mutect2_tbi=c.patterns['mutect2']['bgzipped'] + '.tbi', + varscan=c.patterns['varscan']['bgzipped'], + varscan_tbi=c.patterns['varscan']['bgzipped'] + '.tbi', + strelka=c.patterns['strelka']['bgzipped'], + strelka_tbi=c.patterns['strelka']['bgzipped'] + '.tbi', + somaticsniper= c.patterns['somaticsniper']['bgzipped'], + somaticsniper_tbi=c.patterns['somaticsniper']['bgzipped'] + '.tbi' + shell: + 'bgzip {input.mutect2} ' + '&& tabix -p vcf {output.mutect2} ' + '&& bgzip {input.varscan} ' + '&& tabix -p vcf {output.varscan} ' + '&& bgzip {input.strelka} ' + '&& tabix -p vcf {output.strelka} ' + '&& bgzip {input.somaticsniper} ' + '&& tabix -p vcf {output.somaticsniper}' + + rule combine_variants: + input: + mutect2=c.patterns['mutect2']['bgzipped'], + varscan=c.patterns['varscan']['bgzipped'], + strelka=c.patterns['strelka']['bgzipped'], + somaticsniper= c.patterns['somaticsniper']['bgzipped'], + fasta=config['fasta'] + output: + c.patterns['combinecallers']['gatk'] + conda: + 'envs/gatk3.yml' + shell: + 'GenomeAnalysisTK ' + '-Xmx4g ' + '-T 
CombineVariants ' + '-R {input.fasta} ' + '--variant:mutect2 {input.mutect2} ' + '--variant:varscan {input.varscan} ' + '--variant:strelka {input.strelka} ' + '--variant:somaticsniper {input.somaticsniper} ' + '-o {output} ' + '-genotypeMergeOptions PRIORITIZE ' + '-priority mutect2,varscan,strelka,somaticsniper ' + '|| (gatk3-register GenomeAnalysisTK-3.8-0-ge9d806836.tar.bz2 ' + '&& GenomeAnalysisTK ' + '-Xmx4g ' + '-T CombineVariants ' + '-R {input.fasta} ' + '--variant:mutect2 {input.mutect2} ' + '--variant:varscan {input.varscan} ' + '--variant:strelka {input.strelka} ' + '--variant:somaticsniper {input.somaticsniper} ' + '-o {output} ' + '-genotypeMergeOptions PRIORITIZE ' + '-priority mutect2,varscan,strelka,somaticsniper)' + + rule vcftools_merge: + input: + mutect2=c.patterns['mutect2']['bgzipped'], + varscan=c.patterns['varscan']['bgzipped'], + strelka=c.patterns['strelka']['bgzipped'], + somaticsniper= c.patterns['somaticsniper']['bgzipped'] + output: + c.patterns['combinecallers']['vcftools'] + shell: + 'vcf-merge ' + '{input} ' + '> {output}' + + +rule snpeff_annotation: + """ + Add gene name and id and structural/functional predictions to the vcf + """ + input: + vcf=(c.patterns['mutect2']['passedonly'] + if config['tumor_only'] + else c.patterns['combinecallers']['gatk']) + output: + stats=c.patterns['snpeff']['stats'], + vcf=c.patterns['snpeff']['vcf'] + params: + genome=config['genome'] + shell: + 'snpEff -Xmx4g ' + '-v ' + '-stats {output.stats} ' + '{params.genome} ' + '{input} ' + ' > {output.vcf}' + +rule snpsift_dbnsfp_data: + input: + dbNSFP=config['dbNSFP_file'], + dbNSFP_index=config['dbNSFP_file'] + '.tbi', + vcf=c.patterns['snpeff']['vcf'] + output: + c.patterns['dbnsfp'] + shell: + 'SnpSift dbnsfp ' + '-db {input.dbNSFP} ' + '-f 1000Gp1_AF,' + 'clinvar_rs,' + 'ExAC_AF,' + 'clinvar_clnsig ' + '{input.vcf} ' + '> {output}' + + +rule pull_LOH_calls: + """ + Varscan and Somatic Sniper call LOH (loss of heterozygosity) in the 'SS' + 
entry of their info fields. The SS entries correspond to the following + types of calls: 0=reference 1=germline 2=somatic 3=LOH. This rule pulls out + all the LOH calls into their own file + """ + input: + c.patterns['dbnsfp'] + output: + c.patterns['loh'] + shell: + 'grep -E "(#|SS=3)" {input} > {output}' + +rule pull_intersect_calls: + """ + This rule pulls out calls that were called by all variant callers and puts them into their own file + """ + input: + c.patterns['dbnsfp'] + output: + c.patterns['intersect'] + shell: + 'grep -E "(#|set=Intersection)" {input} > {output}' + +rule get_bed_from_gtf: + input: + config['gtf'] + output: + 'bed_from_gtf.bed' + shell: + 'grep -w "gene" {input} | cut -f 1,4,5 > {output}' + +rule filter_out_singlecalled: + "Filters out calls made by only one variant caller" + input: + c.patterns['dbnsfp'] + output: + c.patterns['nosinglecalled'] + script: + 'filter-out-singlecalled.py' + + +rule pull_somatic_calls: + "pulls out calls deemed 'somatic' by the callers" + input: + c.patterns['nosinglecalled'] + output: + c.patterns['somaticonly'] + shell: + 'grep -E "(#|SOMATIC)" {input} > {output}' + +rule intersect_gtf_bed_and_vcf: + input: + gtf_bed='bed_from_gtf.bed', + vcf=c.patterns['nosinglecalled'], + somaticonly_vcf=c.patterns['somaticonly'] + output: + bed=c.patterns['genedist']['bed'], + somaticonly_bed =c.patterns['genedist']['somaticonlybed'] + shell: + 'bedtools intersect -c -a {input.gtf_bed} -b {input.vcf} > {output.bed}' + '&& bedtools intersect -c -a {input.gtf_bed} -b {input.somaticonly_vcf} > {output.somaticonly_bed}' + +rule graph_variant_distribution_across_genes: + input: + c.patterns['genedist']['bed'], + c.patterns['genedist']['somaticonlybed'] + output: + c.patterns['genedist']['histogram'], + c.patterns['genedist']['somaticonlyhistogram'] + script: + 'graph_snp_dist.py' + + + diff --git a/workflows/WES-somatic/config/WES_patterns.yaml b/workflows/WES-somatic/config/WES_patterns.yaml new file mode 100644 index 
00000000..4b2b5482 --- /dev/null +++ b/workflows/WES-somatic/config/WES_patterns.yaml @@ -0,0 +1,73 @@ +fastq: + combined: 'data/{sample}/{sample}.{genotype}.r{n}.fastq.gz' + tumor: 'data/{sample}/{sample}.tumor.r{n}.fastq.gz' + normal: 'data/{sample}/{sample}.normal.r{n}.fastq.gz' +cutadapt: 'data/{sample}/{sample}.{genotype}.r{n}.cutadapt.fastq.gz' +bam: 'data/{sample}/{sample}.{genotype}.cutadapt.bam' +fastqc: + cutadapt: 'data/{sample}/fastqc/{sample}.{genotype}.R1.cutadapt.fastq.gz_fastqc.zip' + bam: 'data/{sample}/fastqc/{sample}.{genotype}.cutadapt.bam_fastqc.zip' +#fastq_screen: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.screen.txt' +#featurecounts: 'data/rnaseq_aggregation/featurecounts.txt' +#multiqc: 'data/rnaseq_aggregation/multiqc.html' +markduplicates: + bam: 'data/{sample}/{sample}.{genotype}.cutadapt.markdups.bam' + metrics: 'data/{sample}/{sample}.{genotype}.cutadapt.markdups.bam.metrics' +baserecalibration: + bqsrtable: 'data/{sample}/{sample}.{genotype}.bqsr.table' + bam: 'data/{sample}/{sample}.{genotype}.cutadapt.markdups.recal.bam' + tumor: 'data/{sample}/{sample}.tumor.cutadapt.markdups.recal.bam' + normal: 'data/{sample}/{sample}.normal.cutadapt.markdups.recal.bam' +pileupsummaries: + bam: 'data/{sample}/{sample}.{genotype}.pileups.table' + tumor: 'data/{sample}/{sample}.tumor.pileups.table' + normal: 'data/{sample}/{sample}.normal.pileups.table' +varscan: + samtoolsmpileup: 'data/varscan/{sample}/{sample}.pileup' + snp: 'data/{sample}/{sample}.varscan.snp.vcf' + indel: 'data/{sample}/{sample}.varscan.indel.vcf' + headingadjust: 'data/{sample}/{sample}.varscan.heading_adj.snp.vcf' + ambigremoved: 'data/{sample}/{sample}.varscan.heading_adj.ambig_removed.vcf' + bgzipped: 'data/{sample}/{sample}.varscan.heading_adj.ambig_removed.vcf.gz' +strelka: + script: 'data/{sample}/runWorkflow.py' + snp: 'data/{sample}/{sample}.strelka.snp.vcf' + indel: 'data/{sample}/{sample}.strelka.indel.vcf' + passedonlysnp: 
'data/{sample}/{sample}.strelka.passedonly.snp.vcf' + passedonlyindel: 'data/{sample}/{sample}.strelka.passedonly.indel.vcf' + bgzipped: 'data/{sample}/{sample}.strelka.passedonly.snp.vcf.gz' +somaticsniper: + snp: 'data/{sample}/{sample}.somaticsniper.vcf' + ambigremoved: 'data/{sample}/{sample}.somaticsniper.ambig_removed.vcf' + bgzipped: 'data/{sample}/{sample}.somaticsniper.ambig_removed.vcf.gz' +mutect2: + snp: 'data/{sample}/{sample}.mutect2.vcf' + contamination: 'data/{sample}/{sample}.contamination.table' + filtered: 'data/{sample}/{sample}.mutect2.oncefiltered.vcf' + sequencingmetrics: 'data/{sample}/{sample}.seqartifactmetrics.pre_adapter_detail_metrics' + twicefiltered: 'data/{sample}/{sample}.mutect2.twicefiltered.vcf' + passedonly: 'data/{sample}/{sample}.mutect2.passedonly.vcf' + bgzipped: 'data/{sample}/{sample}.mutect2.passedonly.vcf.gz' +combinecallers: + gatk: 'data/{sample}/{sample}.merged-callers.vcf.gz' + vcftools: 'data/{sample}/{sample}.vcftools-merged-callers.vcf' +snpeff: + stats: 'data/{sample}/{sample}.snpeff_stats.html' + vcf: 'data/{sample}/{sample}.snpeff.vcf' +dbnsfp: 'data/{sample}/{sample}.snpeff.dbnsfp.vcf' +loh: 'data/{sample}/{sample}.snpeff.dbnsfp.LOH_only.vcf' +intersect: 'data/{sample}/{sample}.snpeff.dbnsfp.intersect_only.vcf' +nosinglecalled: 'data/{sample}/{sample}.snpeff.dbnsfp.no_singlecalled.vcf' +somaticonly: 'data/{sample}/{sample}.snpeff.dbnsfp.no_singlecalled.somaticonly.vcf' +genedist: + bed: 'data/{sample}/{sample}.variants-per-gene.bed' + somaticonlybed: 'data/{sample}/{sample}.somatic-variants-per-gene.bed' + histogram: 'data/{sample}/{sample}.histogram_of_variant-gene_distribution.png' + somaticonlyhistogram: 'data/{sample}/{sample}.histogram_of_somatic_variant-gene_distribution.png' + + + + + + + diff --git a/workflows/WES-somatic/config/config.yaml b/workflows/WES-somatic/config/config.yaml new file mode 100644 index 00000000..35ed40e7 --- /dev/null +++ b/workflows/WES-somatic/config/config.yaml @@ -0,0 +1,25 @@ 
+paired_end: True + +genome: 'hg38' + +fasta: 'hg38.fa' + +# Vcf of known variants. Can list multiple files if needed +known_sites: + - 'common_all_20170710.vcf.gz' + +panel_of_normals: 'None' + +exome_capture: 'S07604514_Covered.interval_list' + +population_variant_file: 'somatic-hg38_af-only-gnomad.hg38.vcf.gz' + +dbNSFP_file: 'dbNSFP2.9.txt.gz' + +gtf: '/data/NICHD-core0/references/human/gencode-v28/gtf/human_gencode-v28.gtf' + +sequencing_platform: 'Illumina' + +tumor_only: False + + diff --git a/workflows/WES-somatic/config/sampletable.tsv b/workflows/WES-somatic/config/sampletable.tsv new file mode 100644 index 00000000..8c90edfc --- /dev/null +++ b/workflows/WES-somatic/config/sampletable.tsv @@ -0,0 +1,30 @@ +sample_id tumor_filename normal_filename tumor_R2_filename normal_R2_filename +A27LT10a_vs_A27LT7a /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT10a/A27LT10a_FKDN190645658-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT7a/A27LT7a_FKDN190645657-1A_HWGTCCCXY_L3_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT10a/A27LT10a_FKDN190645658-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT7a/A27LT7a_FKDN190645657-1A_HWGTCCCXY_L3_2.fq.gz +A27LT10a_vs_A27RT18a /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT10a/A27LT10a_FKDN190645658-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT18a/A27RT18a_FKDN190645659-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT10a/A27LT10a_FKDN190645658-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT18a/A27RT18a_FKDN190645659-1A_lanes_catted_2.fq.gz +A27LT10a_vs_A27RT19a /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT10a/A27LT10a_FKDN190645658-1A_lanes_catted_1.fq.gz 
/data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT19a/A27RT19a_FKDN190645660-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT10a/A27LT10a_FKDN190645658-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT19a/A27RT19a_FKDN190645660-1A_lanes_catted_2.fq.gz +A27LT7a_vs_A27LT10a /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT7a/A27LT7a_FKDN190645657-1A_HWGTCCCXY_L3_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT10a/A27LT10a_FKDN190645658-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT7a/A27LT7a_FKDN190645657-1A_HWGTCCCXY_L3_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT10a/A27LT10a_FKDN190645658-1A_lanes_catted_2.fq.gz +A27LT7a_vs_A27RT18a /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT7a/A27LT7a_FKDN190645657-1A_HWGTCCCXY_L3_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT18a/A27RT18a_FKDN190645659-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT7a/A27LT7a_FKDN190645657-1A_HWGTCCCXY_L3_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT18a/A27RT18a_FKDN190645659-1A_lanes_catted_2.fq.gz +A27LT7a_vs_A27RT19a /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT7a/A27LT7a_FKDN190645657-1A_HWGTCCCXY_L3_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT19a/A27RT19a_FKDN190645660-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT7a/A27LT7a_FKDN190645657-1A_HWGTCCCXY_L3_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT19a/A27RT19a_FKDN190645660-1A_lanes_catted_2.fq.gz +A27RT18a_vs_A27LT7a /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT18a/A27RT18a_FKDN190645659-1A_lanes_catted_1.fq.gz 
/data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT7a/A27LT7a_FKDN190645657-1A_HWGTCCCXY_L3_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT18a/A27RT18a_FKDN190645659-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT7a/A27LT7a_FKDN190645657-1A_HWGTCCCXY_L3_2.fq.gz +A27RT18a_vs_A27LT10a /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT18a/A27RT18a_FKDN190645659-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT10a/A27LT10a_FKDN190645658-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT18a/A27RT18a_FKDN190645659-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT10a/A27LT10a_FKDN190645658-1A_lanes_catted_2.fq.gz +A27RT18a_vs_A27RT19a /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT18a/A27RT18a_FKDN190645659-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT19a/A27RT19a_FKDN190645660-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT18a/A27RT18a_FKDN190645659-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT19a/A27RT19a_FKDN190645660-1A_lanes_catted_2.fq.gz +A27RT19a_vs_A27LT7a /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT19a/A27RT19a_FKDN190645660-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT7a/A27LT7a_FKDN190645657-1A_HWGTCCCXY_L3_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT19a/A27RT19a_FKDN190645660-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT7a/A27LT7a_FKDN190645657-1A_HWGTCCCXY_L3_2.fq.gz +A27RT19a_vs_A27LT10a /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT19a/A27RT19a_FKDN190645660-1A_lanes_catted_1.fq.gz 
/data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT10a/A27LT10a_FKDN190645658-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT19a/A27RT19a_FKDN190645660-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT10a/A27LT10a_FKDN190645658-1A_lanes_catted_2.fq.gz +A27RT19a_vs_A27RT18a /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT19a/A27RT19a_FKDN190645660-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT18a/A27RT18a_FKDN190645659-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT19a/A27RT19a_FKDN190645660-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT18a/A27RT18a_FKDN190645659-1A_lanes_catted_2.fq.gz +A368LT2a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_2.fq.gz +A368LT3a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_2.fq.gz +A368RT22a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_1.fq.gz 
/data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_2.fq.gz +A368RT3a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_2.fq.gz +A368LT2a_vs_A368LT3a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_2.fq.gz +A368LT2a_vs_A368RT3a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_2.fq.gz +A368LT2a_vs_A368RT22a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_1.fq.gz 
/data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_2.fq.gz +A368LT3a_vs_A368LT2a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_2.fq.gz +A368LT3a_vs_A368RT3a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_2.fq.gz +A368LT3a_vs_A368RT22a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_2.fq.gz +A368RT22a_vs_A368LT3a 
/data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_2.fq.gz +A368RT22a_vs_A368LT2a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_2.fq.gz +A368RT22a_vs_A368RT3a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_2.fq.gz +A368RT3a_vs_A368LT3a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_2.fq.gz 
+A368RT3a_vs_A368LT2a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_2.fq.gz +A368RT3a_vs_A368RT22a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_2.fq.gz + From b63b36ef2088d191e2ab39f20f4e086526acfc9c Mon Sep 17 00:00:00 2001 From: Sydney Rose Hertafeld Date: Wed, 6 Nov 2019 12:40:17 -0500 Subject: [PATCH 2/4] WES pipeline initial commit --- lib/common.py | 19 ++ lib/patterns_targets.py | 10 +- requirements.txt | 7 + workflows/WES-somatic/Snakefile | 228 ++++++++---------- .../WES-somatic/config/WES_patterns.yaml | 14 +- workflows/WES-somatic/config/config.yaml | 16 +- workflows/WES-somatic/config/sampletable.tsv | 34 +-- 7 files changed, 148 insertions(+), 180 deletions(-) diff --git a/lib/common.py b/lib/common.py index 2e223346..358e6b26 100644 --- a/lib/common.py +++ b/lib/common.py @@ -677,6 +677,25 @@ def is_paired_end(sampletable, sample): pass return False +def is_tumor_only(sampletable): + """ + For somatic WES pipeline, inspects the sampletable to see if there are only + tumor files rather than paired tumor/normal. 
Assumes the presence of + a column named 'normal_filename' indicates that there are paired normal + samples for all tumor samples. Does not support mixing tumor/normal and + tumor only samples in the same sampletable. + + Parameters + ---------- + sampletable : pandas.DataFrame + Only contains columns called 'normal_filename' and (if paired end) + 'normal_R2_filename' if there are paired normal samples for every tumor + sample. Does not support blank/NA 'normal_filename' columns. + """ + if 'normal_filename' in sampletable.columns: + return False + return True + def fill_r1_r2(sampletable, pattern, r1_only=False): """ diff --git a/lib/patterns_targets.py b/lib/patterns_targets.py index f8c34957..c7bab263 100644 --- a/lib/patterns_targets.py +++ b/lib/patterns_targets.py @@ -89,10 +89,14 @@ def __init__(self, config, patterns, workdir=None): as relative to `workdir` """ SeqConfig.__init__(self, config, patterns, workdir) - - self.fill = dict(sample=self.samples, n=self.n) + self.tumoronly = common.is_tumor_only(self.sampletable) + if self.tumoronly: + self.genotype = ['tumor'] + else: + self.genotype = ['tumor', 'normal'] + self.fill = dict(sample=self.samples, genotype=self.genotype, n=self.n) self.targets = helpers.fill_patterns(self.patterns, self.fill, zip) - self.tumoronly = self.config['tumor_only'] + class RNASeqConfig(SeqConfig): def __init__(self, config, patterns, workdir=None): diff --git a/requirements.txt b/requirements.txt index 56dca33f..abb9cc64 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ atropos +bcftools bedtools bioconductor-annotationhub bioconductor-biocparallel @@ -14,6 +15,7 @@ bioconductor-sva bioconductor-tximport biopython >=1.68 bowtie2 +bwa cutadapt deeptools >=3.0.1 fastqc @@ -23,6 +25,7 @@ fastq-screen font-ttf-dejavu-sans-mono gat +gatk4 gffutils >=0.8.7.1 ghostscript git @@ -69,6 +72,8 @@ samtools >=1.4.1 scipy >=0.18.1 seaborn >=0.7.1 snakemake==5.5.4 +snpeff +somatic-sniper sra-tools star subread @@ -84,3 
+89,5 @@ ucsc-liftover ucsc-oligomatch ucsc-twobittofa ucsc-wigtobigwig +varscan +vcftools diff --git a/workflows/WES-somatic/Snakefile b/workflows/WES-somatic/Snakefile index 8530d337..757d482b 100644 --- a/workflows/WES-somatic/Snakefile +++ b/workflows/WES-somatic/Snakefile @@ -7,7 +7,7 @@ import yaml import tempfile import pandas as pd from lib import common, cluster_specific, utils, helpers, aligners -from lib.patterns_targets import RNASeqConfig +from lib.patterns_targets import WESConfig if not workflow.overwrite_configfile: configfile: 'config/config.yaml' @@ -15,7 +15,7 @@ else: configfile: workflow.overwrite_configfile include: '../references/Snakefile' -shell/prefix( +shell.prefix( 'set -euo pipefail; export R_PROFILE_USER=; export TMPDIR={}' .format(cluster_specific.tempdir_for_biowulf()) ) @@ -23,23 +23,33 @@ shell.executable('/bin/bash') config = common.load_config(config) -c = WESconfig(config, config.get('patterns', 'config/WES_patterns.yaml')) +c = WESConfig(config, config.get('patterns', 'config/WES_patterns.yaml')) wildcard_constraints: n = '[1,2]' -#sampletable = pd.read_csv('config/sampletable.tsv', sep='\t') -#samples = sampletable.loc[:,'sample_id'] -#configfile: 'config/config.yaml' -#known_sites_files=config['known_sites'] -#genotypes = ['tumor', 'normal'] -#if config['tumor_only']: -# genotypes=['tumor'] +def wrapper_for(path): + return 'file:' + os.path.join('../..','wrappers','wrappers',path) -#prefix = os.path.basename(config['fasta']) +known_sites_files=config['known_sites'] + +fasta_prefix = os.path.basename(config['fasta']) + +final_targets = utils.flatten(( + utils.flatten(c.targets['baserecalibration']), + utils.flatten(c.targets['combinecallers']), + utils.flatten(c.targets['fastqc']), + [c.targets['loh']], + [c.targets['intersect']], + utils.flatten(c.targets['genedist']) +)) + +rule targets: + input: final_targets def render_r1_r2(pattern, r1_only=False): - return expand(pattern, sample='{sample}', n=c.n).sort() + return 
sorted(expand(pattern, sample='{sample}', genotype='{genotype}', n=c.n)) + if 'filename' in ''.join(c.sampletable.columns): # Convert the sampletable to be indexed by the first column, for @@ -47,21 +57,15 @@ if 'filename' in ''.join(c.sampletable.columns): _st = c.sampletable.set_index(c.sampletable.columns[0]) def filenames_for_sample(wc): if c.is_paired: - if c.tumor_only: - return _st.loc[wc.sample, ['tumor_filename', 'tumor_R2_filename']] - else: - return _st.loc[wc.sample, ['normal_filename', 'normal_R2_filename', 'tumor_filename','tumor_R2_filename']] + return _st.loc[wc.sample, [wc.genotype + '_filename', wc.genotype + '_R2_filename']] else: - if c.tumor_only: - return _st.loc[wc.sample, ['tumor_filename']] - else: - return _st.loc[wc.sample, ['normal_filename', 'tumor_filename']] - + return _st.loc[wc.sample, [wc.genotype + '_filename']] + rule symlinks: input: filenames_for_sample output: - render_r1_r2(c.patterns['fastq']['combined']) + render_r1_r2(c.patterns['fastq']) run: assert len(output) == len(input), (input, output) for src, linkname in zip(input, output): @@ -69,41 +73,27 @@ if 'filename' in ''.join(c.sampletable.columns): rule cutadapt: input: - fastq = render_r1_r2(c.patterns['fastq']) + render_r1_r2(c.patterns['fastq']) output: - fastq = render_r1_r2(c.patterns['cutadapt']) - log: - render_r1_r2(c.patterns['cutadapt'][0] + '.log') + render_r1_r2(c.patterns['cutadapt']) +# log: +# render_r1_r2(c.patterns['cutadapt'][0] + '.log') threads: 6 run: if c.is_paired: shell( 'cutadapt ' - '-o {output[0]}' - '-p {output[1]}' + '-o {output[0]} ' + '-p {output[1]} ' '-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA ' - '-A AGATCGGAAGAGCGTCGTGTAGGGAAACACTGT ' + '-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT ' '-q 20 ' '-j {threads} ' '--minimum-length 25 ' - '{input.fastq[0]} ' - '{input.fastq[1]} ' - '&> {log}" + '{input[0]} ' + '{input[1]} ' +# '&> {log}' ) - if not c.tumor_only: - shell( - 'cutadapt ' - '-o {output[2]}' - '-p {output[3]}' - '-a 
AGATCGGAAGAGCACACGTCTGAACTCCAGTCA ' - '-A AGATCGGAAGAGCGTCGTGTAGGGAAACACTGT ' - '-q 20 ' - '-j {threads} ' - '--minimum-length 25 ' - '{input.fastq[2]} ' - '{input.fastq[3]} ' - '&> {log}" - ) else: shell( 'cutadapt ' @@ -111,60 +101,29 @@ rule cutadapt: '-a AGATCGGAAGAGCACGTCTGAACTCCAGTCA ' '-q 20 ' '--minimum-length 25 ' - '{input.fastq[0]} ' - '&> {log}' + '{input[0]} ' +# '&> {log}' ) - if not c.tumor_only: - shell( - 'cutadapt ' - '-o {output[1]} ' - '-a AGATCGGAAGAGCACGTCTGAACTCCAGTCA ' - '-q 20 ' - '--minimum-length 25 ' - '{input.fastq[1]} ' - '&> {log}' - ) - -final_targets = utils.flatten(( - utils.flatten(c.targets['baserecalibration']), - utils.flatten(c.targets['combinecallers']), - utils.flatten(c.targets['fastqc']), - [c.targets['loh']], - [c.targets['intersect']], - utils.flatten(c.targets['genedist']) -)) - -rule targets: - input: final_targets - rule fastqc: """ Run FastQC """ - input: - 'fastqs/{sample}.{genotype}.R1.cutadapt.fastq.gz' + input: + '{sample_dir}/{sample}/{sample}{suffix}' output: - html='data/{sample}/fastqc/{sample}.{genotype}.cutadapt.fastqc.html', - zip='data/{sample}/fastqc/{sample}.{genotype}.cutadapt.fastqc.zip', - threads: 6 - shell: - 'fastqc ' - '--threads {threads} ' - '--noextract ' - '--quiet ' - '{input} ' - '&& mv fastqs/{wildcards.sample}.{wildcards.genotype}.R1.cutadapt_fastqc.html {output.html} ' - '&& mv fastqs/{wildcards.sample}.{wildcards.genotype}.R1.cutadapt_fastqc.zip {output.zip}' - + html='{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.html', + zip='{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.zip', + script: + wrapper_for('fastqc/wrapper.py') rule bwa_index: input: fasta=config['fasta'], output: - prefix + '.sa', + fasta_prefix + '.sa', params: algorithm='bwtsw', - prefix=prefix + prefix=fasta_prefix shell: 'bwa index ' '{params.prefix} ' @@ -176,14 +135,14 @@ rule bwa_alignment: bwa alignment """ input: - sa = prefix + '.sa', - R1 = 'fastqs/{sample}.{genotype}.R1.cutadapt.fastq.gz', - R2 = 
'fastqs/{sample}.{genotype}.R2.cutadapt.fastq.gz' + sa = fasta_prefix + '.sa', + R1 = 'data/{sample}/{sample}.{genotype}.r1.cutadapt.fastq.gz', + R2 = 'data/{sample}/{sample}.{genotype}.r2.cutadapt.fastq.gz' output: bam=temp(c.patterns['bam']) threads: 6 params: - index=prefix + index=fasta_prefix run: sam=output.bam.replace('.bam', '.sam') gt = wildcards.genotype.upper() @@ -207,16 +166,16 @@ rule bwa_alignment: rule index_bams: input: - c.patterns['bam'] + '{prefix}.bam' output: - c.patterns['bam'] + '.bai' + '{prefix}.bam.bai' shell: 'samtools index {input}' rule mark_duplicates: input: bam=c.patterns['bam'], - index=c.patterns['bam'] + index=c.patterns['bam'] + '.bai' output: bam=temp(c.patterns['markduplicates']['bam']), metrics=c.patterns['markduplicates']['metrics'] @@ -254,9 +213,8 @@ rule index_population_variant_file: output: config['population_variant_file'] + '.tbi' shell: - 'gatk IndexFeatureFile ' - '--feature-file {input} ' - '> {output}' + 'tabix -p vcf ' + '{input} ' rule index_known_snp_files: @@ -310,23 +268,20 @@ rule apply_base_recalibration: bqsr_table=c.patterns['baserecalibration']['bqsrtable'] output: c.patterns['baserecalibration']['bam'] - shell: - 'gatk ApplyBQSR ' - '-I {input.bam} ' - '-O {output} ' - '-R {input.fasta} ' - '--bqsr-recal-file {input.bqsr_table} ' - '--java-options "-Xmx8g -Djava.io.tmpdir=$TMPDIR"' + run: + shell( + 'gatk ApplyBQSR ' + '-I {input.bam} ' + '-O {output} ' + '-R {input.fasta} ' + '--bqsr-recal-file {input.bqsr_table} ' + '--java-options "-Xmx8g -Djava.io.tmpdir=$TMPDIR"' + ) + # The created index doesn't work with downstream, and will be + # re-created later. 
+ index = output[0].replace('.bam', '.bai') + shell('rm {index}') -rule index_bams_2: - input: - bam=c.patterns['baserecalibration']['bam'], - old_index=c.patterns['baserecalibration']['bam'].replace('.bam', '.bai') - output: - c.patterns['baserecalibration']['bam'] + '.bai' - shell: - 'samtools index {input.bam}' - '&& rm {input.old_index}' rule get_pileup_summaries: """ @@ -336,9 +291,10 @@ rule get_pileup_summaries: bam=c.patterns['baserecalibration']['bam'], index = c.patterns['baserecalibration']['bam'] + '.bai', PV=config['population_variant_file'], + PV_index=config['population_variant_file'] + '.tbi', exome_capture=config['exome_capture'] output: - c.patterns['pileupsummaries'] + c.patterns['pileupsummaries']['bam'] shell: 'gatk GetPileupSummaries ' '-L {input.exome_capture} ' @@ -347,7 +303,7 @@ rule get_pileup_summaries: '--variant {input.PV} ' '--java-options "-Xmx8g -Djava.io.tmpdir=$TMPDIR"' -if config['tumor_only']: +if c.tumoronly: rule tumoronly_mutect2: input: tumor=c.patterns['baserecalibration']['tumor'], @@ -360,17 +316,17 @@ if config['tumor_only']: output: temp(c.patterns['mutect2']['snp']) run: - if config['tumor_only']: - shell( - 'gatk Mutect2 ' - '-R {input.fasta} ' - '-I {input.tumor} ' - '--intervals {input.exome_capture} ' - '-O {output} ' - '--germline-resource {input.PV} ' - # '--panel-of-normals {input.PoN} ' - '--java-options "-Xmx8g -Djava.io.tmpdir=$TMPDIR"' - ) + shell( + 'gatk Mutect2 ' + '-R {input.fasta} ' + '-I {input.tumor} ' + '--intervals {input.exome_capture} ' + '--independent-mates ' + '-O {output} ' + '--germline-resource {input.PV} ' +# '--panel-of-normals {input.PoN} ' + '--java-options "-Xmx8g -Djava.io.tmpdir=$TMPDIR"' + ) rule tumoronly_calculate_contamination: input: @@ -420,7 +376,9 @@ else: rule configure_strelka: input: normal=c.patterns['baserecalibration']['normal'], + normal_index=c.patterns['baserecalibration']['normal'] + '.bai', tumor=c.patterns['baserecalibration']['tumor'], + 
tumor_index=c.patterns['baserecalibration']['tumor'] + '.bai', fasta=config['fasta'] output: c.patterns['strelka']['script'] @@ -499,6 +457,7 @@ else: '-R {input.fasta} ' '-I {input.tumor} ' '-I {input.normal} ' + '--independent-mates ' '--intervals {input.exome_capture} ' '-O {output} ' '--germline-resource {input.PV} ' @@ -595,7 +554,7 @@ rule get_mutect2_passed_only_variants: '> {output}') -if not config['tumor_only']: +if not c.tumoronly: # the varscan and somatic sniper vcfs have a common header line in slightly # different formats which is messing with the merging, so this rule creates # a varscan vcf with that header line replaced with the somatic sniper version. @@ -653,8 +612,8 @@ if not config['tumor_only']: varscan_tbi=c.patterns['varscan']['bgzipped'] + '.tbi', strelka=c.patterns['strelka']['bgzipped'], strelka_tbi=c.patterns['strelka']['bgzipped'] + '.tbi', - somaticsniper= c.patterns['strelka']['bgzipped'], - somaticsniper_tbi=c.patterns['strelka']['bgzipped'] + '.tbi' + somaticsniper= c.patterns['somaticsniper']['bgzipped'], + somaticsniper_tbi=c.patterns['somaticsniper']['bgzipped'] + '.tbi' shell: 'bgzip {input.mutect2} ' '&& tabix -p vcf {output.mutect2} ' @@ -721,7 +680,7 @@ rule snpeff_annotation: """ input: vcf=lambda wildcards: (c.patterns['mutect2']['passedonly'] - if config['tumor_only'] + if c.tumoronly else c.patterns['combinecallers']['gatk']) output: stats=c.patterns['snpeff']['stats'], @@ -770,7 +729,8 @@ rule pull_LOH_calls: rule pull_intersect_calls: """ - This rule pulls out calls that were called by all variant callers and puts them into their own file + This rule pulls out calls that were called by all variant callers and puts + them into their own file """ input: c.patterns['dbnsfp'] @@ -781,7 +741,7 @@ rule pull_intersect_calls: rule get_bed_from_gtf: input: - config['gtf'] + c.refdict[c.organism][config['gtf']['tag']]['gtf'] output: 'bed_from_gtf.bed' shell: @@ -794,7 +754,7 @@ rule filter_out_singlecalled: output: 
c.patterns['nosinglecalled'] script: - 'filter-out-singlecalled.py' + 'scripts/filter-out-singlecalled.py' rule pull_somatic_calls: @@ -826,7 +786,7 @@ rule graph_variant_distribution_across_genes: c.patterns['genedist']['histogram'], c.patterns['genedist']['somaticonlyhistogram'] script: - 'graph_snp_dist.py' + 'scripts/graph_snp_dist.py' diff --git a/workflows/WES-somatic/config/WES_patterns.yaml b/workflows/WES-somatic/config/WES_patterns.yaml index 4b2b5482..ab03e8c6 100644 --- a/workflows/WES-somatic/config/WES_patterns.yaml +++ b/workflows/WES-somatic/config/WES_patterns.yaml @@ -1,11 +1,9 @@ -fastq: - combined: 'data/{sample}/{sample}.{genotype}.r{n}.fastq.gz' - tumor: 'data/{sample}/{sample}.tumor.r{n}.fastq.gz' - normal: 'data/{sample}/{sample}.normal.r{n}.fastq.gz' +fastq: 'data/{sample}/{sample}.{genotype}.r{n}.fastq.gz' cutadapt: 'data/{sample}/{sample}.{genotype}.r{n}.cutadapt.fastq.gz' bam: 'data/{sample}/{sample}.{genotype}.cutadapt.bam' fastqc: - cutadapt: 'data//{sample}/fastqc/{sample}.{genotype}.R1.cutadapt.fastq.gz_fastqc.zip' + raw: 'data/{sample}/fastqc/{sample}.{genotype}.r1.fastq.gz_fastqc.zip' + cutadapt: 'data/{sample}/fastqc/{sample}.{genotype}.r1.cutadapt.fastq.gz_fastqc.zip' bam: 'data/{sample}/fastqc/{sample}.{genotype}.cutadapt.bam_fastqc.zip' #fastq_screen: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.screen.txt' #featurecounts: 'data/rnaseq_aggregation/featurecounts.txt' @@ -39,7 +37,7 @@ strelka: somaticsniper: snp: 'data/{sample}/{sample}.somaticsniper.vcf' ambigremoved: 'data/{sample}/{sample}.somaticsniper.ambig_removed.vcf' - bgzipped: 'data/{sample}/{sample}.somaticsniper.ambig_removed.vcf' + bgzipped: 'data/{sample}/{sample}.somaticsniper.ambig_removed.vcf.gz' mutect2: snp: 'data/{sample}/{sample}.mutect2.vcf' contamination: 'data/{sample}/{sample}.contamination.table' @@ -47,10 +45,10 @@ mutect2: sequencingmetrics: 'data/{sample}/{sample}.seqartifactmetrics.pre_adapter_detail_metrics' twicefiltered: 
'data/{sample}/{sample}.mutect2.twicefiltered.vcf' passedonly: 'data/{sample}/{sample}.mutect2.passedonly.vcf' - bgzipped: 'data/{sample}/{sample}/mutect2.passedonly.vcf.gz' + bgzipped: 'data/{sample}/{sample}.mutect2.passedonly.vcf.gz' combinecallers: gatk: 'data/{sample}/{sample}.merged-callers.vcf.gz' - vcftools: 'data/{sample}/sample}.vcftools-merged-callers.vcf' + vcftools: 'data/{sample}/{sample}.vcftools-merged-callers.vcf' snpeff: stats: 'data/{sample}/{sample}.snpeff_stats.html' vcf: 'data/{sample}/{sample}.snpeff.vcf' diff --git a/workflows/WES-somatic/config/config.yaml b/workflows/WES-somatic/config/config.yaml index 35ed40e7..63432094 100644 --- a/workflows/WES-somatic/config/config.yaml +++ b/workflows/WES-somatic/config/config.yaml @@ -1,4 +1,11 @@ -paired_end: True +sampletable: 'config/sampletable.tsv' + +organism: 'human' + +references_dir: 'references_data' + +gtf: + tag: 'gencode-v28' genome: 'hg38' @@ -16,10 +23,7 @@ population_variant_file: 'somatic-hg38_af-only-gnomad.hg38.vcf.gz' dbNSFP_file: 'dbNSFP2.9.txt.gz' -gtf: '/data/NICHD-core0/references/human/gencode-v28/gtf/human_gencode-v28.gtf' - sequencing_platform: 'Illumina' -tumor_only: False - - +include_references: + - '../../include/reference_configs/Homo_sapiens.yaml' diff --git a/workflows/WES-somatic/config/sampletable.tsv b/workflows/WES-somatic/config/sampletable.tsv index 8c90edfc..d39f430d 100644 --- a/workflows/WES-somatic/config/sampletable.tsv +++ b/workflows/WES-somatic/config/sampletable.tsv @@ -1,30 +1,6 @@ -sample_id tumor_filename normal_filename tumor_R2_filename normal_R2_filename -A27LT10a_vs_A27LT7a /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT10a/A27LT10a_FKDN190645658-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT7a/A27LT7a_FKDN190645657-1A_HWGTCCCXY_L3_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT10a/A27LT10a_FKDN190645658-1A_lanes_catted_2.fq.gz 
/data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT7a/A27LT7a_FKDN190645657-1A_HWGTCCCXY_L3_2.fq.gz -A27LT10a_vs_A27RT18a /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT10a/A27LT10a_FKDN190645658-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT18a/A27RT18a_FKDN190645659-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT10a/A27LT10a_FKDN190645658-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT18a/A27RT18a_FKDN190645659-1A_lanes_catted_2.fq.gz -A27LT10a_vs_A27RT19a /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT10a/A27LT10a_FKDN190645658-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT19a/A27RT19a_FKDN190645660-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT10a/A27LT10a_FKDN190645658-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT19a/A27RT19a_FKDN190645660-1A_lanes_catted_2.fq.gz -A27LT7a_vs_A27LT10a /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT7a/A27LT7a_FKDN190645657-1A_HWGTCCCXY_L3_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT10a/A27LT10a_FKDN190645658-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT7a/A27LT7a_FKDN190645657-1A_HWGTCCCXY_L3_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT10a/A27LT10a_FKDN190645658-1A_lanes_catted_2.fq.gz -A27LT7a_vs_A27RT18a /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT7a/A27LT7a_FKDN190645657-1A_HWGTCCCXY_L3_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT18a/A27RT18a_FKDN190645659-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT7a/A27LT7a_FKDN190645657-1A_HWGTCCCXY_L3_2.fq.gz 
/data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT18a/A27RT18a_FKDN190645659-1A_lanes_catted_2.fq.gz -A27LT7a_vs_A27RT19a /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT7a/A27LT7a_FKDN190645657-1A_HWGTCCCXY_L3_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT19a/A27RT19a_FKDN190645660-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT7a/A27LT7a_FKDN190645657-1A_HWGTCCCXY_L3_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT19a/A27RT19a_FKDN190645660-1A_lanes_catted_2.fq.gz -A27RT18a_vs_A27LT7a /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT18a/A27RT18a_FKDN190645659-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT7a/A27LT7a_FKDN190645657-1A_HWGTCCCXY_L3_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT18a/A27RT18a_FKDN190645659-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT7a/A27LT7a_FKDN190645657-1A_HWGTCCCXY_L3_2.fq.gz -A27RT18a_vs_A27LT10a /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT18a/A27RT18a_FKDN190645659-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT10a/A27LT10a_FKDN190645658-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT18a/A27RT18a_FKDN190645659-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT10a/A27LT10a_FKDN190645658-1A_lanes_catted_2.fq.gz -A27RT18a_vs_A27RT19a /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT18a/A27RT18a_FKDN190645659-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT19a/A27RT19a_FKDN190645660-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT18a/A27RT18a_FKDN190645659-1A_lanes_catted_2.fq.gz 
/data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT19a/A27RT19a_FKDN190645660-1A_lanes_catted_2.fq.gz -A27RT19a_vs_A27LT7a /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT19a/A27RT19a_FKDN190645660-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT7a/A27LT7a_FKDN190645657-1A_HWGTCCCXY_L3_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT19a/A27RT19a_FKDN190645660-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT7a/A27LT7a_FKDN190645657-1A_HWGTCCCXY_L3_2.fq.gz -A27RT19a_vs_A27LT10a /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT19a/A27RT19a_FKDN190645660-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT10a/A27LT10a_FKDN190645658-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT19a/A27RT19a_FKDN190645660-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27LT10a/A27LT10a_FKDN190645658-1A_lanes_catted_2.fq.gz -A27RT19a_vs_A27RT18a /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT19a/A27RT19a_FKDN190645660-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT18a/A27RT18a_FKDN190645659-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT19a/A27RT19a_FKDN190645660-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A27RT18a/A27RT18a_FKDN190645659-1A_lanes_catted_2.fq.gz -A368LT2a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_2.fq.gz 
/data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_2.fq.gz -A368LT3a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_2.fq.gz -A368RT22a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_2.fq.gz -A368RT3a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_2.fq.gz -A368LT2a_vs_A368LT3a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_2.fq.gz 
/data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_2.fq.gz -A368LT2a_vs_A368RT3a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_2.fq.gz -A368LT2a_vs_A368RT22a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_2.fq.gz -A368LT3a_vs_A368LT2a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_2.fq.gz -A368LT3a_vs_A368RT3a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_1.fq.gz 
/data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_2.fq.gz -A368LT3a_vs_A368RT22a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_2.fq.gz -A368RT22a_vs_A368LT3a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_2.fq.gz -A368RT22a_vs_A368LT2a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_2.fq.gz -A368RT22a_vs_A368RT3a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_1.fq.gz 
/data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_2.fq.gz -A368RT3a_vs_A368LT3a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_2.fq.gz -A368RT3a_vs_A368LT2a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_2.fq.gz -A368RT3a_vs_A368RT22a /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_2.fq.gz +sample_id layout tumor_filename normal_filename tumor_R2_filename normal_R2_filename +A368LT2a PE 
/data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_2.fq.gz +A368LT3a PE /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_2.fq.gz +A368RT22a PE /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_2.fq.gz +A368RT3a PE /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_2.fq.gz From ab8032499b614ca65f818fcd181d6600b3b3d2eb Mon 
Sep 17 00:00:00 2001 From: Sydney Rose Hertafeld Date: Wed, 13 Nov 2019 11:41:45 -0500 Subject: [PATCH 3/4] Switched merging to bcftools, added normalization --- workflows/WES-somatic/Snakefile | 403 ++++++++++++------ .../WES-somatic/config/WES_patterns.yaml | 12 +- 2 files changed, 278 insertions(+), 137 deletions(-) diff --git a/workflows/WES-somatic/Snakefile b/workflows/WES-somatic/Snakefile index 757d482b..8f15de07 100644 --- a/workflows/WES-somatic/Snakefile +++ b/workflows/WES-somatic/Snakefile @@ -8,6 +8,7 @@ import tempfile import pandas as pd from lib import common, cluster_specific, utils, helpers, aligners from lib.patterns_targets import WESConfig +from cyvcf2 import VCF, Writer if not workflow.overwrite_configfile: configfile: 'config/config.yaml' @@ -37,7 +38,7 @@ fasta_prefix = os.path.basename(config['fasta']) final_targets = utils.flatten(( utils.flatten(c.targets['baserecalibration']), - utils.flatten(c.targets['combinecallers']), + c.targets['combinecallers']['bcftools'], utils.flatten(c.targets['fastqc']), [c.targets['loh']], [c.targets['intersect']], @@ -172,6 +173,15 @@ rule index_bams: shell: 'samtools index {input}' +rule index_vcfs: + input: + '{prefix}.vcf.gz' + output: + '{prefix}.vcf.gz.tbi' + shell: + 'tabix -p vcf {input}' + + rule mark_duplicates: input: bam=c.patterns['bam'], @@ -207,24 +217,15 @@ rule get_fasta_dict: shell: 'picard CreateSequenceDictionary REFERENCE={input} OUTPUT={output}' -rule index_population_variant_file: - input: - config['population_variant_file'] - output: - config['population_variant_file'] + '.tbi' - shell: - 'tabix -p vcf ' - '{input} ' - -rule index_known_snp_files: - # NOTE: the known snp files need to be bgzipped rather than gzipped for this rule to work - input: - '{known_sites_file}' - output: - '{known_sites_file}.tbi' - shell: - 'gatk IndexFeatureFile -F {input}' +#rule index_known_snp_files: +# # NOTE: the known snp files need to be bgzipped rather than gzipped for this rule to work +# input: + 
# '{known_sites_file}' + # output: + # '{known_sites_file}.tbi' + # shell: + # 'gatk IndexFeatureFile -F {input}' def get_known_sites_str(): known_sites_string='' @@ -548,57 +549,87 @@ rule get_mutect2_passed_only_variants: c.patterns['mutect2']['twicefiltered'] output: c.patterns['mutect2']['passedonly'] - run: - shell('bcftools view -i "FILTER=\'PASS\'" ' - '{input} ' - '> {output}') - + shell: + 'bcftools view -i "FILTER=\'PASS\'" ' + '{input} ' + '> {output}' -if not c.tumoronly: - # the varscan and somatic sniper vcfs have a common header line in slightly - # different formats which is messing with the merging, so this rule creates - # a varscan vcf with that header line replaced with the somatic sniper version. - rule adjust_varscan_heading: - input: - varscan=c.patterns['varscan']['snp'], - somaticsniper=c.patterns['somaticsniper']['snp'] - output: - temp(c.patterns['varscan']['headingadjust']) - shell: - 'DP4_line=$(grep ' - '\"FORMAT= {output} ' - - rule remove_ambiguous_calls: - # varscan and somatic sniper will include ambiguous calls (e.g. ref - # column entry is 'W'). This rule removes those calls, as they cause - # issues in the 'combine_variants' rule +if c.tumoronly: + rule process_mutect2_calls: + """ + Normalizes and left aligns the mutect2 calls, as well as splits out the + multiallelic calls. This is the tumor only counterpart to the rule + below called 'process_vcfs_for_merging'. The only difference is that + this rule doesn't prepend the caller name to the INFO and FORMAT fields + since the tumor only pipeline only has one caller so there's no caller + merging to be done. 
+ """ input: - varscan=c.patterns['varscan']['headingadjust'], - somaticsniper=c.patterns['somaticsniper']['snp'] + vcf=c.patterns['mutect2']['passedonly'], + fasta=config['fasta'] output: - varscan=c.patterns['varscan']['ambigremoved'], - somaticsniper=c.patterns['somaticsniper']['ambigremoved'] + c.patterns['mutect2']['processed'] shell: - 'awk "\$1 ~ /^#/ {{print \$0;next}} ' - '{{if (\$4 ~ /A|C|T|G/ ' - '&& \$5 ~ /A|C|T|G/) ' - 'print \$0}}" ' - '{input.varscan} ' - '> {output.varscan} ' - '&& awk "\$1 ~ /^#/ {{print \$0;next}} ' - '{{if (\$4 ~ /A|C|T|G/ ' - '&& \$5 ~ /A|C|T|G/) ' - 'print \$0}}" ' - '{input.somaticsniper} ' - '> {output.somaticsniper} ' + 'bcftools norm ' + '-f {input.fasta} ' + '--multiallelics -both ' + '-o {output} ' + '{input}' +else: + +#NOTE: Commented out for now to see if these steps are still necessary when +#using bcftools instead of combine variants + +# rule adjust_varscan_heading: +# """ +# the varscan and somatic sniper vcfs have a common header line in slightly +# different formats which is messing with the merging, so this rule creates +# a varscan vcf with that header line replaced with the somatic sniper version. +# """ +# input: +# varscan=c.patterns['varscan']['snp'], +# somaticsniper=c.patterns['somaticsniper']['snp'] +# output: +# temp(c.patterns['varscan']['headingadjust']) +# shell: +# 'DP4_line=$(grep ' +# '\"FORMAT= {output} ' + +# rule remove_ambiguous_calls: +# """ +# varscan and somatic sniper will include ambiguous calls (e.g. ref +# column entry is 'W'). 
This rule removes those calls, as they cause +# issues in the 'combine_variants' rule +# """ +# input: +# varscan=c.patterns['varscan']['headingadjust'], +# somaticsniper=c.patterns['somaticsniper']['snp'] +# output: +# varscan=c.patterns['varscan']['ambigremoved'], +# somaticsniper=c.patterns['somaticsniper']['ambigremoved'] +# shell: +# 'awk "\$1 ~ /^#/ {{print \$0;next}} ' +# '{{if (\$4 ~ /A|C|T|G/ ' +# '&& \$5 ~ /A|C|T|G/) ' +# 'print \$0}}" ' +# '{input.varscan} ' +# '> {output.varscan} ' +# '&& awk "\$1 ~ /^#/ {{print \$0;next}} ' +# '{{if (\$4 ~ /A|C|T|G/ ' +# '&& \$5 ~ /A|C|T|G/) ' +# 'print \$0}}" ' +# '{input.somaticsniper} ' +# '> {output.somaticsniper} ' + + rule bgzip_vcfs: input: mutect2=c.patterns['mutect2']['passedonly'], @@ -607,71 +638,195 @@ if not c.tumoronly: somaticsniper= c.patterns['somaticsniper']['ambigremoved'] output: mutect2=c.patterns['mutect2']['bgzipped'], - mutect2_tbi=c.patterns['mutect2']['bgzipped'] + '.tbi', varscan=c.patterns['varscan']['bgzipped'], - varscan_tbi=c.patterns['varscan']['bgzipped'] + '.tbi', strelka=c.patterns['strelka']['bgzipped'], - strelka_tbi=c.patterns['strelka']['bgzipped'] + '.tbi', somaticsniper= c.patterns['somaticsniper']['bgzipped'], - somaticsniper_tbi=c.patterns['somaticsniper']['bgzipped'] + '.tbi' shell: 'bgzip {input.mutect2} ' - '&& tabix -p vcf {output.mutect2} ' '&& bgzip {input.varscan} ' - '&& tabix -p vcf {output.varscan} ' '&& bgzip {input.strelka} ' - '&& tabix -p vcf {output.strelka} ' - '&& bgzip {input.somaticsniper} ' - '&& tabix -p vcf {output.somaticsniper}' + '&& bgzip {input.somaticsniper} ' - rule combine_variants: + rule process_vcfs_for_merging: + """ + Normalizes and left aligns the calls from strelka, varscan, and + somaticsniper (which are only run in non tumor-only mode), as well as + splits out the multiallelic calls. Also prepends the caller name to the + INFO and FORMAT fields for each vcf to prepare for merging. 
+ + NOTE: This might be better off in its own script, so I copied it into + a script called 'scripts/process_vcfs_for_merging.sh' which is all + ready to go in case that ends up being the better option. + """ input: - mutect2=c.patterns['mutect2']['bgzipped'], varscan=c.patterns['varscan']['bgzipped'], + varscan_index=c.patterns['varscan']['bgzipped'] + '.tbi', strelka=c.patterns['strelka']['bgzipped'], - somaticsniper= c.patterns['strelka']['bgzipped'], + strelka_index=c.patterns['strelka']['bgzipped'] + '.tbi', + somaticsniper=c.patterns['somaticsniper']['bgzipped'], + somaticsniper_index=c.patterns['somaticsniper']['bgzipped'] + '.tbi', + mutect2=c.patterns['mutect2']['bgzipped'], + mutect2_index= c.patterns['mutect2']['bgzipped'] + '.tbi', fasta=config['fasta'] output: - c.patterns['combinecallers']['gatk'] - conda: - 'envs/gatk3.yml' + varscan=c.patterns['varscan']['processed'], + strelka=c.patterns['strelka']['processed'], + somaticsniper=c.patterns['somaticsniper']['processed'], + mutect2=c.patterns['mutect2']['processed'] + run: + unzipped_varscan = output.varscan.rstrip('.gz') + unzipped_strelka = output.strelka.rstrip('.gz') + unzipped_somaticsniper = output.somaticsniper.rstrip('.gz') + unzipped_mutect2 = output.mutect2.rstrip('.gz') + shell( + # Normalize and prepend INFO and FORMAT fields for Varscan VCF + 'bcftools norm ' + '-f {input.fasta} ' + '--multiallelics -both ' + '{input.varscan} ' + '| sed "s/INFO\tFORMAT\tNORMAL\tTUMOR' + '/VARSCAN_INFO\tVARSCAN_FORMAT\tVARSCAN_NORMAL\tVARSCAN_TUMOR/g" ' + # '| sed "s/FORMAT/VARSCAN_FORMAT/g" ' + '> {unzipped_varscan} ' + '&& bgzip {unzipped_varscan}' + # Normalize and prepend INFO and FORMAT fields for Strelka VCF + '&& bcftools norm ' + '-f {input.fasta} ' + '--multiallelics -both ' + '{input.strelka} ' + '| sed "s/INFO\tFORMAT\tNORMAL\tTUMOR' + '/STRELKA_INFO\tSTRELKA_FORMAT\tSTRELKA_NORMAL\tSTRELKA_TUMOR/g" ' + # '| sed "s/FORMAT/STRELKA_INFO/g" ' + '> {unzipped_strelka} ' + '&& bgzip 
{unzipped_strelka}' + # Normalize and prepend INFO and FORMAT fields SomaticSniper VCF + '&& bcftools norm ' + '-f {input.fasta} ' + '--multiallelics -both ' + '{input.somaticsniper} ' + '| sed "s/INFO\tFORMAT\tNORMAL\tTUMOR' + '/SOMATICSNIPER_INFO\tSOMATICSNIPER_FORMAT\tSOMATICSNIPER_NORMAL\tSOMATICSNIPER_TUMOR/g" ' + # '| sed "s/FORMAT/SOMATICSNIPER_FORMAT/g" ' + '> {unzipped_somaticsniper}' + '&& bgzip {unzipped_somaticsniper}' + # Normalize and prepend INFO and FORMAT fields Mutect2 VCF + '&& bcftools norm ' + '-f {input.fasta} ' + '--multiallelics -both ' + '{input.mutect2} ' + '| bcftools annotate -x FORMAT/OBAM,FORMAT/OBAMRC' + '| sed "s/INFO\tFORMAT\tNORMAL\tTUMOR' + '/MUTECT2_INFO\tMUTECT2_FORMAT\tMUTECT2_NORMAL\tMUTECT2_TUMOR/g" ' + # '| sed "s/FORMAT/MUTECT2_FORMAT/g" ' + '> {unzipped_mutect2}' + '&& bgzip {unzipped_mutect2}' + ) + + + + +# rule combine_variants: +# input: +# mutect2=c.patterns['mutect2']['processed'], +# varscan=c.patterns['varscan']['processed'], +# strelka=c.patterns['strelka']['processed'], +# somaticsniper= c.patterns['strelka']['processed'], +# fasta=config['fasta'] +# output: +# c.patterns['combinecallers']['gatk'] +# conda: +# 'envs/gatk3.yml' +# shell: +# 'GenomeAnalysisTK ' +# '-Xmx4g ' +# '-T CombineVariants ' +# '-R {input.fasta} ' +# '--variant:mutect2 {input.mutect2} ' +# '--variant:varscan {input.varscan} ' +# '--variant:strelka {input.strelka} ' +# '--variant:somaticsniper {input.somaticsniper} ' +# '-o {output} ' +# '-genotypeMergeOptions PRIORITIZE ' +# '-priority mutect2,varscan,strelka,somaticsniper' +# '|| gatk3-register GenomeAnalysisTK-3.8-0-ge9d806836.tar.bz2 ' +# '&& GenomeAnalysisTK ' +# '-Xmx4g ' +# '-T CombineVariants ' +# '-R {input.fasta} ' +# '--variant:mutect2 {input.mutect2} ' +# '--variant:varscan {input.varscan} ' +# '--variant:strelka {input.strelka} ' +# '--variant:somaticsniper {input.somaticsniper} ' +# '-o {output} ' +# '-genotypeMergeOptions PRIORITIZE ' +# '-priority 
mutect2,varscan,strelka,somaticsniper' +# +# rule vcftools_merge: +# input: +# mutect2=c.patterns['mutect2']['processed'], +# varscan=c.patterns['varscan']['processed'], +# strelka=c.patterns['strelka']['processed'], +# somaticsniper= c.patterns['strelka']['processed'] +# output: +# c.patterns['combinecallers']['vcftools'] +# shell: +# 'vcftools merge ' +# '{input} ' +# '> {output}' + + rule bcftools_merge: + """ + Merge the vcfs from all four callers, keeping the duplicate genotype + columns (each will be prepended with an index number) and maintaining + split multiallelics. + """ + input: + mutect2=c.patterns['mutect2']['processed'], + mutect2_index=c.patterns['mutect2']['processed'] + '.tbi', + varscan=c.patterns['varscan']['processed'], + varscan_index=c.patterns['varscan']['processed'] + '.tbi', + strelka=c.patterns['strelka']['processed'], + strelka_index=c.patterns['strelka']['processed'] + '.tbi', + somaticsniper= c.patterns['somaticsniper']['processed'], + somaticsniper_index= c.patterns['somaticsniper']['processed'] + '.tbi' + output: + c.patterns['combinecallers']['bcftools'] shell: - 'GenomeAnalysisTK ' - '-Xmx4g ' - '-T CombineVariants ' - '-R {input.fasta} ' - '--variant:mutect2 {input.mutect2} ' - '--variant:varscan {input.varscan} ' - '--variant:strelka {input.strelka} ' - '--variant:somaticsniper {input.somaticsniper} ' - '-o {output} ' - '-genotypeMergeOptions PRIORITIZE ' - '-priority mutect2,varscan,strelka,somaticsniper' - '|| gatk3-register GenomeAnalysisTK-3.8-0-ge9d806836.tar.bz2 ' - '&& GenomeAnalysisTK ' - '-Xmx4g ' - '-T CombineVariants ' - '-R {input.fasta} ' - '--variant:mutect2 {input.mutect2} ' - '--variant:varscan {input.varscan} ' - '--variant:strelka {input.strelka} ' - '--variant:somaticsniper {input.somaticsniper} ' + 'bcftools merge ' +# '--force-samples ' + '-m none ' '-o {output} ' - '-genotypeMergeOptions PRIORITIZE ' - '-priority mutect2,varscan,strelka,somaticsniper' + '{input.mutect2} ' + '{input.varscan} ' + 
'{input.strelka} ' + '{input.somaticsniper} ' - rule vcftools_merge: + rule filter_out_singlecalled: + "Filters out calls made by only one variant caller" input: - mutect2=c.patterns['mutect2']['bgzipped'], - varscan=c.patterns['varscan']['bgzipped'], - strelka=c.patterns['strelka']['bgzipped'], - somaticsniper= c.patterns['strelka']['bgzipped'] + c.patterns['combinecallers']['bcftools'] output: - c.patterns['combinecallers']['vcftools'] - shell: - 'vcftools merge ' - '{input} ' - '> {output}' + c.patterns['nosinglecalled'] + #script: + # 'scripts/filter-out-singlecalled.py' + run: + vcf_reader = VCF(input[0]) + vcf_writer = Writer(output[0], vcf_reader) + for record in vcf_reader: + # the 'genotypes' attribute for a cyverse record lists '-1' for the + # alleles if the record was not called. This can help up filter out + # records that were called by only one caller. + try: + record.genotypes + # Strelka doesn't output a genotype field, so for cases where + # only Strelka called a variant, cyVCF will complain that it + # can't find a genotype for it. These calls should get filtered + # out anyway since they've only been called by one caller. 
+ except Exception: + continue + if len([x for x in record.genotypes if x[:2] != [-1, -1]]) > 2: + vcf_writer.write_record(record) + vcf_writer.close() rule snpeff_annotation: @@ -679,9 +834,9 @@ rule snpeff_annotation: Add gene name and id and structural/functional predictions to the vcf """ input: - vcf=lambda wildcards: (c.patterns['mutect2']['passedonly'] + vcf=lambda wildcards: (c.patterns['mutect2']['processed'] if c.tumoronly - else c.patterns['combinecallers']['gatk']) + else c.patterns['nosinglecalled']) output: stats=c.patterns['snpeff']['stats'], vcf=c.patterns['snpeff']['vcf'] @@ -727,17 +882,6 @@ rule pull_LOH_calls: shell: 'grep -E "(#|SS=3)" {input} > {output}' -rule pull_intersect_calls: - """ - This rule pulls out calls that were called by all variant callers and puts - them into their own file - """ - input: - c.patterns['dbnsfp'] - output: - c.patterns['intersect'] - shell: - 'grep -E "(#|set=Intersection)" {input} > {output}' rule get_bed_from_gtf: input: @@ -747,20 +891,11 @@ rule get_bed_from_gtf: shell: 'grep -w "gene" {input} | cut -f 1,4,5 > {output}' -rule filter_out_singlecalled: - "Filters out calls made by only one variant caller" - input: - c.patterns['dbnsfp'] - output: - c.patterns['nosinglecalled'] - script: - 'scripts/filter-out-singlecalled.py' - rule pull_somatic_calls: "pulls out calls deemed 'somatic' by the callers" input: - c.patterns['nosinglecalled'] + c.patterns['dbnsfp'] output: c.patterns['somaticonly'] shell: @@ -769,7 +904,7 @@ rule pull_somatic_calls: rule intersect_gtf_bed_and_vcf: input: gtf_bed='bed_from_gtf.bed', - vcf=c.patterns['nosinglecalled'], + vcf=c.patterns['dbnsfp'], somaticonly_vcf=c.patterns['somaticonly'] output: bed=c.patterns['genedist']['bed'], diff --git a/workflows/WES-somatic/config/WES_patterns.yaml b/workflows/WES-somatic/config/WES_patterns.yaml index ab03e8c6..77917158 100644 --- a/workflows/WES-somatic/config/WES_patterns.yaml +++ b/workflows/WES-somatic/config/WES_patterns.yaml @@ -27,6 
+27,8 @@ varscan: headingadjust: 'data/{sample}/{sample}.varscan.heading_adj.snp.vcf' ambigremoved: 'data/{sample}/{sample}.varscan.heading_adj.ambig_removed.vcf' bgzipped: 'data/{sample}/{sample}.varscan.heading_adj.ambig_removed.vcf.gz' + processed: 'data/{sample}/{sample}.varscan.heading_adj.ambig_removed.processed.vcf.gz' + strelka: script: 'data/{sample}/runWorkflow.py' snp: 'data/{sample}/{sample}.strelka.snp.vcf' @@ -34,10 +36,12 @@ strelka: passedonlysnp: 'data/{sample}/{sample}.strelka.passedonly.snp.vcf' passedonlyindel: 'data/{sample}/{sample}.strelka.passedonly.indel.vcf' bgzipped: 'data/{sample}/{sample}.strelka.passedonly.snp.vcf.gz' + processed: 'data/{sample}/{sample}.strelka.passedonly.processed.snp.vcf.gz' somaticsniper: snp: 'data/{sample}/{sample}.somaticsniper.vcf' ambigremoved: 'data/{sample}/{sample}.somaticsniper.ambig_removed.vcf' bgzipped: 'data/{sample}/{sample}.somaticsniper.ambig_removed.vcf.gz' + processed: 'data/{sample}/{sample}.somaticsniper.ambig_removed.processed.vcf.gz' mutect2: snp: 'data/{sample}/{sample}.mutect2.vcf' contamination: 'data/{sample}/{sample}.contamination.table' @@ -46,17 +50,19 @@ mutect2: twicefiltered: 'data/{sample}/{sample}.mutect2.twicefiltered.vcf' passedonly: 'data/{sample}/{sample}.mutect2.passedonly.vcf' bgzipped: 'data/{sample}/{sample}.mutect2.passedonly.vcf.gz' + processed: 'data/{sample}/{sample}.mutect2.passedonly.processed.vcf.gz' combinecallers: - gatk: 'data/{sample}/{sample}.merged-callers.vcf.gz' + gatk: 'data/{sample}/{sample}.gatk-combine-variants.vcf.gz' vcftools: 'data/{sample}/{sample}.vcftools-merged-callers.vcf' + bcftools: 'data/{sample}/{sample}.bcftools-merged-callers.vcf' +nosinglecalled: 'data/{sample}/{sample}.no_singlecalled.vcf' snpeff: stats: 'data/{sample}/{sample}.snpeff_stats.html' vcf: 'data/{sample}/{sample}.snpeff.vcf' dbnsfp: 'data/{sample}/{sample}.snpeff.dbnsfp.vcf' loh: 'data/{sample}/{sample}.snpeff.dbnsfp.LOH_only.vcf' intersect: 
'data/{sample}/{sample}.snpeff.dbnsfp.intersect_only.vcf' -nosinglecalled: 'data/{sample}/{sample}.snpeff.dbnsfp.no_singlecalled.vcf' -somaticonly: 'data/{sample}/{sample}.snpeff.dbnsfp.no_singlecalled.somaticonly.vcf' +somaticonly: 'data/{sample}/{sample}.snpeff.dbnsfp.somaticonly.vcf' genedist: bed: 'data/{sample}/{sample}.variants-per-gene.bed' somaticonlybed: 'data/{sample}/{sample}.somatic-variants-per-gene.bed' From daf6a4723f3808553996e63db6be1e04f7eff3b7 Mon Sep 17 00:00:00 2001 From: daler Date: Sat, 7 Dec 2019 21:10:54 -0500 Subject: [PATCH 4/4] add scripts --- .../scripts/filter-out-singlecalled.py | 33 ++++++++++++ .../WES-somatic/scripts/graph_snp_dist.py | 29 +++++++++++ .../scripts/process_vcfs_for_merging.sh | 50 +++++++++++++++++++ 3 files changed, 112 insertions(+) create mode 100644 workflows/WES-somatic/scripts/filter-out-singlecalled.py create mode 100644 workflows/WES-somatic/scripts/graph_snp_dist.py create mode 100644 workflows/WES-somatic/scripts/process_vcfs_for_merging.sh diff --git a/workflows/WES-somatic/scripts/filter-out-singlecalled.py b/workflows/WES-somatic/scripts/filter-out-singlecalled.py new file mode 100644 index 00000000..79de9848 --- /dev/null +++ b/workflows/WES-somatic/scripts/filter-out-singlecalled.py @@ -0,0 +1,33 @@ + + +def filter_out_singlecalled(vcf, samplename, outfilename): + with open(outfilename, 'w') as fout: + for line in open(vcf): + if line[0] == '#': + fout.write(line) + else: + isec = ( + line.strip() + .split('\t') + [7].split('set=') + [1].split(';')[0] + ) + if '-' in isec or 'Intersection' in isec: + fout.write(line) + +filter_out_singlecalled(snakemake.input[0], snakemake.wildcards.sample, snakemake.output[0]) + +#if __name__ == '__main__': +# import argparse +# ap= argparse.ArgumentParser() +# ap.add_argument('vcf'), +# ap.add_argument('samplename') +# args=ap.parse_args() +# filter_out_singlecalled( +# args.vcf, +# args.samplename +# ) + + + + diff --git 
a/workflows/WES-somatic/scripts/graph_snp_dist.py b/workflows/WES-somatic/scripts/graph_snp_dist.py new file mode 100644 index 00000000..fa613a72 --- /dev/null +++ b/workflows/WES-somatic/scripts/graph_snp_dist.py @@ -0,0 +1,29 @@ +import pandas as pd +import matplotlib.pyplot as plt + +def graph_snp_dist(gtf_vcf_bed, somaticonly_gtf_vcf_bed, sample, outfilename, somaticonly_outfilename): + """ + This function will make a histogram of the distribution of variants across + all genes. It takes two vcfs as inputs, one full one and one that has only + the calls marked 'SOMATIC' by the variant callers. It also takes a genes + only bed file that has been converted to a gtf + """ + xlabel = 'Number of Snps in Gene' + ylabel = 'Genes' + df = pd.read_csv(gtf_vcf_bed, sep='\t') + plt.hist(df.iloc[:,3], 50) + plt.title(sample + ' Distribution of Snps across Genes') + plt.xlabel(xlabel) + plt.ylabel(ylabel) + plt.savefig(outfilename) + plt.close() + df = pd.read_csv(somaticonly_gtf_vcf_bed, sep='\t') + plt.hist(df.iloc[:,3], 50) + plt.title(sample + ' Distribution of Somatic Snps across Genes') + plt.xlabel(xlabel) + plt.ylabel(ylabel) + plt.savefig(somaticonly_outfilename) + plt.close() + + +graph_snp_dist(snakemake.input[0], snakemake.input[1], snakemake.wildcards.sample, snakemake.output[0], snakemake.output[1]) diff --git a/workflows/WES-somatic/scripts/process_vcfs_for_merging.sh b/workflows/WES-somatic/scripts/process_vcfs_for_merging.sh new file mode 100644 index 00000000..eec881e3 --- /dev/null +++ b/workflows/WES-somatic/scripts/process_vcfs_for_merging.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +""" +This script take in a vcf output from four different somatic variant callers: +strelka, somaticsniper, mutect2, and varscan. It normalizes and left aligns the +calls from each caller, and splits out multiallelic calls. It also prepends the +caller name to the INFO and FORMAT fields for each variant caller. 
+""" + + +# Normalize and prepend FORMAT and INFO fields for Varscan VCF +( +bcftools norm +-f snakemake.input[4] +--multiallelics -both +snakemake.input[0] +| sed "s/INFO\tFORMAT/VARSCAN_INFO\tVARSCAN_FORMAT/g" +> snakemake.output[0] +) + +# Normalize and prepend FORMAT and INFO fields for Strelka VCF +( +bcftools norm +-f snakemake.input[4] +--multiallelics -both +snakemake.input[1] +| sed "s/INFO\tFORMAT/STRELKA_INFO\tSTRELKA_FORMAT/g" +> snakemake.output[1] +) + +#Normalize and prepend FORMAT and INFO fields for SomaticSniper VCF +( +bcftools norm +-f snakemake.input[4] +--multiallelics -both +snakemake.input[2] +| sed "s/INFO\tFORMAT/SOMATICSNIPER_INFO\tSOMATICSNIPER_FORMAT/g" +> snakemake.output[2] +) + +#Normalize and prepend FORMAT and INFO fields for Mutect2 VCF +( +bcftools norm +-f snakemake.input[4] +--multiallelics -both +snakemake.input[3] +| sed "s/INFO\tFORMAT/MUTECT2_INFO\tMUTECT2_FORMAT/g" +> snakemake.output[3] +) +