diff --git a/lib/common.py b/lib/common.py
index 2e223346..358e6b26 100644
--- a/lib/common.py
+++ b/lib/common.py
@@ -677,6 +677,25 @@ def is_paired_end(sampletable, sample):
             pass
     return False
 
 
+def is_tumor_only(sampletable):
+    """
+    For somatic WES pipeline, inspects the sampletable to see if there are only
+    tumor files rather than paired tumor/normal. Assumes the presence of
+    a column named 'normal_filename' indicates that there are paired normal
+    samples for all tumor samples. Does not support mixing tumor/normal and
+    tumor only samples in the same sampletable.
+
+    Parameters
+    ----------
+    sampletable : pandas.DataFrame
+        Only contains columns called 'normal_filename' and (if paired end)
+        'normal_R2_filename' if there are paired normal samples for every tumor
+        sample. Does not support blank/NA 'normal_filename' columns.
+    """
+    if 'normal_filename' in sampletable.columns:
+        return False
+    return True
+
 def fill_r1_r2(sampletable, pattern, r1_only=False):
     """
diff --git a/lib/patterns_targets.py b/lib/patterns_targets.py
index 7d986b79..c7bab263 100644
--- a/lib/patterns_targets.py
+++ b/lib/patterns_targets.py
@@ -69,6 +69,35 @@ def __init__(self, config, patterns, workdir=None):
             self.n = [1]
 
 
+class WESConfig(SeqConfig):
+    def __init__(self, config, patterns, workdir=None):
+        """
+        Config object specific to WES workflows.
+
+        Fills in patterns to create targets
+
+        Parameters
+        ----------
+
+        config : dict
+
+        patterns : str
+            Path to patterns YAML file
+
+        workdir : str
+            Config, patterns, and all paths in `config` should be interpreted
+            as relative to `workdir`
+        """
+        SeqConfig.__init__(self, config, patterns, workdir)
+        self.tumoronly = common.is_tumor_only(self.sampletable)
+        if self.tumoronly:
+            self.genotype = ['tumor']
+        else:
+            self.genotype = ['tumor', 'normal']
+        self.fill = dict(sample=self.samples, genotype=self.genotype, n=self.n)
+        self.targets = helpers.fill_patterns(self.patterns, self.fill, zip)
+
+
 class RNASeqConfig(SeqConfig):
     def __init__(self, config, patterns, workdir=None):
         """
diff --git a/requirements.txt b/requirements.txt
index 56dca33f..abb9cc64 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 atropos
+bcftools
 bedtools
 bioconductor-annotationhub
 bioconductor-biocparallel
@@ -14,6 +15,7 @@ bioconductor-sva
 bioconductor-tximport
 biopython >=1.68
 bowtie2
+bwa
 cutadapt
 deeptools >=3.0.1
 fastqc
@@ -23,6 +25,7 @@
 fastq-screen
 font-ttf-dejavu-sans-mono
 gat
+gatk4
 gffutils >=0.8.7.1
 ghostscript
 git
@@ -69,6 +72,8 @@ samtools >=1.4.1
 scipy >=0.18.1
 seaborn >=0.7.1
 snakemake==5.5.4
+snpeff
+somatic-sniper
 sra-tools
 star
 subread
@@ -84,3 +89,5 @@ ucsc-liftover
 ucsc-oligomatch
 ucsc-twobittofa
 ucsc-wigtobigwig
+varscan
+vcftools
diff --git a/workflows/WES-somatic/Snakefile b/workflows/WES-somatic/Snakefile
new file mode 100644
index 00000000..8f15de07
--- /dev/null
+++ b/workflows/WES-somatic/Snakefile
@@ -0,0 +1,927 @@
+import sys
+
+sys.path.insert(0, srcdir('../..'))
+import os
+from textwrap import dedent
+import yaml
+import tempfile
+import pandas as pd
+from lib import common, cluster_specific, utils, helpers, aligners
+from lib.patterns_targets import WESConfig
+from cyvcf2 import VCF, Writer
+
+if not workflow.overwrite_configfile:
+    configfile: 'config/config.yaml'
+else:
+    configfile: workflow.overwrite_configfile
+
+include: '../references/Snakefile'
+
+shell.prefix(
+    'set -euo pipefail; export R_PROFILE_USER=; export TMPDIR={}'
+    .format(cluster_specific.tempdir_for_biowulf())
+)
+shell.executable('/bin/bash')
+
+config = 
common.load_config(config) + +c = WESConfig(config, config.get('patterns', 'config/WES_patterns.yaml')) + +wildcard_constraints: + n = '[1,2]' + +def wrapper_for(path): + return 'file:' + os.path.join('../..','wrappers','wrappers',path) + +known_sites_files=config['known_sites'] + +fasta_prefix = os.path.basename(config['fasta']) + +final_targets = utils.flatten(( + utils.flatten(c.targets['baserecalibration']), + c.targets['combinecallers']['bcftools'], + utils.flatten(c.targets['fastqc']), + [c.targets['loh']], + [c.targets['intersect']], + utils.flatten(c.targets['genedist']) +)) + +rule targets: + input: final_targets + +def render_r1_r2(pattern, r1_only=False): + return sorted(expand(pattern, sample='{sample}', genotype='{genotype}', n=c.n)) + + +if 'filename' in ''.join(c.sampletable.columns): + # Convert the sampletable to be indexed by the first column, for + # convenience in generating the input/output filenames. + _st = c.sampletable.set_index(c.sampletable.columns[0]) + def filenames_for_sample(wc): + if c.is_paired: + return _st.loc[wc.sample, [wc.genotype + '_filename', wc.genotype + '_R2_filename']] + else: + return _st.loc[wc.sample, [wc.genotype + '_filename']] + + rule symlinks: + input: + filenames_for_sample + output: + render_r1_r2(c.patterns['fastq']) + run: + assert len(output) == len(input), (input, output) + for src, linkname in zip(input, output): + utils.make_relative_symlink(src, linkname) + +rule cutadapt: + input: + render_r1_r2(c.patterns['fastq']) + output: + render_r1_r2(c.patterns['cutadapt']) +# log: +# render_r1_r2(c.patterns['cutadapt'][0] + '.log') + threads: 6 + run: + if c.is_paired: + shell( + 'cutadapt ' + '-o {output[0]} ' + '-p {output[1]} ' + '-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA ' + '-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT ' + '-q 20 ' + '-j {threads} ' + '--minimum-length 25 ' + '{input[0]} ' + '{input[1]} ' +# '&> {log}' + ) + else: + shell( + 'cutadapt ' + '-o {output[0]} ' + '-a AGATCGGAAGAGCACGTCTGAACTCCAGTCA ' + '-q 20 ' + '--minimum-length 25 ' + '{input[0]} ' +# '&> {log}' + ) +rule fastqc: + """ + Run FastQC + """ + input: + '{sample_dir}/{sample}/{sample}{suffix}' + output: + html='{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.html', + zip='{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.zip', + script: + wrapper_for('fastqc/wrapper.py') + +rule bwa_index: + input: + fasta=config['fasta'], + output: + fasta_prefix + '.sa', + params: + algorithm='bwtsw', + prefix=fasta_prefix + shell: + 'bwa index ' + '{params.prefix} ' + '{params.algorithm}' + + +rule bwa_alignment: + """ + bwa alignment + """ + input: + sa = fasta_prefix + '.sa', + R1 = 'data/{sample}/{sample}.{genotype}.r1.cutadapt.fastq.gz', + R2 = 'data/{sample}/{sample}.{genotype}.r2.cutadapt.fastq.gz' + output: + bam=temp(c.patterns['bam']) + threads: 6 + params: + index=fasta_prefix + run: + sam=output.bam.replace('.bam', '.sam') + gt = wildcards.genotype.upper() + platform=config['sequencing_platform'] + shell( + "bwa mem " + "-t {threads} " + "-R '@RG\\tID:{gt}" + "\\tPL:{platform}" + "\\tLB:{gt}" + "\\tSM:{gt}' " + "{params.index} " + "{input.R1} " + "{input.R2} " + "> {sam}") + + shell( + 'samtools view -Sb {sam} ' + '| samtools sort - -o {output.bam} -O BAM ' + '&& rm {sam}') + +rule index_bams: + input: + '{prefix}.bam' + output: + '{prefix}.bam.bai' + shell: + 'samtools index {input}' + +rule index_vcfs: + input: + '{prefix}.vcf.gz' + output: + '{prefix}.vcf.gz.tbi' + shell: + 'tabix -p vcf {input}' + + +rule mark_duplicates: + input: + bam=c.patterns['bam'], + 
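+        # (the .bai input is only declared so Snakemake runs index_bams first;
+        # Picard MarkDuplicates itself only needs the coordinate-sorted BAM)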
index=c.patterns['bam'] + '.bai' + output: + bam=temp(c.patterns['markduplicates']['bam']), + metrics=c.patterns['markduplicates']['metrics'] + shell: + 'gatk MarkDuplicates ' + '-I {input.bam} ' + '-O {output.bam} ' + '--REMOVE_DUPLICATES false ' + '--METRICS_FILE {output.metrics} ' + '--CREATE_INDEX true ' + '--VALIDATION_STRINGENCY LENIENT ' + '--java-options "-Xmx8g -Djava.io.tmpdir=$TMPDIR"' + + +rule index_fasta: + input: + config['fasta'] + output: + config['fasta'] + '.fai' + shell: + 'samtools faidx {input}' + + +rule get_fasta_dict: + input: + config['fasta'] + output: + config['fasta'].rstrip('.fa.gz') + '.dict' + shell: + 'picard CreateSequenceDictionary REFERENCE={input} OUTPUT={output}' + + +#rule index_known_snp_files: +# # NOTE: the known snp files need to be bgzipped rather than gzipped for this rule to work +# input: + # '{known_sites_file}' + # output: + # '{known_sites_file}.tbi' + # shell: + # 'gatk IndexFeatureFile -F {input}' + +def get_known_sites_str(): + known_sites_string='' + for filename in config['known_sites']: + known_sites_string += '--known-sites ' + filename + ' ' + return known_sites_string + +rule base_recalibrator_model: + """ + Recalibrating the bases corrects systematic base scoring errors that the + sequencing instrument algorithms may generate in particular regions such as + homopolymer runs. GATK's model uses machine learning and works in two + steps: one to build the model and one to apply it + (apply_base_recalibration) + """ + + input: + fa_index=config['fasta'] + '.fai', + fa_dict=config['fasta'].rstrip('.fa.gz') + '.dict', + bam=c.patterns['markduplicates']['bam'], + fasta=config['fasta'], + known_sites_files=expand('{known_sites_file}', known_sites_file=known_sites_files), + known_sites_index=config['known_sites'][0] + '.tbi' + output: + c.patterns['baserecalibration']['bqsrtable'] + params: + known_sites_string=get_known_sites_str() + shell: + 'gatk BaseRecalibrator ' + '-I {input.bam} ' + '-O {output} ' + '-R {input.fasta} ' + '{params.known_sites_string}' + '--java-options "-Xmx8g -Djava.io.tmpdir=$TMPDIR"' + + +rule apply_base_recalibration: + input: + bam=c.patterns['markduplicates']['bam'], + fasta=config['fasta'], + bqsr_table=c.patterns['baserecalibration']['bqsrtable'] + output: + c.patterns['baserecalibration']['bam'] + run: + shell( + 'gatk ApplyBQSR ' + '-I {input.bam} ' + '-O {output} ' + '-R {input.fasta} ' + '--bqsr-recal-file {input.bqsr_table} ' + '--java-options "-Xmx8g -Djava.io.tmpdir=$TMPDIR"' + ) + # The created index doesn't work with downstream, and will be + # re-created later. 
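+            # (gatk ApplyBQSR writes '<prefix>.bai' next to the BAM, while the
+            # rest of the workflow expects the '<prefix>.bam.bai' produced by
+            # the index_bams rule, so remove it and let samtools index recreate it)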
+ index = output[0].replace('.bam', '.bai') + shell('rm {index}') + + +rule get_pileup_summaries: + """ + This is a precursor step to filtering mutect2 calls later on + """ + input: + bam=c.patterns['baserecalibration']['bam'], + index = c.patterns['baserecalibration']['bam'] + '.bai', + PV=config['population_variant_file'], + PV_index=config['population_variant_file'] + '.tbi', + exome_capture=config['exome_capture'] + output: + c.patterns['pileupsummaries']['bam'] + shell: + 'gatk GetPileupSummaries ' + '-L {input.exome_capture} ' + '-I {input.bam} ' + '-O {output} ' + '--variant {input.PV} ' + '--java-options "-Xmx8g -Djava.io.tmpdir=$TMPDIR"' + +if c.tumoronly: + rule tumoronly_mutect2: + input: + tumor=c.patterns['baserecalibration']['tumor'], + fasta=config["fasta"], + index=c.patterns['baserecalibration']['tumor'] + '.bai', + # PoN=PoN, + exome_capture=config['exome_capture'], + PV=config['population_variant_file'], + PVI=config['population_variant_file'] + '.tbi' + output: + temp(c.patterns['mutect2']['snp']) + run: + shell( + 'gatk Mutect2 ' + '-R {input.fasta} ' + '-I {input.tumor} ' + '--intervals {input.exome_capture} ' + '--independent-mates ' + '-O {output} ' + '--germline-resource {input.PV} ' +# '--panel-of-normals {input.PoN} ' + '--java-options "-Xmx8g -Djava.io.tmpdir=$TMPDIR"' + ) + + rule tumoronly_calculate_contamination: + input: + tumor=c.patterns['pileupsummaries']['tumor'], + output: + c.patterns['mutect2']['contamination'] + shell: + 'gatk CalculateContamination ' + '-I {input.tumor} ' + '-O {output} ' + '--java-options "-Xmx8g -Djava.io.tmpdir=$TMPDIR"' + + +else: + rule samtools_mpileup: + input: + normal=c.patterns['baserecalibration']['normal'], + tumor=c.patterns['baserecalibration']['tumor'], + fasta=config['fasta'] + output: + temp(c.patterns['varscan']['samtoolsmpileup']) + shell: + 'samtools mpileup ' + '-q 1 ' + '-d 5000 ' + '-f {input.fasta} ' + '{input.normal} ' + '{input.tumor} ' + '> {output}' + + + rule varscan: + input: + pileup=c.patterns['varscan']['samtoolsmpileup'] + output: + snp=c.patterns['varscan']['snp'], + indel=c.patterns['varscan']['indel'] + shell: + 'varscan somatic ' + '{input.pileup} ' + 'data/{wildcards.sample}/{wildcards.sample}.varscan ' + '--mpileup 1 ' + '--output-vcf 1 ' + '--min-var-freq 0.10' + + + rule configure_strelka: + input: + normal=c.patterns['baserecalibration']['normal'], + normal_index=c.patterns['baserecalibration']['normal'] + '.bai', + tumor=c.patterns['baserecalibration']['tumor'], + tumor_index=c.patterns['baserecalibration']['tumor'] + '.bai', + fasta=config['fasta'] + output: + c.patterns['strelka']['script'] + conda: + 'envs/strelka.yml' + shell: + 'configureStrelkaSomaticWorkflow.py ' + '--normalBam={input.normal} ' + '--tumorBam={input.tumor} ' + '--referenceFasta={input.fasta} ' + '--runDir data/{wildcards.sample}/' + + + rule run_strelka: + input: + c.patterns['strelka']['script'] + output: + snp=c.patterns['strelka']['snp'], + indel=c.patterns['strelka']['indel'] + conda: + 'envs/strelka.yml' + shell: + '{input} ' + '-m local ' + '&& mv data/{wildcards.sample}/results/variants/somatic.snvs.vcf.gz {output.snp} ' + '&& mv data/{wildcards.sample}/results/variants/somatic.indels.vcf.gz {output.indel}' + + rule get_strelka_passedonly: + input: + snp=c.patterns['strelka']['snp'], + indel=c.patterns['strelka']['indel'] + output: + snp=c.patterns['strelka']['passedonlysnp'], + indel=c.patterns['strelka']['passedonlyindel'], + run: + shell( + 'bcftools view -i "FILTER=\'PASS\'" ' + '{input.snp} ' + '> 
{output.snp}') + shell( + 'bcftools view -i "FILTER=\'PASS\'" ' + '{input.indel} ' + '> {output.indel}') + + + rule somatic_sniper: + input: + normal=c.patterns['baserecalibration']['normal'], + tumor=c.patterns['baserecalibration']['tumor'], + fasta=config['fasta'] + output: + c.patterns['somaticsniper']['snp'] + shell: + 'bam-somaticsniper ' + '-F vcf ' + '-f {input.fasta} ' + '{input.tumor} ' + '{input.normal} ' + '{output}' + + rule mutect2: + input: + tumor=c.patterns['baserecalibration']['tumor'], + tumor_index=c.patterns['baserecalibration']['tumor'] + '.bai', + normal=c.patterns['baserecalibration']['normal'], + normal_index=c.patterns['baserecalibration']['normal'] + '.bai', + fasta=config["fasta"], + # PoN=PoN, + exome_capture=config['exome_capture'], + PV=config['population_variant_file'], + PVI=config['population_variant_file'] + '.tbi' + output: + c.patterns['mutect2']['snp'] + shell: + 'gatk Mutect2 ' + '-R {input.fasta} ' + '-I {input.tumor} ' + '-I {input.normal} ' + '--independent-mates ' + '--intervals {input.exome_capture} ' + '-O {output} ' + '--germline-resource {input.PV} ' + # '--panel-of-normals {input.PoN} ' + '--java-options "-Xmx8g -Djava.io.tmpdir=$TMPDIR"' + + + rule calculate_contamination: + """ + Estimates the proportion of reads originating from other samples + """ + input: + tumor=c.patterns['pileupsummaries']['tumor'], + normal=c.patterns['pileupsummaries']['normal'] + output: + c.patterns['mutect2']['contamination'] + shell: + 'gatk CalculateContamination ' + '-I {input.tumor} ' + '-O {output} ' + '--matched-normal {input.normal} ' + '--java-options "-Xmx8g -Djava.io.tmpdir=$TMPDIR"' + +rule filter_out_contaminants: + """ + Filters out reads estimated to have originated from other samples + """ + input: + vcf=c.patterns['mutect2']['snp'], + contaminants=c.patterns['mutect2']['contamination'], + reference=config['fasta'] + output: + temp(c.patterns['mutect2']['filtered']) + shell: + 'gatk FilterMutectCalls ' + '--variant {input.vcf} ' + '--reference {input.reference} ' + '-O {output} ' + '--contamination-table {input.contaminants} ' + '--java-options "-Xmx8g -Djava.io.tmpdir=$TMPDIR"' + + +rule collect_sequencing_artifact_metrics: + """ + Measures pre-adapter errors to allow for filtering by orientation bias + """ + input: + bam=c.patterns['baserecalibration']['tumor'], + fasta=config['fasta'], + exome_capture=config['exome_capture'] + output: + c.patterns['mutect2']['sequencingmetrics'] + shell: + 'gatk CollectSequencingArtifactMetrics ' + '--INTERVALS {input.exome_capture} ' + '-I {input.bam} ' + '-O data/{wildcards.sample}/{wildcards.sample}.seqartifactmetrics ' + '-R {input.fasta} ' + '--java-options "-Xmx8g -Djava.io.tmpdir=$TMPDIR"' + + +rule filter_by_orientation_bias: + """ + Filter by orientation bias, which catches artifacts formed by chemical + changes that occur in the DNA on a single strand during sample prep. 
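+    Typical examples are 8-oxoguanine (G>T) and FFPE deamination (C>T)
+    artifacts.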
+ """ + input: + vcf=c.patterns['mutect2']['filtered'], + detail_file=c.patterns['mutect2']['sequencingmetrics'] + output: + c.patterns['mutect2']['twicefiltered'] + shell: + 'gatk FilterByOrientationBias ' + '--variant {input.vcf} ' + '--pre-adapter-detail-file {input.detail_file} ' + '-O {output} ' + '--java-options "-Xmx8g -Djava.io.tmpdir=$TMPDIR"' + + +rule get_mutect2_passed_only_variants: + """ + Calls that didn't pass all of the filters will still be in the output vcf, + they will just be flagged with the filters they didn't pass in the + 'FILTER' field, so this rule pulls out all of the calls that passed the + filters + """ + input: + c.patterns['mutect2']['twicefiltered'] + output: + c.patterns['mutect2']['passedonly'] + shell: + 'bcftools view -i "FILTER=\'PASS\'" ' + '{input} ' + '> {output}' + +if c.tumoronly: + rule process_mutect2_calls: + """ + Normalizes and left aligns the mutect2 calls, as well as splits out the + multiallelic calls. This is the tumor only counterpart to the rule + below called 'process_vcfs_for_merging'. The only difference is that + this rule doesn't prepend the caller name to the INFO and FORMAT fields + since the tumor only pipeline only has one caller so there's no caller + merging to be done. + """ + input: + vcf=c.patterns['mutect2']['passedonly'], + fasta=config['fasta'] + output: + c.patterns['mutect2']['processed'] + shell: + 'bcftools norm ' + '-f {input.fasta} ' + '--multiallelics -both ' + '-o {output} ' + '{input}' + + +else: + +#NOTE: Commented out for now to see if these steps are still necessary when +#using bcftools instead of combine variants + +# rule adjust_varscan_heading: +# """ +# the varscan and somatic sniper vcfs have a common header line in slightly +# different formats which is messing with the merging, so this rule creates +# a varscan vcf with that header line replaced with the somatic sniper version. +# """ +# input: +# varscan=c.patterns['varscan']['snp'], +# somaticsniper=c.patterns['somaticsniper']['snp'] +# output: +# temp(c.patterns['varscan']['headingadjust']) +# shell: +# 'DP4_line=$(grep ' +# '\"FORMAT= {output} ' + +# rule remove_ambiguous_calls: +# """ +# varscan and somatic sniper will include ambiguous calls (e.g. ref +# column entry is 'W'). 
This rule removes those calls, as they cause +# issues in the 'combine_variants' rule +# """ +# input: +# varscan=c.patterns['varscan']['headingadjust'], +# somaticsniper=c.patterns['somaticsniper']['snp'] +# output: +# varscan=c.patterns['varscan']['ambigremoved'], +# somaticsniper=c.patterns['somaticsniper']['ambigremoved'] +# shell: +# 'awk "\$1 ~ /^#/ {{print \$0;next}} ' +# '{{if (\$4 ~ /A|C|T|G/ ' +# '&& \$5 ~ /A|C|T|G/) ' +# 'print \$0}}" ' +# '{input.varscan} ' +# '> {output.varscan} ' +# '&& awk "\$1 ~ /^#/ {{print \$0;next}} ' +# '{{if (\$4 ~ /A|C|T|G/ ' +# '&& \$5 ~ /A|C|T|G/) ' +# 'print \$0}}" ' +# '{input.somaticsniper} ' +# '> {output.somaticsniper} ' + + + rule bgzip_vcfs: + input: + mutect2=c.patterns['mutect2']['passedonly'], + varscan=c.patterns['varscan']['ambigremoved'], + strelka=c.patterns['strelka']['passedonlysnp'], + somaticsniper= c.patterns['somaticsniper']['ambigremoved'] + output: + mutect2=c.patterns['mutect2']['bgzipped'], + varscan=c.patterns['varscan']['bgzipped'], + strelka=c.patterns['strelka']['bgzipped'], + somaticsniper= c.patterns['somaticsniper']['bgzipped'], + shell: + 'bgzip {input.mutect2} ' + '&& bgzip {input.varscan} ' + '&& bgzip {input.strelka} ' + '&& bgzip {input.somaticsniper} ' + + rule process_vcfs_for_merging: + """ + Normalizes and left aligns the calls from strelka, varscan, and + somaticsniper (which are only run in non tumor-only mode), as well as + splits out the multiallelic calls. Also prepends the caller name to the + INFO and FORMAT fields for each vcf to prepare for merging. + + NOTE: This might be better off in its own script, so I copied it into + a script called 'scripts/process_vcfs_for_merging.sh' which is all + ready to go in case that ends up being the better option. + """ + input: + varscan=c.patterns['varscan']['bgzipped'], + varscan_index=c.patterns['varscan']['bgzipped'] + '.tbi', + strelka=c.patterns['strelka']['bgzipped'], + strelka_index=c.patterns['strelka']['bgzipped'] + '.tbi', + somaticsniper=c.patterns['somaticsniper']['bgzipped'], + somaticsniper_index=c.patterns['somaticsniper']['bgzipped'] + '.tbi', + mutect2=c.patterns['mutect2']['bgzipped'], + mutect2_index= c.patterns['mutect2']['bgzipped'] + '.tbi', + fasta=config['fasta'] + output: + varscan=c.patterns['varscan']['processed'], + strelka=c.patterns['strelka']['processed'], + somaticsniper=c.patterns['somaticsniper']['processed'], + mutect2=c.patterns['mutect2']['processed'] + run: + unzipped_varscan = output.varscan.rstrip('.gz') + unzipped_strelka = output.strelka.rstrip('.gz') + unzipped_somaticsniper = output.somaticsniper.rstrip('.gz') + unzipped_mutect2 = output.mutect2.rstrip('.gz') + shell( + # Normalize and prepend INFO and FORMAT fields for Varscan VCF + 'bcftools norm ' + '-f {input.fasta} ' + '--multiallelics -both ' + '{input.varscan} ' + '| sed "s/INFO\tFORMAT\tNORMAL\tTUMOR' + '/VARSCAN_INFO\tVARSCAN_FORMAT\tVARSCAN_NORMAL\tVARSCAN_TUMOR/g" ' + # '| sed "s/FORMAT/VARSCAN_FORMAT/g" ' + '> {unzipped_varscan} ' + '&& bgzip {unzipped_varscan}' + # Normalize and prepend INFO and FORMAT fields for Strelka VCF + '&& bcftools norm ' + '-f {input.fasta} ' + '--multiallelics -both ' + '{input.strelka} ' + '| sed "s/INFO\tFORMAT\tNORMAL\tTUMOR' + '/STRELKA_INFO\tSTRELKA_FORMAT\tSTRELKA_NORMAL\tSTRELKA_TUMOR/g" ' + # '| sed "s/FORMAT/STRELKA_INFO/g" ' + '> {unzipped_strelka} ' + '&& bgzip {unzipped_strelka}' + # Normalize and prepend INFO and FORMAT fields SomaticSniper VCF + '&& bcftools norm ' + '-f {input.fasta} ' + '--multiallelics -both ' + 
'{input.somaticsniper} ' + '| sed "s/INFO\tFORMAT\tNORMAL\tTUMOR' + '/SOMATICSNIPER_INFO\tSOMATICSNIPER_FORMAT\tSOMATICSNIPER_NORMAL\tSOMATICSNIPER_TUMOR/g" ' + # '| sed "s/FORMAT/SOMATICSNIPER_FORMAT/g" ' + '> {unzipped_somaticsniper}' + '&& bgzip {unzipped_somaticsniper}' + # Normalize and prepend INFO and FORMAT fields Mutect2 VCF + '&& bcftools norm ' + '-f {input.fasta} ' + '--multiallelics -both ' + '{input.mutect2} ' + '| bcftools annotate -x FORMAT/OBAM,FORMAT/OBAMRC' + '| sed "s/INFO\tFORMAT\tNORMAL\tTUMOR' + '/MUTECT2_INFO\tMUTECT2_FORMAT\tMUTECT2_NORMAL\tMUTECT2_TUMOR/g" ' + # '| sed "s/FORMAT/MUTECT2_FORMAT/g" ' + '> {unzipped_mutect2}' + '&& bgzip {unzipped_mutect2}' + ) + + + + +# rule combine_variants: +# input: +# mutect2=c.patterns['mutect2']['processed'], +# varscan=c.patterns['varscan']['processed'], +# strelka=c.patterns['strelka']['processed'], +# somaticsniper= c.patterns['strelka']['processed'], +# fasta=config['fasta'] +# output: +# c.patterns['combinecallers']['gatk'] +# conda: +# 'envs/gatk3.yml' +# shell: +# 'GenomeAnalysisTK ' +# '-Xmx4g ' +# '-T CombineVariants ' +# '-R {input.fasta} ' +# '--variant:mutect2 {input.mutect2} ' +# '--variant:varscan {input.varscan} ' +# '--variant:strelka {input.strelka} ' +# '--variant:somaticsniper {input.somaticsniper} ' +# '-o {output} ' +# '-genotypeMergeOptions PRIORITIZE ' +# '-priority mutect2,varscan,strelka,somaticsniper' +# '|| gatk3-register GenomeAnalysisTK-3.8-0-ge9d806836.tar.bz2 ' +# '&& GenomeAnalysisTK ' +# '-Xmx4g ' +# '-T CombineVariants ' +# '-R {input.fasta} ' +# '--variant:mutect2 {input.mutect2} ' +# '--variant:varscan {input.varscan} ' +# '--variant:strelka {input.strelka} ' +# '--variant:somaticsniper {input.somaticsniper} ' +# '-o {output} ' +# '-genotypeMergeOptions PRIORITIZE ' +# '-priority mutect2,varscan,strelka,somaticsniper' +# +# rule vcftools_merge: +# input: +# mutect2=c.patterns['mutect2']['processed'], +# varscan=c.patterns['varscan']['processed'], +# strelka=c.patterns['strelka']['processed'], +# somaticsniper= c.patterns['strelka']['processed'] +# output: +# c.patterns['combinecallers']['vcftools'] +# shell: +# 'vcftools merge ' +# '{input} ' +# '> {output}' + + rule bcftools_merge: + """ + Merge the vcfs from all four callers, keeping the duplicate genotype + columns (each will be prepended with an index number) and maintaining + split multiallelics. + """ + input: + mutect2=c.patterns['mutect2']['processed'], + mutect2_index=c.patterns['mutect2']['processed'] + '.tbi', + varscan=c.patterns['varscan']['processed'], + varscan_index=c.patterns['varscan']['processed'] + '.tbi', + strelka=c.patterns['strelka']['processed'], + strelka_index=c.patterns['strelka']['processed'] + '.tbi', + somaticsniper= c.patterns['somaticsniper']['processed'], + somaticsniper_index= c.patterns['somaticsniper']['processed'] + '.tbi' + output: + c.patterns['combinecallers']['bcftools'] + shell: + 'bcftools merge ' +# '--force-samples ' + '-m none ' + '-o {output} ' + '{input.mutect2} ' + '{input.varscan} ' + '{input.strelka} ' + '{input.somaticsniper} ' + + rule filter_out_singlecalled: + "Filters out calls made by only one variant caller" + input: + c.patterns['combinecallers']['bcftools'] + output: + c.patterns['nosinglecalled'] + #script: + # 'scripts/filter-out-singlecalled.py' + run: + vcf_reader = VCF(input[0]) + vcf_writer = Writer(output[0], vcf_reader) + for record in vcf_reader: + # the 'genotypes' attribute for a cyverse record lists '-1' for the + # alleles if the record was not called. 
This can help up filter out + # records that were called by only one caller. + try: + record.genotypes + # Strelka doesn't output a genotype field, so for cases where + # only Strelka called a variant, cyVCF will complain that it + # can't find a genotype for it. These calls should get filtered + # out anyway since they've only been called by one caller. + except Exception: + continue + if len([x for x in record.genotypes if x[:2] != [-1, -1]]) > 2: + vcf_writer.write_record(record) + vcf_writer.close() + + +rule snpeff_annotation: + """ + Add gene name and id and structural/functional predictions to the vcf + """ + input: + vcf=lambda wildcards: (c.patterns['mutect2']['processed'] + if c.tumoronly + else c.patterns['nosinglecalled']) + output: + stats=c.patterns['snpeff']['stats'], + vcf=c.patterns['snpeff']['vcf'] + params: + genome=config['genome'] + shell: + 'snpEff -Xmx4g ' + '-v ' + '-stats {output.stats} ' + '{params.genome} ' + '{input} ' + ' > {output.vcf}' + +rule snpsift_dbnsfp_data: + input: + dbNSFP=config['dbNSFP_file'], + dbNSFP_index=config['dbNSFP_file'], + vcf=c.patterns['snpeff']['vcf'] + output: + c.patterns['dbnsfp'] + shell: + 'SnpSift dbnsfp ' + '-db {input.dbNSFP} ' + '-f 1000Gp1_AF,' + 'clinvar_rs,' + 'ExAC_AF,' + 'clinvar_clnsig ' + '{input.vcf} ' + '> {output}' + + +rule pull_LOH_calls: + """ + Varscan and Somatic Sniper call LOH (loss of heterozygosity) in the 'SS' + entry of their info fields. The SS entries correspond to the following + types of calls: 0=reference 1=germline 2=somatic 3=LOH. This rule pulls out + all the LOH calls into their own file + """ + input: + c.patterns['dbnsfp'] + output: + c.patterns['loh'] + shell: + 'grep -E "(#|SS=3)" {input} > {output}' + + +rule get_bed_from_gtf: + input: + c.refdict[c.organism][config['gtf']['tag']]['gtf'] + output: + 'bed_from_gtf.bed' + shell: + 'grep -w "gene" {input} | cut -f 1,4,5 > {output}' + + +rule pull_somatic_calls: + "pulls out calls deemed 'somatic' by the callers" + input: + c.patterns['dbnsfp'] + output: + c.patterns['somaticonly'] + shell: + 'grep -E "(#|SOMATIC)" {input} > {output}' + +rule intersect_gtf_bed_and_vcf: + input: + gtf_bed='bed_from_gtf.bed', + vcf=c.patterns['dbnsfp'], + somaticonly_vcf=c.patterns['somaticonly'] + output: + bed=c.patterns['genedist']['bed'], + somaticonly_bed =c.patterns['genedist']['somaticonlybed'] + shell: + 'bedtools intersect -c -a {input.gtf_bed} -b {input.vcf} > {output.bed}' + '&& bedtools intersect -c -a {input.gtf_bed} -b {input.somaticonly_vcf} > {output.somaticonly_bed}' + +rule graph_variant_distribution_across_genes: + input: + c.patterns['genedist']['bed'], + c.patterns['genedist']['somaticonlybed'] + output: + c.patterns['genedist']['histogram'], + c.patterns['genedist']['somaticonlyhistogram'] + script: + 'scripts/graph_snp_dist.py' + + + diff --git a/workflows/WES-somatic/config/WES_patterns.yaml b/workflows/WES-somatic/config/WES_patterns.yaml new file mode 100644 index 00000000..77917158 --- /dev/null +++ b/workflows/WES-somatic/config/WES_patterns.yaml @@ -0,0 +1,77 @@ +fastq: 'data/{sample}/{sample}.{genotype}.r{n}.fastq.gz' +cutadapt: 'data/{sample}/{sample}.{genotype}.r{n}.cutadapt.fastq.gz' +bam: 'data/{sample}/{sample}.{genotype}.cutadapt.bam' +fastqc: + raw: 'data/{sample}/fastqc/{sample}.{genotype}.r1.fastq.gz_fastqc.zip' + cutadapt: 'data/{sample}/fastqc/{sample}.{genotype}.r1.cutadapt.fastq.gz_fastqc.zip' + bam: 'data/{sample}/fastqc/{sample}.{genotype}.cutadapt.bam_fastqc.zip' +#fastq_screen: 
'data/rnaseq_samples/{sample}/{sample}.cutadapt.screen.txt' +#featurecounts: 'data/rnaseq_aggregation/featurecounts.txt' +#multiqc: 'data/rnaseq_aggregation/multiqc.html' +markduplicates: + bam: 'data/{sample}/{sample}.{genotype}.cutadapt.markdups.bam' + metrics: 'data/{sample}/{sample}.{genotype}.cutadapt.markdups.bam.metrics' +baserecalibration: + bqsrtable: 'data/{sample}/{sample}.{genotype}.bqsr.table' + bam: 'data/{sample}/{sample}.{genotype}.cutadapt.markdups.recal.bam' + tumor: 'data/{sample}/{sample}.tumor.cutadapt.markdups.recal.bam' + normal: 'data/{sample}/{sample}.normal.cutadapt.markdups.recal.bam' +pileupsummaries: + bam: 'data/{sample}/{sample}.{genotype}.pileups.table' + tumor: 'data/{sample}/{sample}.tumor.pileups.table' + normal: 'data/{sample}/{sample}.normal.pileups.table' +varscan: + samtoolsmpileup: 'data/varscan/{sample}/{sample}.pileup' + snp: 'data/{sample}/{sample}.varscan.snp.vcf' + indel: 'data/{sample}/{sample}.varscan.indel.vcf' + headingadjust: 'data/{sample}/{sample}.varscan.heading_adj.snp.vcf' + ambigremoved: 'data/{sample}/{sample}.varscan.heading_adj.ambig_removed.vcf' + bgzipped: 'data/{sample}/{sample}.varscan.heading_adj.ambig_removed.vcf.gz' + processed: 'data/{sample}/{sample}.varscan.heading_adj.ambig_removed.processed.vcf.gz' + +strelka: + script: 'data/{sample}/runWorkflow.py' + snp: 'data/{sample}/{sample}.strelka.snp.vcf' + indel: 'data/{sample}/{sample}.strelka.indel.vcf' + passedonlysnp: 'data/{sample}/{sample}.strelka.passedonly.snp.vcf' + passedonlyindel: 'data/{sample}/{sample}.strelka.passedonly.indel.vcf' + bgzipped: 'data/{sample}/{sample}.strelka.passedonly.snp.vcf.gz' + processed: 'data/{sample}/{sample}.strelka.passedonly.processed.snp.vcf.gz' +somaticsniper: + snp: 'data/{sample}/{sample}.somaticsniper.vcf' + ambigremoved: 'data/{sample}/{sample}.somaticsniper.ambig_removed.vcf' + bgzipped: 'data/{sample}/{sample}.somaticsniper.ambig_removed.vcf.gz' + processed: 'data/{sample}/{sample}.somaticsniper.ambig_removed.processed.vcf.gz' +mutect2: + snp: 'data/{sample}/{sample}.mutect2.vcf' + contamination: 'data/{sample}/{sample}.contamination.table' + filtered: 'data/{sample}/{sample}.mutect2.oncefiltered.vcf' + sequencingmetrics: 'data/{sample}/{sample}.seqartifactmetrics.pre_adapter_detail_metrics' + twicefiltered: 'data/{sample}/{sample}.mutect2.twicefiltered.vcf' + passedonly: 'data/{sample}/{sample}.mutect2.passedonly.vcf' + bgzipped: 'data/{sample}/{sample}.mutect2.passedonly.vcf.gz' + processed: 'data/{sample}/{sample}.mutect2.passedonly.processed.vcf.gz' +combinecallers: + gatk: 'data/{sample}/{sample}.gatk-combine-variants.vcf.gz' + vcftools: 'data/{sample}/{sample}.vcftools-merged-callers.vcf' + bcftools: 'data/{sample}/{sample}.bcftools-merged-callers.vcf' +nosinglecalled: 'data/{sample}/{sample}.no_singlecalled.vcf' +snpeff: + stats: 'data/{sample}/{sample}.snpeff_stats.html' + vcf: 'data/{sample}/{sample}.snpeff.vcf' +dbnsfp: 'data/{sample}/{sample}.snpeff.dbnsfp.vcf' +loh: 'data/{sample}/{sample}.snpeff.dbnsfp.LOH_only.vcf' +intersect: 'data/{sample}/{sample}.snpeff.dbnsfp.intersect_only.vcf' +somaticonly: 'data/{sample}/{sample}.snpeff.dbnsfp.somaticonly.vcf' +genedist: + bed: 'data/{sample}/{sample}.variants-per-gene.bed' + somaticonlybed: 'data/{sample}/{sample}.somatic-variants-per-gene.bed' + histogram: 'data/{sample}/{sample}.histogram_of_variant-gene_distribution.png' + somaticonlyhistogram: 'data/{sample}/{sample}.histogram_of_somatic_variant-gene_distribution.png' + + + + + + + diff --git 
a/workflows/WES-somatic/config/config.yaml b/workflows/WES-somatic/config/config.yaml new file mode 100644 index 00000000..63432094 --- /dev/null +++ b/workflows/WES-somatic/config/config.yaml @@ -0,0 +1,29 @@ +sampletable: 'config/sampletable.tsv' + +organism: 'human' + +references_dir: 'references_data' + +gtf: + tag: 'gencode-v28' + +genome: 'hg38' + +fasta: 'hg38.fa' + +# Vcf of known variants. Can list multiple files if needed +known_sites: + - 'common_all_20170710.vcf.gz' + +panel_of_normals: 'None' + +exome_capture: 'S07604514_Covered.interval_list' + +population_variant_file: 'somatic-hg38_af-only-gnomad.hg38.vcf.gz' + +dbNSFP_file: 'dbNSFP2.9.txt.gz' + +sequencing_platform: 'Illumina' + +include_references: + - '../../include/reference_configs/Homo_sapiens.yaml' diff --git a/workflows/WES-somatic/config/sampletable.tsv b/workflows/WES-somatic/config/sampletable.tsv new file mode 100644 index 00000000..d39f430d --- /dev/null +++ b/workflows/WES-somatic/config/sampletable.tsv @@ -0,0 +1,6 @@ +sample_id layout tumor_filename normal_filename tumor_R2_filename normal_R2_filename +A368LT2a PE /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT2a/A368LT2a_FKDN190645653-1A_HWGTCCCXY_L1_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_2.fq.gz +A368LT3a PE /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368LT3a/A368LT3a_FKDN190645654-1A_HWGTCCCXY_L1_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_2.fq.gz +A368RT22a PE /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT22a/A368RT22a_FKDN190645656-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_2.fq.gz +A368RT3a PE /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_1.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_2/RawData/A368RT3a/A368RT3a_FKDN190645655-1A_lanes_catted_2.fq.gz /data/NICHD-core0/data/stratakis/pmah/release_noclean_1/RawData/A368GER/A368GER_FKDN190638163-1A_lanes_catted_2.fq.gz + diff --git a/workflows/WES-somatic/scripts/filter-out-singlecalled.py b/workflows/WES-somatic/scripts/filter-out-singlecalled.py new file mode 100644 index 00000000..79de9848 --- /dev/null +++ b/workflows/WES-somatic/scripts/filter-out-singlecalled.py @@ -0,0 +1,33 @@ + + +def filter_out_singlecalled(vcf, samplename, outfilename): + with open(outfilename, 'w') as fout: + for line in open(vcf): + if line[0] == '#': + fout.write(line) + else: + isec = ( + line.strip() + .split('\t') + 
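+                    # (field 7 of a VCF record is the INFO column; the 'set='
+                    # key is added by GATK CombineVariants to record which
+                    # callers made the call, e.g. 'Intersection' or a
+                    # '-'-joined list of caller names)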
[7].split('set=')
+                    [1].split(';')[0]
+                )
+                if '-' in isec or 'Intersection' in isec:
+                    fout.write(line)
+
+filter_out_singlecalled(snakemake.input[0], snakemake.wildcards.sample, snakemake.output[0])
+
+#if __name__ == '__main__':
+#    import argparse
+#    ap= argparse.ArgumentParser()
+#    ap.add_argument('vcf'),
+#    ap.add_argument('samplename')
+#    args=ap.parse_args()
+#    filter_out_singlecalled(
+#        args.vcf,
+#        args.samplename
+#    )
+
+
+
+
diff --git a/workflows/WES-somatic/scripts/graph_snp_dist.py b/workflows/WES-somatic/scripts/graph_snp_dist.py
new file mode 100644
index 00000000..fa613a72
--- /dev/null
+++ b/workflows/WES-somatic/scripts/graph_snp_dist.py
@@ -0,0 +1,30 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+
+def graph_snp_dist(gtf_vcf_bed, somaticonly_gtf_vcf_bed, sample, outfilename, somaticonly_outfilename):
+    """
+    Make a histogram of the distribution of variant counts across genes. Takes
+    two BED files of per-gene counts produced by 'bedtools intersect -c' (one
+    for all calls, one restricted to calls marked 'SOMATIC' by the callers),
+    both built against the genes-only BED file derived from the GTF.
+    """
+    xlabel = 'Number of SNPs in Gene'
+    ylabel = 'Genes'
+    # the bedtools output has no header line, so read it positionally
+    df = pd.read_csv(gtf_vcf_bed, sep='\t', header=None)
+    plt.hist(df.iloc[:, 3], 50)
+    plt.title(sample + ' Distribution of SNPs across Genes')
+    plt.xlabel(xlabel)
+    plt.ylabel(ylabel)
+    plt.savefig(outfilename)
+    plt.close()
+    df = pd.read_csv(somaticonly_gtf_vcf_bed, sep='\t', header=None)
+    plt.hist(df.iloc[:, 3], 50)
+    plt.title(sample + ' Distribution of Somatic SNPs across Genes')
+    plt.xlabel(xlabel)
+    plt.ylabel(ylabel)
+    plt.savefig(somaticonly_outfilename)
+    plt.close()
+
+
+graph_snp_dist(snakemake.input[0], snakemake.input[1], snakemake.wildcards.sample, snakemake.output[0], snakemake.output[1])
diff --git a/workflows/WES-somatic/scripts/process_vcfs_for_merging.sh b/workflows/WES-somatic/scripts/process_vcfs_for_merging.sh
new file mode 100644
index 00000000..eec881e3
--- /dev/null
+++ b/workflows/WES-somatic/scripts/process_vcfs_for_merging.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+# Takes the VCF output from the four somatic variant callers (varscan,
+# strelka, somaticsniper, mutect2), normalizes and left-aligns the calls,
+# splits out multiallelic calls, and prepends the caller name to the INFO and
+# FORMAT header columns so the VCFs can later be merged.
+#
+# NOTE: Snakemake's 'script:' directive cannot pass the snakemake object to a
+# bash script, so this sketch assumes a positional-argument convention
+# mirroring the rule's input/output order (call it from a 'shell:' directive):
+#   process_vcfs_for_merging.sh varscan.vcf.gz strelka.vcf.gz \
+#       somaticsniper.vcf.gz mutect2.vcf.gz reference.fa \
+#       varscan.out.vcf strelka.out.vcf somaticsniper.out.vcf mutect2.out.vcf
+
+set -euo pipefail
+
+varscan=$1; strelka=$2; somaticsniper=$3; mutect2=$4
+fasta=$5
+varscan_out=$6; strelka_out=$7; somaticsniper_out=$8; mutect2_out=$9
+
+# Normalize and prepend FORMAT and INFO fields for Varscan VCF
+bcftools norm -f "$fasta" --multiallelics -both "$varscan" \
+    | sed "s/INFO\tFORMAT/VARSCAN_INFO\tVARSCAN_FORMAT/g" \
+    > "$varscan_out"
+
+# Normalize and prepend FORMAT and INFO fields for Strelka VCF
+bcftools norm -f "$fasta" --multiallelics -both "$strelka" \
+    | sed "s/INFO\tFORMAT/STRELKA_INFO\tSTRELKA_FORMAT/g" \
+    > "$strelka_out"
+
+# Normalize and prepend FORMAT and INFO fields for SomaticSniper VCF
+bcftools norm -f "$fasta" --multiallelics -both "$somaticsniper" \
+    | sed "s/INFO\tFORMAT/SOMATICSNIPER_INFO\tSOMATICSNIPER_FORMAT/g" \
+    > "$somaticsniper_out"
+
+# Normalize and prepend FORMAT and INFO fields for Mutect2 VCF
+bcftools norm -f "$fasta" --multiallelics -both "$mutect2" \
+    | sed "s/INFO\tFORMAT/MUTECT2_INFO\tMUTECT2_FORMAT/g" \
+    > "$mutect2_out"
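As a quick illustration of how is_tumor_only() drives the tumor-only vs. tumor/normal branching in WESConfig, a minimal check against a toy sampletable might look like the sketch below (run from the repository root so lib is importable; the filenames are made up, since only the presence or absence of the 'normal_filename' column matters):

import pandas as pd
from lib import common

# Paired tumor/normal: a 'normal_filename' column is present, so
# is_tumor_only() returns False, WESConfig uses genotype=['tumor', 'normal'],
# and the varscan/strelka/somaticsniper/mutect2 branch of the Snakefile runs.
paired = pd.DataFrame({
    'sample_id': ['sample1'],
    'tumor_filename': ['sample1.tumor.r1.fastq.gz'],
    'normal_filename': ['sample1.normal.r1.fastq.gz'],
})
assert not common.is_tumor_only(paired)

# Tumor-only: dropping 'normal_filename' flips the result, so WESConfig sets
# genotype=['tumor'] and only the tumor-only Mutect2 rules are used.
tumor_only = paired.drop(columns=['normal_filename'])
assert common.is_tumor_only(tumor_only)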