diff --git a/.circleci/config.yml b/.circleci/config.yml index da38e059..3f9de2da 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -152,6 +152,7 @@ variables: cp $ORIG/workflows/rnaseq/run_downstream_test.sh $DEPLOY/workflows/rnaseq/run_downstream_test.sh cp $ORIG/workflows/references/run_test.sh $DEPLOY/workflows/references/run_test.sh cp $ORIG/workflows/colocalization/run_test.sh $DEPLOY/workflows/colocalization/run_test.sh + cp $ORIG/workflows/variant-calling/run_test.sh $DEPLOY/workflows/variant-calling/run_test.sh mkdir $DEPLOY/ci mkdir $DEPLOY/test @@ -239,8 +240,14 @@ variables: command: | source /opt/mambaforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV + # RNA-seq/ChIP-seq references $DEPLOY/test/lcdb-wf-test references --run-workflow --configfile=config/config.yaml -j2 -p -r -k --orig $ORIG + # Variant-calling references + $DEPLOY/test/lcdb-wf-test references --run-workflow --configfile=$ORIG/test/test_configs/variant-calling.yaml -j2 -p -r -k --orig $ORIG + + + # -------------------------------------------------------------------------- # Standard RNA-seq workflow rnaseq-step: &rnaseq-step @@ -303,6 +310,21 @@ variables: conda activate $LCDBWF_ENV $DEPLOY/test/lcdb-wf-test colocalization --run-workflow -k -r -p -j2 --use-conda --orig $ORIG + + # -------------------------------------------------------------------------- + # Variant-calling workflow + variantcalling-step: &variantcalling-step + run: + name: variantcalling workflow + command: | + cd $DEPLOY + source /opt/mambaforge/etc/profile.d/conda.sh + conda activate $LCDBWF_ENV + $DEPLOY/test/lcdb-wf-test variantcalling --run-workflow -n + $DEPLOY/test/lcdb-wf-test variantcalling --run-workflow --use-conda -j2 + + tar -zcf /tmp/variantcalling.tar.gz workflows/variant-calling/results + # -------------------------------------------------------------------------- # Syntax note: All of the steps above, with their "&step-name" labels, can be # referred to by a corresponding "*step-name" below. 
The "<<: *defaults" @@ -401,6 +423,18 @@ jobs: destination: gene-patterns.html + variantcalling: + <<: *defaults + steps: + - checkout + - *restore_cache + - *set-path + - *get-data + - *variantcalling-step + - store_artifacts: + path: /tmp/variantcalling.tar.gz + destination: variantcalling.tar.gz + rnaseq-misc: <<: *defaults steps: @@ -501,13 +535,16 @@ workflows: - initial-setup - pytest - references: - requires: - - initial-setup - - pytest + requires: + - initial-setup + - pytest - colocalization: requires: - initial-setup - pytest + - variantcalling: + requires: + - initial-setup - build-docs: requires: - initial-setup @@ -517,5 +554,6 @@ workflows: - rnaseq-misc - chipseq - chipseq-misc - - references + # - references - colocalization + - variantcalling diff --git a/deploy.py b/deploy.py index 7ad7e1ac..f19f08ec 100755 --- a/deploy.py +++ b/deploy.py @@ -93,6 +93,10 @@ def write_include_file(source, flavor='all'): 'recursive-include workflows/chipseq/config *', 'include workflows/chipseq/chipseq_trackhub.py', ], + 'variant-calling': [ + 'include workflows/variant-calling/Snakefile', + 'recursive-include workflows/variant-calling/config *', + ], 'all': [ 'recursive-include wrappers *', 'recursive-include include *', @@ -110,6 +114,7 @@ def write_include_file(source, flavor='all'): 'recursive-include workflows/external *', ] + } patterns = [] @@ -117,6 +122,8 @@ def write_include_file(source, flavor='all'): patterns.extend(PATTERN_DICT['rnaseq']) if flavor in ('full', 'chipseq'): patterns.extend(PATTERN_DICT['chipseq']) + if flavor in ('full', 'variant-calling'): + patterns.extend(PATTERN_DICT['variant-calling']) if flavor == 'full': patterns.extend(PATTERN_DICT['full']) patterns.extend(PATTERN_DICT['all']) @@ -328,7 +335,7 @@ def build_envs(dest, conda_frontend="mamba"): ap.add_argument( "--flavor", default="full", - help="""Options are {0}. Default is full.""".format(['full', 'rnaseq', 'chipseq']), + help="""Options are {0}. 
Default is full.""".format(['full', 'rnaseq', 'chipseq', 'variant-calling']), ) ap.add_argument( "--dest", help="""Destination directory in which to copy files""", required=True diff --git a/docs/toc.rst b/docs/toc.rst index 1180c8cd..0a1cbb5f 100644 --- a/docs/toc.rst +++ b/docs/toc.rst @@ -13,6 +13,7 @@ Table of Contents rnaseq downstream-rnaseq chipseq + variant-calling integrative conda tests diff --git a/env.yml b/env.yml index 5b656720..43ecfd29 100644 --- a/env.yml +++ b/env.yml @@ -5,7 +5,7 @@ dependencies: - _libgcc_mutex=0.1 - _openmp_mutex=4.5 - _r-mutex=1.0.1 - - alsa-lib=1.2.3.2 + - alsa-lib=1.2.8 - amply=0.1.5 - appdirs=1.4.4 - argcomplete=3.0.8 @@ -16,20 +16,20 @@ dependencies: - backcall=0.2.0 - backports=1.0 - backports.functools_lru_cache=1.6.4 + - bcftools=1.17 - bedtools=2.31.0 - - binutils_impl_linux-64=2.39 - - binutils_linux-64=2.39 + - binutils_impl_linux-64=2.40 - biopython=1.81 - - boost-cpp=1.74.0 + - boost-cpp=1.78.0 - bowtie=1.3.1 - bowtie2=2.5.1 - brotli=1.0.9 - brotli-bin=1.0.9 - - brotlipy=0.7.0 + - bwa=0.7.17 - bwidget=1.9.14 - bx-python=0.9.0 - bzip2=1.0.8 - - c-ares=1.18.1 + - c-ares=1.19.1 - ca-certificates=2023.5.7 - cairo=1.16.0 - certifi=2023.5.7 @@ -48,8 +48,7 @@ dependencies: - configargparse=1.5.3 - connection_pool=0.0.3 - contourpy=1.0.7 - - cryptography=39.0.0 - - curl=7.86.0 + - curl=7.87.0 - cutadapt=4.4 - cycler=0.11.0 - datrie=0.8.2 @@ -59,7 +58,7 @@ dependencies: - deeptoolsintervals=0.1.9 - dnaio=0.10.0 - docutils=0.20.1 - - dpath=2.1.5 + - dpath=2.1.6 - epic2=0.0.52 - exceptiongroup=1.1.1 - execnet=1.9.0 @@ -81,31 +80,30 @@ dependencies: - fribidi=1.0.10 - future=0.18.3 - gat=1.3.6 - - gcc_impl_linux-64=10.4.0 - - gcc_linux-64=10.4.0 + - gatk4=4.4.0.0 + - gcc_impl_linux-64=12.2.0 - gettext=0.21.1 - gffread=0.12.7 - gffutils=0.11.1 - - gfortran_impl_linux-64=10.4.0 - - gfortran_linux-64=10.4.0 + - gfortran_impl_linux-64=12.2.0 - giflib=5.2.1 - gitdb=4.0.10 - gitpython=3.1.31 - - glib=2.74.1 - - glib-tools=2.74.1 + - glib=2.76.3 + - glib-tools=2.76.3 - gmp=6.2.1 - graphite2=1.3.13 - gsl=2.7 - - gst-plugins-base=1.18.5 - - gstreamer=1.20.3 - - gxx_impl_linux-64=10.4.0 - - gxx_linux-64=10.4.0 - - harfbuzz=4.2.0 + - gst-plugins-base=1.21.3 + - gstreamer=1.21.3 + - gstreamer-orc=0.4.33 + - gxx_impl_linux-64=12.2.0 + - harfbuzz=6.0.0 - hdf5=1.12.1 - hisat2=2.2.1 - - htslib=1.16 + - htslib=1.17 - humanfriendly=10.0 - - icu=69.1 + - icu=70.1 - idna=3.4 - importlib-metadata=6.6.0 - importlib_resources=5.12.0 @@ -113,7 +111,7 @@ dependencies: - intervalstats=1.01 - ipython=8.13.2 - isa-l=2.30.0 - - jack=1.9.18 + - jack=1.9.22 - jedi=0.18.2 - jinja2=3.1.2 - jpeg=9e @@ -123,19 +121,21 @@ dependencies: - kernel-headers_linux-64=2.6.32 - keyutils=1.6.1 - kiwisolver=1.4.4 - - krb5=1.19.3 + - krb5=1.20.1 + - lame=3.100 - lcms2=2.14 - - ld_impl_linux-64=2.39 + - ld_impl_linux-64=2.40 - lerc=4.0.0 - libblas=3.9.0 - libbrotlicommon=1.0.9 - libbrotlidec=1.0.9 - libbrotlienc=1.0.9 - - libcap=2.64 + - libcap=2.66 - libcblas=3.9.0 - - libclang=13.0.1 + - libclang=15.0.7 + - libclang13=15.0.7 - libcups=2.3.3 - - libcurl=7.86.0 + - libcurl=7.87.0 - libdb=6.2.32 - libdeflate=1.13 - libedit=3.1.20191231 @@ -143,33 +143,36 @@ dependencies: - libevent=2.1.10 - libexpat=2.5.0 - libffi=3.4.2 - - libflac=1.3.4 - - libgcc-devel_linux-64=10.4.0 + - libflac=1.4.2 + - libgcc-devel_linux-64=12.2.0 - libgcc-ng=12.2.0 + - libgcrypt=1.10.1 - libgd=2.3.3 - libgfortran-ng=12.2.0 - libgfortran5=12.2.0 - - libglib=2.74.1 + - libglib=2.76.3 - libgomp=12.2.0 - - libhwloc=2.8.0 + - 
libgpg-error=1.46 + - libhwloc=2.9.1 - libiconv=1.17 - libjemalloc=5.3.0 - liblapack=3.9.0 - liblapacke=3.9.0 - - libllvm13=13.0.1 + - libllvm15=15.0.7 - libnghttp2=1.51.0 - libnsl=2.0.0 - libogg=1.3.4 - libopenblas=0.3.21 - libopus=1.3.1 - libpng=1.6.39 - - libpq=14.5 - - libsanitizer=10.4.0 - - libsndfile=1.0.31 - - libsqlite=3.41.2 + - libpq=15.1 + - libsanitizer=12.2.0 + - libsndfile=1.2.0 + - libsqlite=3.42.0 - libssh2=1.10.0 - - libstdcxx-devel_linux-64=10.4.0 + - libstdcxx-devel_linux-64=12.2.0 - libstdcxx-ng=12.2.0 + - libsystemd0=252 - libtiff=4.4.0 - libtool=2.4.7 - libudev1=253 @@ -178,9 +181,10 @@ dependencies: - libwebp=1.2.4 - libwebp-base=1.2.4 - libxcb=1.13 - - libxkbcommon=1.0.3 - - libxml2=2.9.14 + - libxkbcommon=1.5.0 + - libxml2=2.10.3 - libzlib=1.2.13 + - lz4-c=1.9.4 - lzo=2.10 - lzstring=1.0.4 - make=4.3 @@ -191,6 +195,7 @@ dependencies: - matplotlib-base=3.7.1 - matplotlib-inline=0.1.6 - mdurl=0.1.0 + - mpg123=1.31.3 - multiqc=1.14 - munkres=1.1.4 - mysql-common=8.0.32 @@ -198,62 +203,26 @@ dependencies: - mysql-libs=8.0.32 - natsort=8.4.0 - nbformat=5.8.0 - - ncbi-vdb=3.0.2 - ncurses=6.3 - networkx=3.1 - nspr=4.35 - nss=3.89 - numpy=1.23.5 - - openjdk=11.0.1 + - openjdk=17.0.3 - openjpeg=2.5.0 - openssl=1.1.1t - - ossuuid=1.6.2 - packaging=23.1 - pandas=2.0.1 - pandoc=3.1.2 - - pango=1.50.7 + - pango=1.50.14 - parso=0.8.3 - patsy=0.5.3 - pbzip2=1.1.13 - - pcre2=10.37 + - pcre2=10.40 - perl=5.32.1 - - perl-alien-build=2.48 - - perl-alien-libxml2=0.17 - - perl-business-isbn=3.007 - - perl-business-isbn-data=20210112.006 - - perl-capture-tiny=0.48 - - perl-carp=1.50 - - perl-constant=1.33 - - perl-data-dumper=2.183 - - perl-encode=3.19 - - perl-exporter=5.74 - - perl-extutils-makemaker=7.70 - - perl-ffi-checklib=0.28 - - perl-file-chdir=0.1011 - - perl-file-path=2.18 - - perl-file-temp=0.2304 - - perl-file-which=1.24 - perl-gd=2.76 - perl-gdgraph=1.54 - perl-gdtextutil=0.86 - - perl-importer=0.026 - - perl-mime-base64=3.16 - - perl-parent=0.241 - - perl-path-tiny=0.124 - - perl-pathtools=3.75 - - perl-scope-guard=0.21 - - perl-storable=3.15 - - perl-sub-info=0.002 - - perl-term-table=0.016 - - perl-test-fatal=0.016 - - perl-test-warnings=0.031 - - perl-test2-suite=0.000145 - - perl-try-tiny=0.31 - - perl-uri=5.17 - - perl-xml-libxml=2.0207 - - perl-xml-namespacesupport=1.12 - - perl-xml-sax=1.02 - - perl-xml-sax-base=1.09 - pexpect=4.8.0 - picard=2.27.5 - pickleshare=0.7.5 @@ -266,6 +235,7 @@ dependencies: - platformdirs=3.5.1 - plotly=5.14.1 - pluggy=1.0.0 + - ply=3.11 - pooch=1.7.0 - preseq=3.2.0 - prompt-toolkit=3.0.38 @@ -274,7 +244,7 @@ dependencies: - pthread-stubs=0.4 - ptyprocess=0.7.0 - pulp=2.7.0 - - pulseaudio=14.0 + - pulseaudio=16.1 - pure_eval=0.2.2 - py2bit=0.3.0 - pybedtools=0.9.0 @@ -282,57 +252,59 @@ dependencies: - pycparser=2.21 - pyfaidx=0.7.2.1 - pygments=2.15.1 - - pyopenssl=23.1.1 - pyparsing=3.0.9 - - pyqt=5.15.4 - - pyqt5-sip=12.9.0 + - pyqt=5.15.7 + - pyqt5-sip=12.11.0 - pyrsistent=0.19.3 - - pysam=0.20.0 + - pysam=0.21.0 - pysocks=1.7.1 - pytest=7.3.1 - - pytest-xdist=3.2.1 + - pytest-xdist=3.3.1 - python=3.10.8 - python-dateutil=2.8.2 - - python-fastjsonschema=2.16.3 + - python-fastjsonschema=2.17.1 - python-isal=1.1.0 - - python-lzo=1.14 + - python-lzo=1.15 - python-tzdata=2023.3 - python_abi=3.10 - pytz=2023.3 - pyvcf3=1.0.3 - pyyaml=6.0 - - qt-main=5.15.2 - - r-base=4.1.3 + - qt-main=5.15.6 + - r-base=4.2.2 - readline=8.2 - - requests=2.29.0 + - requests=2.31.0 - reretry=0.11.8 - rich=13.3.5 - rich-click=1.6.1 - rseqc=5.0.1 + - 
rust-bio-tools=0.42.0 - salmon=1.10.1 - - samtools=1.16.1 + - samtools=1.17 - scipy=1.10.1 - seaborn=0.12.2 - seaborn-base=0.12.2 - sed=4.8 - setuptools=67.7.2 - simplejson=3.19.1 - - sip=6.5.1 + - sip=6.7.9 - six=1.16.0 - smart_open=6.3.0 - smmap=3.0.5 - - snakemake-minimal=7.25.3 + - snakemake-minimal=7.26.0 + - snpeff=5.1 + - snpsift=5.1 - spectra=0.0.11 - - sqlite=3.41.2 - - sra-tools=3.0.3 + - sra-tools=2.9.6 - stack_data=0.6.2 - star=2.7.10b + - starcode=1.4 - statsmodels=0.14.0 - stopit=1.1.2 - subread=2.0.3 - sysroot_linux-64=2.12 - tabulate=0.9.0 - - tbb=2021.7.0 + - tbb=2021.9.0 - tenacity=8.2.2 - throttler=1.2.1 - tk=8.6.12 @@ -343,35 +315,47 @@ dependencies: - tornado=6.3.2 - trackhub=0.2.4 - traitlets=5.9.0 - - typing-extensions=4.5.0 - - typing_extensions=4.5.0 + - typing-extensions=4.6.1 + - typing_extensions=4.6.1 - tzdata=2023c - - ucsc-bedgraphtobigwig=377 + - ucsc-bedgraphtobigwig=445 - ucsc-bedsort=377 - - ucsc-bedtobigbed=377 + - ucsc-bedtobigbed=447 - ucsc-bigwigmerge=377 - ucsc-fetchchromsizes=377 - - ucsc-genepredtobed=377 - - ucsc-gtftogenepred=377 - - ucsc-liftover=377 + - ucsc-genepredtobed=447 + - ucsc-gtftogenepred=447 + - ucsc-liftover=447 - ucsc-oligomatch=377 - - ucsc-twobittofa=377 - - ucsc-wigtobigwig=377 + - ucsc-twobittofa=447 + - ucsc-wigtobigwig=447 - unicodedata2=15.0.0 - - urllib3=1.26.15 + - urllib3=2.0.2 - wcwidth=0.2.6 - wheel=0.40.0 - wrapt=1.15.0 + - xcb-util=0.4.0 + - xcb-util-image=0.4.0 + - xcb-util-keysyms=0.4.0 + - xcb-util-renderutil=0.3.9 + - xcb-util-wm=0.4.1 + - xkeyboard-config=2.38 - xopen=1.7.0 + - xorg-fixesproto=5.0 + - xorg-inputproto=2.3.2 - xorg-kbproto=1.0.7 - xorg-libice=1.0.10 - xorg-libsm=1.2.3 - xorg-libx11=1.8.4 - - xorg-libxau=1.0.9 + - xorg-libxau=1.0.11 - xorg-libxdmcp=1.1.3 - xorg-libxext=1.3.4 + - xorg-libxfixes=5.0.3 + - xorg-libxi=1.7.10 - xorg-libxrender=0.9.10 - xorg-libxt=1.2.1 + - xorg-libxtst=1.2.3 + - xorg-recordproto=1.14.2 - xorg-renderproto=0.11.1 - xorg-xextproto=7.3.0 - xorg-xproto=7.0.31 diff --git a/include/requirements.txt b/include/requirements.txt index 6001f6d5..188dbe28 100644 --- a/include/requirements.txt +++ b/include/requirements.txt @@ -1,13 +1,17 @@ +bcftools>=1.15.1 bedtools biopython bowtie bowtie2 +bwa +curl cutadapt>=3.0 deeptools fastq-screen fastqc font-ttf-dejavu-sans-mono gat +gatk4 gffread gffutils hisat2 @@ -29,13 +33,15 @@ pytest pytest-xdist python>=3.10 rseqc +rust-bio-tools # earlier versions of salmon can segfault on Slurm salmon>=1.10.1 - samtools seaborn snakemake-minimal +snpeff +snpsift sra-tools star subread diff --git a/lib/aligners.py b/lib/aligners.py index 62fe58a5..629a3eb1 100644 --- a/lib/aligners.py +++ b/lib/aligners.py @@ -83,3 +83,28 @@ def fastq_arg_from_input(fastqs): fastqs = '-1 {0} -2 {1} '.format(*fastqs) return fastqs +def bwa_index_from_prefix(prefix): + """ + Given a prefix, return a list of the corresponding bwa index files + """ + ext_list = ["amb", "ann", "bwt", "pac", "sa"] + return ['{prefix}.{ext}'.format(prefix=prefix, ext=ext_list[i]) for i in range(len(ext_list))] + +def bwa_prefix_from_index(index_files): + """ + Given a list of index files, return the corresponding prefix + """ + if isinstance(index_files, str): + return '.'.join(index_files.split('.')[:-1]) + else: + prefixes = list( + set( + map( + lambda x: '.'.join(x.split('.')[:-1]), index_files) + ) + ) + if len(prefixes) != 1: + raise ValueError( + "More than one prefix detected from '{0}'".format(prefixes) + ) + return prefixes[0] diff --git a/lib/common.py b/lib/common.py index 
829cc129..7ed03cc7 100644 --- a/lib/common.py +++ b/lib/common.py @@ -419,6 +419,7 @@ def references_dict(config): 'bowtie2': aligners.bowtie2_index_from_prefix('')[0], 'hisat2': aligners.hisat2_index_from_prefix('')[0], 'star': '/Genome', + 'bwa': aligners.bwa_index_from_prefix('')[0], # Notes on salmon indexing: # - pre-1.0 versions had hash.bin @@ -451,13 +452,39 @@ def references_dict(config): type_extensions = { 'genome': 'fasta', 'annotation': 'gtf', - 'transcriptome': 'fasta' + 'transcriptome': 'fasta', + 'known': 'vcf.gz' } for organism in merged_references.keys(): d[organism] = {} for tag in merged_references[organism].keys(): e = {} + if tag == 'variation': + # variation databases should be the the keys of a dictionary + # containing a URL and postprocess block + for type_ in merged_references[organism][tag].keys(): + ext = '.vcf.gz' + if type_ == 'dbnsfp': + type_ = merged_references[organism][tag][type_]['version'] + '_' + merged_references[organism][tag][type_]['build'] + e[type_] = ( + '{references_dir}/' + '{organism}/' + '{tag}/' + '{type_}/' + '{organism}_{tag}{ext}'.format(**locals()) + ) + d[organism][tag] = e + continue + e[type_] = ( + '{references_dir}/' + '{organism}/' + '{tag}/' + '{type_}/' + '{organism}_{tag}{ext}'.format(**locals()) + ) + d[organism][tag] = e + continue for type_, block in merged_references[organism][tag].items(): if type_ == 'metadata': continue @@ -537,7 +564,8 @@ def references_dict(config): .format(**locals()) ) - # Only makes sense to have chromsizes for genome fasta, not transcriptome. + # Only makes sense to have chromsizes and faidx for genome + # fasta, not transcriptome. if type_ == 'genome': e['chromsizes'] = ( '{references_dir}/' @@ -546,6 +574,16 @@ def references_dict(config): '{type_}/' '{organism}_{tag}.chromsizes'.format(**locals()) ) + e['faidx'] = ( + '{references_dir}/' + '{organism}/' + '{tag}/' + '{type_}/' + '{organism}_{tag}.fai'.format(**locals()) + ) + + + d[organism][tag] = e return d, conversion_kwargs @@ -912,3 +950,4 @@ def gff2gtf(gff, gtf): shell('gzip -d -S .gz.0.tmp {gff} -c | gffread - -T -o- | gzip -c > {gtf}') else: shell('gffread {gff} -T -o- | gzip -c > {gtf}') + diff --git a/lib/helpers.smk b/lib/helpers.smk new file mode 100644 index 00000000..8c146ba8 --- /dev/null +++ b/lib/helpers.smk @@ -0,0 +1,387 @@ +import pandas as pd +import yaml +import os + +# Read sample table +samples = pd.read_table(config["samples"], dtype=str).set_index("sample", drop=False) +units = pd.read_table(config["units"], dtype=str).set_index(["sample","unit"], drop=False) +units.index = units.index.set_levels([i for i in units.index.levels]) + +def preflight(): + """ + This helper function gets called at the top of the main Snakefile. It + handles reading the config to see if references are provided externally, or + if we are relying on lcdb-wf references. Returns variables containing + filepaths of references to be used in rules. It will also perform some + checks to make sure the config is not contradicting itself under certain + configurations. 
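+
+    A minimal sketch of the externally-supplied-references layout this function
+    reads (the keys are taken from the code below; the paths are purely
+    illustrative, not shipped defaults):
+
+        ref:
+          use_references_workflow: false
+          paths:
+            ref: /path/to/genome.fa
+            index: /path/to/genome.fa.fai
+            known: /path/to/known_sites.vcf.gz
+            dbnsfp: ''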
+ """ + aln_index = [] + dbnsfp = [] + dictionary = [] + indexed = [] + known_sites = [] + reference = [] + + # Handle reference names if LCDB-WF References is ran + if config['ref']['use_references_workflow']: + include: '../references/Snakefile' + refdict = common.references_dict(config) + reference = refdict[config['ref']['organism'][config['ref']['genome']['tag']]['genome']] + aln = refdict[config['ref']['organism'][config['ref']['aligner']['tag']]['bwa']] + aln_index = multiext(os.path.splitext(aln)[0], ".amb", ".ann", ".bwt", ".pac", ".sa") + indexed = refdict[config['ref']['organism'][config['ref']['faidx']['tag']]['faidx']] + if config['ref']['variation']['dbnsfp']: + # The config can supply a path to a local file in the variation slots + if config['ref']['variation']['dbnsfp'].startswith('/'): + dbnsfp = config['ref']['variation']['dbnsfp'] + else: + dbnsfp = refdict[config['ref']['organism']]['variation'][str( + config['ref']['variation']['dbnsfp'] + '_' + config['ref']['genome']['build'] + )] + else: + dbnsfp = [] + if config['ref']['variation']['known']: + if config['ref']['variation']['known'].startswith('/'): + known_sites = config['ref']['variation']['known'] + else: + known_sites = refdict[config['ref']['organism']][config['ref']['genome']['tag']][config['ref']['variation']['known']] + else: + known_sites = [] + else: + known_sites = ( + config['ref']['paths']['known'] + if config['ref']['paths']['known'] + else [] + ) + reference = config['ref']['paths']['ref'] + indexed = ( + config['ref']['paths']['index'] + if config['ref']['paths']['index'] + else reference + '.fai' + ) + dbnsfp = ( + config['ref']['paths']['dbnsfp'] + if config['ref']['paths']['dbnsfp'] + else [] + ) + aln_index = [] + + # Handle dictionary name, stop the workflow if the fasta file is not named properly. Stop the workflow if there is no reference + if reference == []: + raise ValueError("You must supply a reference file to workflow.") + if reference.endswith('.gz'): + dictionary = '.'.join(reference.split('.')[:-2]) + '.dict' + else: + try: + dictionary ='.'.join(reference.split('.')[:-1]) + '.dict' + # If there is no exception, python will raise a TypeError trying to concatenate an empty list with str + except TypeError: + raise ValueError("There is something wrong with your reference extension. " + "Please make sure your reference has an extension") + # Stop the workflow easily if there is no known variation, but bqsr is set in the config + if config['filtering']['bqsr'] == True: + assert known_sites != [], 'Check your config.yaml. You are requiring that bqsr be run, but there is no known sites vcf' + + return aln_index, dbnsfp, dictionary, indexed, known_sites, reference + + +def get_contigs(): + """ + Helper function to read the contigs from the fasta index checkpoint rule. + These contigs define the regions to split variant calling by for joint-calling. + """ + with checkpoints.genome_index.get().output[0].open() as fai: + ser = pd.read_table(fai, header=None, usecols=[0], dtype=str) + ser = ser.squeeze() + # TODO: make this less brittle, and better support non-Ensembl organisms + # Remove all contigs that don't correspond to a chromosome + ser = ser[ser.apply(lambda x: len(x)) <= 2] + # Remove mitochondiral if specified in the config + if config["processing"]["remove-mitochondrial"]: + return ser[ser != "MT"] + else: + return ser + + +def get_fastq(wildcards): + """ + Get fastq files of given sample-unit. Sample-unit structure is how technical replicates + are handled. 
This is defined in the sampletable. + """ + fastqs = units.loc[(wildcards.sample, wildcards.unit), ["fq1", "fq2"]].dropna() + if len(fastqs) == 2: + return {"r1": fastqs.fq1, "r2": fastqs.fq2} + return {"r1": fastqs.fq1} + + +def get_read_group(wildcards): + """Denote sample name and platform in read group.""" + return "-R '@RG\\tID:{sample}\\tSM:{sample}\\tPL:{platform}'".format( + sample=wildcards.sample, + platform=units.loc[(wildcards.sample, wildcards.unit), "platform"], + ) + + +def get_recal_input(bai=False): + """ + Handle providing bams the input of the bqsr rules. + Read config options to determine the appropriate bam and bam index files. + If we don't remove duplicates, return the sorted bams from map reads rule. + If duplicates are removed, return the deduplicated bams from the mark duplicates rule. + If a bed file is used in variant calling + """ + # Case 1: no duplicate removal + f = "results/mapped/{sample}-{unit}.sorted.bam" + if config["processing"]["remove-duplicates"]: + # Case 2: remove duplicates + f = "results/dedup/{sample}-{unit}.bam" + if bai: + if config["processing"]["restrict-regions"]: + # Case 3: need an index because random access is required + f += ".bai" + return f + else: + # Case 4: no index needed + return [] + else: + return f + + +def get_sample_bams(wildcards): + """ + Get all aligned reads of given sample. Return the recal bams if bqsr is run + otherwise return the dedup bams. We return all units for a given sample because + we want to provide technical replicates to the variant calling rule where this is called + """ + unitlist = units.loc[wildcards.sample].unit + reslist = [] + if config['filtering']['bqsr']: + reslist.extend( + [ + "results/recal/{}-{}.bam".format(wildcards.sample, unit) for unit in unitlist + ] + ) + else: + reslist.extend( + [ + "results/dedup/{}-{}.bam".format(wildcards.sample, unit) for unit in unitlist + ] + ) + + return reslist + + +def get_sample_unit_bams(wildcards): + """ + Get all aligned reads of given sample. Unlike the function above, we return a single sample-unit combination per function call. + This is because this function is used to QC rules like samtools-stats where we do not want to combine technical replicates. + Return the recal bams if bqsr is run otherwise return the dedup bams + """ + reslist = '' + if config['filtering']['bqsr']: + reslist = "results/recal/{sample}-{unit}.bam".format(sample=wildcards.sample, unit=wildcards.unit) + else: + reslist = "results/dedup/{sample}-{unit}.bam".format(sample=wildcards.sample, unit=wildcards.unit) + return reslist + + +def get_regions_param(regions=config["processing"]["restrict-regions"], default=""): + """ + If a captured regions bedfile is present, split the variant calling up into regions + follwing GATK best practices + """ + if regions: + params = "--intervals '{}' ".format(regions) + padding = config["processing"].get("region-padding") + if padding: + params += "--interval-padding {}".format(padding) + return params + return default + + +def get_call_variants_params(wildcards, input): + """ + Calls the previous function to assemble the regions into interval lists + along with any specified parameters for variant calling in the config + """ + return ( + get_regions_param( + regions=input.regions, default="--intervals {}".format(wildcards.contig) + ) + ) + + +def set_java_opts(resources): + """ + Using the resources directive from the snakemake rule + set the heap size. Request 75 percent of the requested + mem_mb. 
The remaining 25 percent should be enough for + OS and other system processes that occur outside the shell command + """ + heap = int(resources.mem_mb * 0.75) + heap = int(heap / 1024) + java_temp ='''"-Xmx{}g -Djava.io.tmpdir=$TMPDIR\"''' + java_opts = java_temp.format(heap) + return java_opts + +def all_input_mutect2(): + """ + Format the input for the all rule for mutect2 + """ + comparisons = config['mutect2'].keys() + return expand("results/mutect2_annotated_normed/{comp}.vcf.gz", comp=comparisons) + + +def names_for_somatic(wildcards): + """ + Format the names into arguments to pass to mutect2. + Mutect2 requires you to specify the names of the "normal" samples. + There can be multiple normal samples in a single mutect2 call. + Tumor samples do not need to be named. This will be done by reading + from the config. + """ + comp = wildcards.comp + normals = config['mutect2'][comp]['normal'] + if not isinstance(normals, list): + normals = [normals] + return normals + + +def input_for_somatic(wildcards): + """ + Format the bam input for mutect2 by reading from the config. + Technical replicates are separated and grouped. Returns a dictionary + contains the reference genome, sequence dictionary, and input bams + """ + comp = wildcards.comp + normals = config['mutect2'][comp]['normal'] + if not isinstance(normals, list): + normals = [normals] + tumors = config['mutect2'][comp]['tumor'] + if not isinstance(tumors, list): + tumors = [tumors] + # Fill these lists with paths to tumor and normal files + t_files = [] + n_files = [] + for i in range(len(tumors)): + # Get the unit for each tumor sample + unitlist = units.loc[tumors[i]].unit + if config['filtering']['bqsr']: + t_files.extend( + [ + "results/recal/{}-{}.bam".format(tumors[i], unit) for unit in unitlist + ] + ) + else: + t_files.extend( + [ + "results/dedup/{}-{}.bam".format(tumors[i], unit) for unit in unitlist + ] + ) + # Do the same for Normals + for i in range(len(normals)): + unitlist = units.loc[normals[i]].unit + if config['filtering']['bqsr']: + n_files.extend( + [ + "results/recal/{}-{}.bam".format(normals[i], unit) for unit in unitlist + ] + ) + else: + n_files.extend( + [ + "results/dedup/{}-{}.bam".format(normals[i], unit) for unit in unitlist + ] + ) + + + # Put all the input files needed into a dictionary to pass to the rule + d = dict( + ref=reference, + normals=n_files, + tumors=t_files, + dict=dictionary, + regions=( + "results/called/{contig}.regions.bed".format(contig = wildcards.contig) + if config["processing"]["restrict-regions"] + else [] + ), + ) + return d + + +def get_fai_nomenclature(): + """ + Helper function to get the nomenclature of the fasta index + Returns True if the chr prefix is present, and False if it is absent + """ + nom = False + with checkpoints.genome_index.get().output[0].open() as fai: + for line in fai: + if line.startswith('chr'): + nom = True + break + return nom + + +def get_bed_nomenclature(input): + """ + Helper function to get the nomenclature of the bedfile + Returns True if the chr prefix is present, and False if it is absent + """ + nom = False + with open(input.bed, 'r') as f: + for line in f: + if line.startswith('browser'): + continue + if line.startswith('track'): + continue + if line.startswith('chr'): + nom = True + break + return nom + + +def snpeff_input(wildcards): + """ + Ensure all correct input files for snpEff rule are available + """ + dbnsfp = ( + config['ref']['paths']['dbnsfp'] + if config['ref']['paths']['dbnsfp'] + else [] + ) + dbnsfp_tbi = dbnsfp + '.tbi' + 
vcf = 'results/filtered/all.normed.vcf.gz' + + # make dictionary containing the required snpEff input files + d = dict( + dbnsfp=dbnsfp, + dbnsfp_tbi=dbnsfp_tbi, + vcf=vcf, + ) + return d + + +def snpeff_cancer_input(wildcards): + """ + Ensure all correct input files for snpEff cancer rule are available + """ + dbnsfp = ( + config['ref']['paths']['dbnsfp'] + if config['ref']['paths']['dbnsfp'] + else [] + ) + dbnsfp_tbi = dbnsfp + '.tbi' + vcf = 'results/somatic_filtered/normed.{comp}.vcf.gz'.format(comp = wildcards.comp) + + # make dictionary containing the required snpEff cancer input files + d = dict( + dbnsfp=dbnsfp, + dbnsfp_tbi=dbnsfp_tbi, + vcf=vcf, + ) + return d + +# vim: ft=python diff --git a/test/lcdb-wf-test b/test/lcdb-wf-test index 1edb2576..3e228302 100755 --- a/test/lcdb-wf-test +++ b/test/lcdb-wf-test @@ -105,6 +105,8 @@ class Runner(object): %(prog)s rnaseq --downstream %(prog)s chipseq --run-workflow %(prog)s references --run-workflow --configfile=config/config.yaml + %(prog)s variantcalling --run-workflow + DATA ---- @@ -180,7 +182,7 @@ class Runner(object): parser.add_argument( "--kind", default="all", - choices=["all", "rnaseq", "chipseq"], + choices=["all", "rnaseq", "chipseq", "variantcalling"], help="Kind of data to download", ) parser.add_argument( @@ -194,8 +196,22 @@ class Runner(object): args = parser.parse_args(sys.argv[2:]) - repo = "lcdb-test-data" - URL = f"https://github.com/lcdb/{repo}/blob/{args.branch}/data/{{}}?raw=true" + # Create a repo lookup for the different assays + # For variantcalling, the `args.branch` should be "main" instead of "master", unless we can fix this + repo_lookup = { + 'rnaseq': { + 'repo': "lcdb-test-data", + 'URL': f"https://github.com/lcdb/{{repo}}/blob/{args.branch}/data/{{}}?raw=true" + }, + 'chipseq': { + 'repo': "lcdb-test-data", + 'URL': f"https://github.com/lcdb/{{repo}}/blob/{args.branch}/data/{{}}?raw=true" + }, + 'variantcalling': { + 'repo': 'lcdb-wf-variant-calling-test-data', + 'URL': f"https://github.com/lcdb/{{repo}}/blob/{args.branch}/data/{{}}?raw=true" + } + } # This dict maps files in the `data` directory of test-data repo to # a local path to which it should be downloaded, as expected by the @@ -269,6 +285,41 @@ class Runner(object): "workflows/chipseq/data/example_data/chipseq_ip4.fq.gz", ), ], + "variantcalling": [ + ( + "GRCh38.6.20.fa.gz", + "workflows/variant-calling/references/GRCh38.6.20.fa.gz", + ), + ( + "known_variation_noiupac.vcf.gz", + "workflows/variant-calling/references/known_variation_noiupac.vcf.gz" + + ), + ( + "normal_R1.6.20.fq.gz", + "workflows/variant-calling/data/example_data/normal_R1.fq.gz" + ), + ( + "normal_R2.6.20.fq.gz", + "workflows/variant-calling/data/example_data/normal_R2.fq.gz" + ), + ( + "tumor_R1.6.20.fq.gz", + "workflows/variant-calling/data/example_data/tumor_R1.fq.gz" + ), + ( + "tumor_R2.6.20.fq.gz", + "workflows/variant-calling/data/example_data/tumor_R2.fq.gz" + ), + ( + "dbnsfp_6_20.vcf.gz", + "workflows/variant-calling/references/dbnsfp_6_20.vcf.gz" + ), + ( + "dbnsfp_6_20.vcf.gz.tbi", + "workflows/variant-calling/references/dbnsfp_6_20.vcf.gz.tbi" + ), + ] } if args.kind == "all": @@ -277,12 +328,12 @@ class Runner(object): kinds = [args.kind] for kind in kinds: for fn, dest in data_files[kind]: - url = URL.format(fn) + url = repo_lookup[kind]['URL'].format(fn, repo=repo_lookup[kind]['repo']) if args.verbose: print(f"downloading {url}") if dest is None: dest = fn - dest = Path(dest) + dest = Path(dest).resolve() dest.parent.mkdir(parents=True, exist_ok=True) 
sp.run( f"wget -q -O- {url} > {dest}", shell=True, check=True, cwd=TOPLEVEL @@ -575,6 +626,44 @@ class Runner(object): executable="/bin/bash" ) + def _cmd_variantcalling(self): + """ + This function handles the "variantcalling" subcommand. + """ + + parser = argparse.ArgumentParser( + description="Run variant calling workflow and downstream tests", + parents=[self.global_parser], + ) + parser.add_argument( + "--run-workflow", + action="store_true", + help="""Run variant workflow using run_tesh.sh, which runs preprocess.py + on the snakefile, converting it to a test file to be run.""", + ) + + + workflow_prefix = "bash run_test.sh" + workflow_dir = TOPLEVEL / "workflows/variant-calling" + args, extra = parser.parse_known_args(sys.argv[2:]) + + if args.run_workflow: + print(args) + extra = [i.replace("__ORIG__", args.orig) for i in extra] + strargs = " ".join(extra) + cmd = ( + 'eval "$(conda shell.bash hook)" ' + f"&& conda activate {args.env} " + f"&& (cd {workflow_dir} && {workflow_prefix} {strargs})" + ) + print_header(f"Running the following command:\n{cmd}") + sp.run( + cmd, + check=True, + shell=True, + executable="/bin/bash" + ) + if __name__ == "__main__": Runner() diff --git a/test/test_configs/variant-calling.yaml b/test/test_configs/variant-calling.yaml new file mode 100644 index 00000000..d768eb05 --- /dev/null +++ b/test/test_configs/variant-calling.yaml @@ -0,0 +1,30 @@ +references_dir: "references" +references: + human: + ensembl-104: + metadata: + build: 'GRCh38' + release: 104 + species: 'homo_sapiens' + genome: + url: 'https://github.com/lcdb/lcdb-wf-variant-calling-test-data/raw/master/data/GRCh38.6.20.fa.gz' + # URL format is 'ftp://ftp.ensembl.org/pub/{branch}release-{release}/fasta/{species}/{datatype}/{species_capitalized}.{build}.{datatype}.{assembly}.{suffix}' + # When using GRCh37, branch changes to "grch37/release-{release}" + # always use primary_assembly for human, NEVER use top_level for assembly for human + indexes: + - 'bwa' + known: + # You can download structural_variations, somatic, or "all" which corresponds to germline known variation for all chromosomes + type: 'all' + # Comment the variation key out if not requiring dbnsfp + #variation: + # Download of variation databases will be handled by a unique rule in the Snakefile + # ONLY include keys like 'dbnsfp' IF you intend to download them, comment out these keys if not. 
+ #dbnsfp: + # The version of the database should be correctly formatted like this + #version: 'dbNSFPv4.4' + # The url is found by copying the link address of the latest version found here: https://sites.google.com/site/jpopgen/dbNSFP + #url: 'https://usf.box.com/shared/static/bvfzmkpgtphvbmmrvb2iyl2jl21o49kc' + # Match the build to the metadata block above + #build: 'GRCh38' + diff --git a/workflows/references/Snakefile b/workflows/references/Snakefile index d6bc9d0f..0f956d85 100644 --- a/workflows/references/Snakefile +++ b/workflows/references/Snakefile @@ -5,7 +5,8 @@ import gzip import yaml import importlib import tempfile -import pandas +from tempfile import TemporaryDirectory +import pandas as pd from snakemake.utils import makedirs from lib.imports import resolve_name from lib import utils @@ -61,6 +62,29 @@ rule unzip: '{references_dir}/logs/{organism}/{tag}/{_type}/{organism}_{tag}.{_ext}.log' shell: 'gunzip -c {input} > {output}' +rule bwa_index: + """ + Build bwa index + """ + input: + '{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fasta' + output: + protected(aligners.bwa_index_from_prefix('{references_dir}/{organism}/{tag}/genome/bwa/{organism}_{tag}')) + log: + '{references_dir}/logs/{organism}/{tag}/genome/bwa/{organism}_{tag}.log' + resources: + runtime=autobump(hours=3), + mem_mb=gb(24), + disk_mb=gb(24) + run: + prefix=aligners.bwa_prefix_from_index(output) + print(prefix) + shell( + 'bwa index ' + '-p {prefix} ' + '-a bwtsw ' + '{input} ' + '&> {log}') rule bowtie2_index: """ @@ -353,7 +377,7 @@ rule mappings: d['__featuretype__'] = ft res.append(d) - df = pandas.DataFrame(res) + df = pd.DataFrame(res) # Depending on how many attributes there were and the # include_featuretypes settings, this may take a while. @@ -364,4 +388,176 @@ rule mappings: # Restore original setting gffutils.constants.always_return_list = orig_setting + +checkpoint genome_index: + """ + Build fasta index. 
GATK uses this file for fast random access to the fasta, and the index is
+    also used to identify the chromosomes and contigs in the genome so that the
+    appropriate known-variation file can be downloaded.
+    """
+    input:
+        '{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fasta'
+    output:
+        protected('{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fai')
+    log:
+        '{references_dir}/logs/{organism}/{tag}/genome/{organism}_{tag}.fai.log'
+    resources:
+        runtime=hours(1),
+        mem_mb=gb(4)
+    run:
+        shell(
+            'samtools '
+            'faidx '
+            # write the index to the declared output path rather than next to the input
+            '--fai-idx {output} {input} '
+            '&> {log}'
+        )
+
+
+rule known_variation:
+    """
+    Download the known-variation VCFs from the Ensembl FTP site (split by
+    chromosome for recent human releases) and combine them into a single known
+    variation vcf.
+    """
+    input:
+        fai='{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fai'
+    output:
+        protected('{references_dir}/{organism}/{tag}/known/{organism}_{tag}.vcf.gz')
+    log:
+        '{references_dir}/{organism}/{tag}/known/{organism}_{tag}.known.log'
+    resources:
+        runtime=hours(4),
+        mem_mb=gb(16),
+        disk_mb=gb(32)
+    run:
+        release = int(config['references'][wildcards.organism][wildcards.tag]['metadata']['release'])
+        species = config['references'][wildcards.organism][wildcards.tag]['metadata']['species']
+        build = config['references'][wildcards.organism][wildcards.tag]['metadata']['build']
+        typ = config['references'][wildcards.organism][wildcards.tag]['known']['type']
+
+        # ----------------------------------------------------------------------
+        # NOTE: species-specific configuration may be required
+        # ----------------------------------------------------------------------
+        #
+        # Ensembl has many species available, but the code below is designed to
+        # work for human.
+        #
+        # For available species, see https://ftp.ensembl.org/pub/release-109/variation/vcf/
+        #
+
+        # Human-specific patching to deal with GRCh37 filenames and directory
+        # structure on Ensembl FTP
+        branch = ""
+        if release >= 81 and build == "GRCh37":
+            branch = "grch37/"
+        if typ == "all":
+            suffixes = [""]
+
+            # Starting in release 93, human germline VCFs are split by chrom
+            if species == "homo_sapiens" and release >= 93:
+                ser = (
+                    pd.read_table(input.fai, header=None, usecols=[0], dtype=str)
+                    .squeeze()
+                )
+                contigs = ser[ser.apply(lambda x: len(x)) <= 2]
+                suffixes = expand("-chr{chrom}", chrom=contigs)
+
+        elif typ == "somatic":
+            suffixes = ["_somatic"]
+        elif typ == "structural_variations":
+            suffixes = ["_structural_variations"]
+        species = species.lower()
+
+        # Prior to release 91, species names started with a capital letter
+        species_filename = species if release >= 91 else species.capitalize()
+
+        urls = expand(
+            "ftp://ftp.ensembl.org/pub/{branch}release-{release}/variation/vcf/{species}/{species_filename}{suffix}.{ext}",
+            release=release,
+            species=species,
+            suffix=suffixes,
+            species_filename=species_filename,
+            branch=branch,
+            ext=["vcf.gz", "vcf.gz.csi"]
+        )
+        vcfs = [os.path.basename(url) for url in urls if url.endswith(".gz")]
+
+        # Compose a command that downloads all vcf.gz files (possibly one for
+        # each contig in human Ensembl releases >=93)
+        gather = "curl {urls}".format(urls=" ".join(map("-O {}".format, urls)))
+
+        # We'll be downloading in a temp dir, so get absolute paths to make things easier
+        fai = os.path.abspath(input.fai)
+        log = os.path.abspath(str(log))
+        with tempfile.TemporaryDirectory() as tmpdir:
+            if input.get("fai"):
+                shell(
+                    "cd {tmpdir}; {gather} 2> {log} "
+                    "&& bcftools concat -Oz --naive-force {vcfs} > concat.vcf.gz 2>>
{log}" + ) + shell( + "bcftools reheader --fai {fai} {tmpdir}/concat.vcf.gz> {output} 2>> {log} " + "&& tabix -p vcf {output} 2>> {log} " + ) + +#if config['references']['human'].get('variation'): +# rule dbnsfp: +# """ +# Download and process dbNSFP database. This involves downloading and +# extracting the zip file, then combining the chromosomes to create +# a single file. For genome builds like hg19 and GRCh37, some processing +# needs to be done to make them compatible with dbNSFP version > 3.X +# dbNSFP is only for human genomes. +# """ +# output: +# protected( +# '{{references_dir}}/{{organism}}/{{tag}}/{dbnsfp_version}_{build}/{{organism}}_{{tag}}.vcf.gz'.format( +# dbnsfp_version=config['references']['human']['variation']['dbnsfp']['version'], +# build=config['references']['human']['variation']['dbnsfp']['build'] +# ) +# ) +# log: +# '{{references_dir}}/{{organism}}/{{tag}}/{dbnsfp_version}_{build}/{{organism}}_{{tag}}.log'.format( +# dbnsfp_version=config['references']['human']['variation']['dbnsfp']['version'], +# build=config['references']['human']['variation']['dbnsfp']['build'] +# ) +# resources: +# disk_mb=gb(500), +# mem_mb=gb(500), +# runtime=hours(8) +# threads: 16 +# run: +# version = config['references']['human'][wildcards.tag]['dbnsfp']['version'] +# URL = config['references']['human'][wildcards.tag]['dbnsfp']['url'] +# build = config['references']['human'][wildcards.tag]['dbnsfp']['build'] +# workdir = wildcards.references_dir +# if build == 'GRCh37': +# # We need to process the dbNSFP file to make it compatible with older genomes +# with tempfile.TemporaryDirectory() as tmpdir: +# shell( +# '''(cd {tmpdir}; wget -O- {URL} > dbnsfp.zip && ''' +# '''unzip dbnsfp.zip && zcat dbNSFP*_variant.chr1* | awk "NR<=1" > h && ''' +# '''zgrep -v "^#" dbNSFP*_variant.chr* > all_chrs && ''' +# '''awk '$8 != "." 
' all_chrs > all_chrs_filtered && ''' +# '''sort -S 50% --parallel=12 all_chrs_filtered -k8,8 -k9,9n > all_chrs_filtered_sorted && ''' +# '''cat h all_chrs_filtered_sorted > all_chrs_filtered_sorted_header && ''' +# '''bgzip -c all_chrs_filtered_sorted_header > {output}) && ''' +# '''tabix -s 8 -b 9 -e 9 {output} ''' +# ) +# if build == 'GRCh38': +# with tempfile.TemporaryDirectory() as tmpdir: +# # No need for processing and we can use the first 2 columns for coordinates +# shell( +# '''(cd {tmpdir}; wget -O- {URL} > dbnsfp.zip && ''' +# '''unzip dbnsfp.zip && zcat dbNSFP*_variant.chr1* | awk "NR<=1" > h && ''' +# '''zgrep -v "^#" dbNSFP*_variant.chr* > all_chrs && ''' +# '''sort -S 50% --parallel=24 all_chrs -k1,1 -k2,2n > all_chrs_sorted && ''' +# '''cat h all_chrs_sorted > all_chrs_sorted_header && ''' +# '''bgzip -c all_chrs_sorted_header > {output}) && ''' +# '''tabix -s 1 -b 2 -e 2 {output} ''' +# ) +# + # vim: ft=python diff --git a/workflows/variant-calling/Snakefile b/workflows/variant-calling/Snakefile new file mode 100644 index 00000000..c6850a37 --- /dev/null +++ b/workflows/variant-calling/Snakefile @@ -0,0 +1,938 @@ +import sys +sys.path.insert(0, srcdir('.')) +import pandas as pd +import tempfile +import os +from os import path +import re +from tempfile import TemporaryDirectory,NamedTemporaryFile +from snakemake.shell import shell +import yaml +from textwrap import dedent +from pathlib import Path +from urllib.request import urlretrieve +from zipfile import ZipFile +sys.path.append('../..') +from lib import common, utils, helpers, aligners +from lib.utils import autobump, gb, hours + +configfile: "config/config.yaml" + +include: '../../lib/helpers.smk' + +aln_index, dbnsfp, dictionary, indexed, known_sites, reference = preflight() + +wildcard_constraints: + vartype="snvs|indels", + sample="|".join(samples.index), + unit="|".join(units["unit"]), + comp="|".join(config['mutect2'].keys()) + + +rule all: + input: + "results/annotated/ann.vcf.gz", + "results/qc/multiqc.html", + "results/filtered/all.normed.vcf.gz", + expand("results/somatic_filtered/normed.{comp}.vcf.gz", comp = config['mutect2'].keys()), + expand("results/mutect2_annotated/snpeff.{comp}.vcf.gz", comp = config['mutect2'].keys()), + + + +checkpoint genome_index: + threads: 2 + resources: + mem_mb=gb(4), + runtime=autobump(60) + input: + reference + output: + indexed + log: + 'logs/fasta_index.log' + shell: + "samtools " + "faidx " + "{input} > {output} 2> {log} " + + +rule fasta_dict: + threads: 1 + resources: + mem_mb=gb(4), + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + disk_mb=gb(4), + runtime=autobump(60) + input: + ref=reference + output: dictionary + log: "logs/sequence_dictionary.log" + run: + java_opts = set_java_opts(resources) + shell( + 'picard CreateSequenceDictionary {java_opts} ' + '-R {input.ref} ' + '-O {output} &> {log} ' + ) + + +if not aln_index: + rule bwa_index: + """ + Generate BWA index for the reference genome if we are not using lcdb-wf references workflow + """ + threads: 8 + resources: + disk_mb=gb(24), + mem_mb=gb(24), + runtime=autobump(180) + input: + reference + output: + multiext(reference, ".amb", ".ann", ".bwt", ".pac", ".sa") + log: + "logs/bwa_index.log" + params: + "bwtsw " + shell: + "bwa index -a {params} " + "{input} " + " &> {log}" + + +rule trim_adapters: + threads: 8 + resources: + mem_mb=gb(32), + runtime=autobump(360) + input: unpack(get_fastq), + output: + r1="results/trimmed/{sample}-{unit}.1.fastq.gz", + r2="results/trimmed/{sample}-{unit}.2.fastq.gz", + log: + 
"logs/{sample}-{unit}_trimming.log" + shell: + 'cutadapt ' + '-o {output.r1} ' + '-p {output.r2} ' + '-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA -A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT ' + '--nextseq-trim 20 ' + '--overlap 6 ' + '-j {threads} ' + '--minimum-length 25 ' + '{input.r1} ' + '{input.r2} ' + ' &> {log} ' + + +rule map_reads: + threads: 32 + resources: + disk_mb=gb(40), + mem_mb=gb(48), + runtime=autobump(1920) + input: + reads=["results/trimmed/{sample}-{unit}.1.fastq.gz","results/trimmed/{sample}-{unit}.2.fastq.gz"], + idx=multiext(reference, ".amb", ".ann", ".bwt", ".pac", ".sa"), + output: + bam=temp("results/mapped/{sample}-{unit}.sorted.bam"), + params: + extra=get_read_group, + index=lambda w, input: os.path.splitext(input.idx[0])[0], + log: + "logs/{sample}-{unit}_bwamem.log" + shell: + "bwa mem " + "-t {threads} " + "{params.extra} " + "{params.index} " + "{input.reads} | " + "samtools view -bh | samtools sort -o {output} -O BAM " + "2> {log}" + + +rule mark_duplicates: + """ + If we run bqsr, then we do not need to save the output of mark duplicates, since those bams will + be recalibrated. However, if we don't recalibrate, then we need to save the bams from mark duplicates + and we don't want to mark them as temporary + """ + threads: 4 + # threads: 1 # [ TEST SETTINGS -1 ] + resources: + disk_mb=gb(40), + mem_mb=gb(32), + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + runtime=autobump(720) + input: + bam = "results/mapped/{sample}-{unit}.sorted.bam", + output: + metrics="results/qc/picard/markdups/{sample}-{unit}_marked_dup_metrics.txt", + bam=( + temp("results/dedup/{sample}-{unit}.bam") if config['filtering']['bqsr'] + else "results/dedup/{sample}-{unit}.bam" + ) + + log: + "logs/{sample}-{unit}_mark_dup.log" + params: + rm = ('-REMOVE_DUPLICATES true ' + if config['processing']['remove-duplicates'] + else '') + run: + java_opts = set_java_opts(resources) + shell( + 'picard MarkDuplicates ' + '{java_opts} ' + '-INPUT {input.bam} ' + '-OUTPUT {output.bam} ' + '-ASSUME_SORT_ORDER coordinate ' + '{params.rm} ' + '-METRICS_FILE {output.metrics} ' + ' 2> {log} ' + ) + + +if config["filtering"]['bqsr']: + rule base_recalibrator: + threads: 4 + # threads: 1 # [ TEST SETTINGS -1 ] + resources: + mem_mb=gb(8), + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + disk_mb=gb(40), + runtime=autobump(960) + input: + bam=get_recal_input(bai=False), + bai=get_recal_input(bai=True), + ref=reference, + dict=dictionary, + known=known_sites, + known_idx=known_sites + '.tbi' + output: + recal_table="results/recal/{sample}-{unit}.grp" + log: + "logs/{sample}-{unit}_base_recalibrator.log" + run: + java_opts = set_java_opts(resources) + shell( + 'gatk --java-options {java_opts} BaseRecalibrator ' + '-R {input.ref} ' + '-I {input.bam} ' + '-O {output.recal_table} ' + '--known-sites {input.known} 2> {log}' + ) + + + rule apply_bqsr: + threads: 8 + # threads: 1 # [ TEST SETTINGS -1 ] + resources: + mem_mb=gb(32), + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + disk_mb=gb(40), + runtime=autobump(960) + input: + bam=get_recal_input(bai=False), + bai=get_recal_input(bai=True), + ref=reference, + dict=dictionary, + recal_table="results/recal/{sample}-{unit}.grp", + output: + bam=protected("results/recal/{sample}-{unit}.bam") + log: + "logs/{sample}-{unit}_apply_bsqr.log" + run: + java_opts = set_java_opts(resources) + shell( + 'gatk --java-options {java_opts} ApplyBQSR ' + '-R {input.ref} ' + '-I {input.bam} ' + '--bqsr-recal-file {input.recal_table} ' + '-O {output.bam} 2> {log}' + ) + + +rule build_bam_index: + resources: 
+        mem_mb=gb(2),
+        disk_mb=gb(2),
+        runtime=autobump(30)
+    input:
+        bam = "{prefix}.bam"
+    output:
+        "{prefix}.bam.bai"
+    run:
+        basename = os.path.basename(input.bam)
+        log = 'logs/' + os.path.splitext(basename)[0] + '_buildbamindex.log'
+        # samtools index takes the output index path as a positional argument
+        shell("samtools index {input.bam} {output} 2> {log}")
+
+
+if config["processing"]["restrict-regions"]:
+    rule compose_regions:
+        """
+        This rule will ONLY work if the chromosome nomenclature in the .bed file
+        matches the format in the reference genome; for Ensembl-style references,
+        chromosomes DO NOT have the 'chr' prefix.
+
+        .bed files are tab-delimited, with the chromosome name in the first column.
+        Some bed files also have header lines that can start with the word
+        'browser' or 'track' per UCSC.
+
+        To check this, we read the lines of the .bed file and compare their
+        nomenclature to what is in the fasta index. If we encounter a mismatch,
+        we raise a ValueError explaining that the bed file needs to be edited so
+        that the nomenclature matches the reference.
+
+        The awk command in the shell statement prints the entire lines of the
+        input bed into distinct files that are each named by the first column
+        (chromosome); in other words, we split the provided .bed file up by contig.
+        """
+        resources:
+            disk_mb=1024,
+            mem_mb=1024,
+            runtime=20
+        input:
+            bed = config["processing"]["restrict-regions"],
+        output:
+            "results/called/{contig}.regions.bed"
+        log:
+            "logs/{contig}_compose_regions.log"
+        run:
+            # Check for a nomenclature mismatch using the helper functions in helpers.smk
+            chr_fai = get_fai_nomenclature()
+            chr_bed = get_bed_nomenclature(input)
+            if chr_fai != chr_bed:
+                raise ValueError("Nomenclature mismatch detected. Please review the fasta index file and the .bed files being used. The chromosome format MUST match between the .bed file and the reference. Please edit the bed file.
For GRCh38 genomes, there should be no 'chr' prefix.") + shell(''' awk '$1 == "{wildcards.contig}" {{print $0 >> (t "/" $1 ".regions.bed" )}}' t=results/called {input} ''') + + +rule call_variants: + input: + bam=get_sample_bams, + ref=reference, + dict=dictionary, + known=known_sites, + tbi=( + known_sites + '.tbi' if known_sites else [] + ), + regions=( + "results/called/{contig}.regions.bed" + if config["processing"]["restrict-regions"] + else [] + ), + output: + gvcf=protected("results/called/{sample}.{contig}.g.vcf.gz"), + log: "logs/{sample}_{contig}_call_variants.log" + resources: + disk_mb=gb(16), + mem_mb=gb(40), + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + runtime=autobump(hours=8) + threads: 8 + # threads: 1 # [ TEST SETTINGS -1 ] + params: + extra=get_call_variants_params, + pcr=( + '--pcr-indel-model ' + config['processing']['pcr'] + if config['processing']['pcr'] + else '' + ) + run: + java_opts = set_java_opts(resources) + known = input.known + if known: + known = "--dbsnp " + str(known) + regions = params.extra + bams = input.bam + if isinstance(bams, str): + bams = [bams] + bams = list(map("-I {}".format, bams)) + shell( + 'gatk --java-options {java_opts} HaplotypeCaller {regions} ' + '-R {input.ref} ' + '{bams} ' + '-ERC GVCF ' + '{params.pcr} ' + '-O {output.gvcf} {known} 2> {log} ' + ) + + +rule combine_calls: + threads: 4 + # threads: 1 # [ TEST SETTINGS -1 ] + resources: + disk_mb=gb(10), + mem_mb=gb(4), + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + runtime=autobump(720) + input: + ref=reference, + gvcfs=expand("results/called/{sample}.{{contig}}.g.vcf.gz", sample=samples.index), + output: + gvcf="results/called/all.{contig}.g.vcf.gz", + log: "logs/{contig}_combine_calls.log" + run: + java_opts = set_java_opts(resources) + gvcfs=list(map("-V {}".format, input.gvcfs)) + shell( + 'gatk --java-options {java_opts} CombineGVCFs ' + '{gvcfs} ' + '-R {input.ref} ' + '-O {output.gvcf} 2> {log} ' + ) + + +rule genotype_variants: + threads: 4 + # threads: 1 # [ TEST SETTINGS -1 ] + resources: + disk_mb=gb(10), + mem_mb=gb(8), + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + runtime=autobump(480) + input: + ref=reference, + idx="results/called/all.{contig}.g.vcf.gz.tbi", + gvcf="results/called/all.{contig}.g.vcf.gz", + output: + vcf=temp("results/genotyped/all.{contig}.vcf.gz"), + log: + "logs/genotypegvcfs.{contig}.log", + run: + java_opts = set_java_opts(resources) + shell( + 'gatk --java-options {java_opts} GenotypeGVCFs ' + '-V {input.gvcf} ' + '-R {input.ref} ' + '-O {output.vcf} 2> {log}' + ) + + +rule merge_variants: + threads: 4 + # threads: 1 # [ TEST SETTINGS -1 ] + resources: + disk_mb=gb(10), + mem_mb=gb(8), + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + runtime=autobump(480) + input: + vcfs=lambda w: expand( + "results/genotyped/all.{contig}.vcf.gz", contig=get_contigs() + ), + output: + vcf="results/genotyped/all.vcf.gz", + log: + "logs/merge-genotyped.log", + run: + inputs = " ".join("-INPUT {}".format(f) for f in input.vcfs) + java_opts = set_java_opts(resources) + shell( + 'picard' + ' MergeVcfs' + ' {java_opts}' + ' {inputs}' + ' -OUTPUT {output}' + ' 2> {log}' + ) + + +rule tabix_variants: + threads: 2 + resources: + disk_mb=gb(2), + mem_mb=gb(2), + runtime=autobump(30) + input: + vcf="{prefix}.vcf.gz", + output: + "{prefix}.vcf.gz.tbi", + run: + basename = os.path.basename(input.vcf) + log = 'logs/' + os.path.splitext(basename)[0] + '_tabix.log' + shell("tabix -p vcf {input.vcf} 2> {log} ") + + +rule select_calls: + threads: 4 + # threads: 1 # [ TEST SETTINGS -1 ] + 
    resources:
+        disk_mb=gb(10),
+        mem_mb=gb(4),
+        # mem_mb=gb(4), # [ TEST SETTINGS -1 ]
+        runtime=autobump(480)
+    input:
+        ref=reference,
+        vcf="results/genotyped/all.vcf.gz",
+    output:
+        vcf=temp("results/filtered/all.{vartype}.vcf.gz"),
+    log:
+        "logs/selectvariants_{vartype}.log",
+    run:
+        java_opts = set_java_opts(resources)
+        # No trailing comma here: it would turn vartype_arg into a tuple and break the command
+        vartype_arg = "--select-type-to-include {}".format(
+            "SNP" if wildcards.vartype == "snvs" else "INDEL"
+        )
+        shell(
+            'gatk --java-options {java_opts} SelectVariants '
+            '-R {input.ref} '
+            '-V {input.vcf} '
+            '{vartype_arg} '
+            '-O {output.vcf} 2> {log}'
+        )
+
+
+rule hard_filter_calls:
+    threads: 4
+    # threads: 1 # [ TEST SETTINGS -1 ]
+    resources:
+        disk_mb=gb(10),
+        mem_mb=gb(4),
+        # mem_mb=gb(4), # [ TEST SETTINGS -1 ]
+        runtime=autobump(480)
+    input:
+        ref=reference,
+        vcf="results/filtered/all.{vartype}.vcf.gz",
+    output:
+        vcf=temp("results/filtered/all.{vartype}.hardfiltered.vcf.gz"),
+    log:
+        "logs/variantfiltration_{vartype}.log",
+    run:
+        java_opts = set_java_opts(resources)
+        # Name the FILTER tag after the variant type being filtered (snvs or indels)
+        filter_arg = {'{}_hard_filter'.format(wildcards.vartype): config['filtering']['hard'][wildcards.vartype]}
+        filters = [
+            "--filter-name {} --filter-expression '{}'".format(name, expr.replace("'", "\\'"))
+            for name, expr in filter_arg.items()
+        ]
+        shell(
+            'gatk --java-options {java_opts} VariantFiltration '
+            '-R {input.ref} '
+            '-V {input.vcf} '
+            '{filters} '
+            '-O {output.vcf} 2> {log}'
+        )
+
+
+rule merge_calls:
+    threads: 2
+    resources:
+        disk_mb=gb(10),
+        mem_mb=gb(8),
+        # mem_mb=gb(4), # [ TEST SETTINGS -1 ]
+        runtime=autobump(480)
+    input:
+        vcfs=expand(
+            "results/filtered/all.{vartype}.{filtertype}.vcf.gz",
+            vartype=["snvs", "indels"], filtertype='hardfiltered',
+        ),
+    output:
+        vcf=temp("results/filtered/all.final.vcf.gz"),
+    log:
+        "logs/merge-filtered.log",
+    run:
+        inputs = " ".join("-INPUT {}".format(f) for f in input.vcfs)
+        java_opts = set_java_opts(resources)
+        shell(
+            'picard'
+            ' MergeVcfs'
+            ' {java_opts}'
+            ' {inputs}'
+            ' -OUTPUT {output}'
+            ' 2> {log}'
+        )
+
+
+rule norm:
+    """
+    Split multiallelic variants into multiple biallelic ones.
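+    For example (illustrative, not real data): a record with REF=C and ALT=A,T
+    becomes two records, one with ALT=A and one with ALT=T, which downstream
+    annotation tools handle more reliably.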
+ """ + resources: + mem_mb=gb(16), + runtime=autobump(120) + input: + ref=reference, + vcf="results/filtered/all.final.vcf.gz" + output: + "results/filtered/all.normed.vcf.gz" + log: + "logs/norm-vcf.log" + shell: + "bcftools norm -f {input.ref} " + "-m- " + "{input.vcf} " + "--output-type z " + "--output {output} 2> {log}" + + +rule fastqc: + resources: + mem_mb=gb(12), + runtime=autobump(120), + threads: 8 + input: + unpack(get_fastq), + output: + html="results/qc/fastqc/data/{sample}-{unit}_fastqc.html", + zip="results/qc/fastqc/zip/{sample}-{unit}_fastqc.zip" + log: + "logs/{sample}-{unit}_fastqc.log" + run: + def base_file(file_path): + baseName = Path(path.basename(file_path)) + while baseName.suffix in {'.gz','.bz2','.txt','.fastq','.fq','.sam','.bam'}: + baseName = baseName.with_suffix('') + return str(baseName) + with TemporaryDirectory() as tempdir: + shell( + "fastqc " + "--threads {threads} " + "--noextract " + "--quiet " + "--outdir {tempdir:q} " + "{input:q} " + "&> {log} " + ) + output_base = base_file(input[0]) + html_path = path.join(tempdir, output_base + "_fastqc.html") + zip_path = path.join(tempdir, output_base + "_fastqc.zip") + if output.html != html_path: + shell("mv {html_path:q} {output.html:q}") + if output.zip != zip_path: + shell("mv {zip_path:q} {output.zip:q}") + + +rule samtools_stats: + """ + Run samtools stats + """ + resources: + mem_mb=gb(16), + runtime=autobump(120) + input: + get_sample_unit_bams + output: + "results/qc/samtools-stats/{sample}-{unit}.txt" + log: + "logs/samtools-stats_{sample}-{unit}.log" + shell: + "samtools stats {input} 1> {output} 2> {log} " + + +snpeff_input_for_multiqc = [] +if config['snpeff']['germline']: + snpeff_input_for_multiqc.append('results/qc/snpEff_summary.csv') +if config['snpeff']['somatic']: + soms = expand('results/qc/snpEff_{comp}_summary.csv', comp = config['mutect2'].keys()) + snpeff_input_for_multiqc.extend(soms) + + + +rule multiqc: + """ + Gather qc metrics and run MultiQC + Get the html output from somatic and germline VCF annotation if specified in the config. 
+rule multiqc:
+    """
+    Gather qc metrics and run MultiQC.
+    Get the html output from somatic and germline VCF annotation if specified in the config.
+    """
+    resources:
+        mem_mb=gb(4),
+        runtime=autobump(60)
+    input:
+        fastqc=expand("results/qc/fastqc/zip/{u.sample}-{u.unit}_fastqc.zip", u=units.itertuples()),
+        markdup=expand("results/qc/picard/markdups/{u.sample}-{u.unit}_marked_dup_metrics.txt", u=units.itertuples()),
+        samstats=expand("results/qc/samtools-stats/{u.sample}-{u.unit}.txt", u=units.itertuples()),
+        snpeff=snpeff_input_for_multiqc
+    output:
+        "results/qc/multiqc.html",
+    params:
+        dirname="results/qc/",
+        name="multiqc.html",
+    log:
+        "logs/multiqc.log",
+    run:
+        input_dirs = params.dirname
+        shell(
+            "multiqc "
+            "--force "
+            "-o {params.dirname} "
+            "-n {params.name} "
+            "{input_dirs} "
+            " &> {log} "
+        )
+
+
+rule snpeff:
+    """
+    Annotate variants with SnpEff
+    """
+    resources:
+        disk_mb=gb(20),
+        mem_mb=gb(16),
+        # mem_mb=gb(4), # [ TEST SETTINGS -1 ]
+        runtime=autobump(120)
+    input:
+        unpack(snpeff_input)
+    log: 'logs/snpeff.log'
+    output:
+        ann='results/annotated/ann.vcf.gz',
+        stats='results/qc/snpEff_summary.csv',
+        html='results/qc/snpEff_summary.html'
+    params:
+        annotations=config['snpeff']['annotations'],
+        gen=config['snpeff']['genome']
+    # threads: 2 # [ TEST SETTINGS ]
+    run:
+        java_opts = '''"-Xmx{}g"'''.format(int(resources.mem_mb * 0.75 / 1024))
+        shell(
+            "snpEff {java_opts} "
+            "-o vcf "
+            "-csvStats {output.stats} "
+            "-stats {output.html} "
+            "{params.gen} {input.vcf} "
+            "| bcftools view -Oz > {output.ann} 2> {log} "
+        )
+        dbnsfp_arg = ""
+        if dbnsfp:
+            dbnsfp_arg = "DbNsfp -db {}".format(dbnsfp)
+        if dbnsfp_arg:
+            sift_output = 'results/annotated/dbnsfp.ann.vcf.gz'
+            field_arg = (
+                "-f '{}'".format(params.annotations)
+                if params.annotations
+                else ''
+            )
+
+            shell(
+                "SnpSift {java_opts} "
+                "{dbnsfp_arg} "
+                "{field_arg} {output.ann} "
+                "| bcftools view -Oz > {sift_output} 2>> {log} "
+            )
+
+
+rule mutect2:
+    """
+    Use Mutect2 to call variants on individual samples, one per contig
+    """
+    resources:
+        disk_mb=gb(40),
+        mem_mb=gb(32),
+        # mem_mb=gb(4), # [ TEST SETTINGS -1 ]
+        runtime=autobump(720)
+    input:
+        unpack(input_for_somatic)
+    output:
+        vcf="results/mutect2_called/raw.{comp}.{contig}.vcf.gz",
+        stats='results/mutect2_called/raw.{comp}.{contig}.vcf.gz.stats',
+        orientation='results/lrom/{contig}_{comp}.tar.gz'
+    log:
+        "logs/{comp}_{contig}_mutect2_call_variants.log"
+    params:
+        pon=(
+            '--panel-of-normals ' + config['PON']
+            if config['PON']
+            else ''
+        ),
+        extra=get_call_variants_params,
+        pcr=(
+            '--pcr-indel-model ' + config['processing']['pcr']
+            if config['processing']['pcr']
+            else ''
+        )
+    # threads: 1 # [ TEST SETTINGS ]
+    run:
+        java_opts = set_java_opts(resources)
+        normals = " ".join("-I {} ".format(n) for n in input.normals)
+        tumors = " ".join("-I {} ".format(t) for t in input.tumors)
+        names = names_for_somatic(wildcards)
+        formatted_names = " ".join('-normal {} '.format(name) for name in names)
+        shell(
+            "gatk --java-options {java_opts} Mutect2 "
+            "-R {input.ref} "
+            "{normals} "
+            "{tumors} "
+            "{params.extra} "
+            "{formatted_names} "
+            "{params.pcr} "
+            "--f1r2-tar-gz {output.orientation} "
+            "{params.pon} "
+            "-O {output.vcf} 2> {log}"
+        )
+
+
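To make the string-building in the mutect2 run block above concrete, here is a small sketch. input_for_somatic and names_for_somatic are defined elsewhere in the workflow, so the BAM paths and sample name below are hypothetical stand-ins, not the workflow's actual file layout:

    # Hypothetical stand-ins for input.normals, input.tumors, and names_for_somatic(wildcards)
    normal_bams = ["results/recal/normal-1.bam"]
    tumor_bams = ["results/recal/tumor-1.bam"]
    names = ["normal"]

    normals = " ".join("-I {} ".format(n) for n in normal_bams)
    tumors = " ".join("-I {} ".format(t) for t in tumor_bams)
    formatted_names = " ".join("-normal {} ".format(name) for name in names)

    # Fragments interpolated into the Mutect2 command line:
    #   -I results/recal/normal-1.bam  -I results/recal/tumor-1.bam  -normal normal
    print(normals, tumors, formatted_names)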
+rule lrom:
+    """
+    Run LearnReadOrientationModel to get the maximum likelihood estimates of artifact prior probabilities
+    in the orientation bias mixture model filter
+    """
+    resources:
+        disk_mb=gb(20),
+        mem_mb=gb(32),
+        # mem_mb=gb(4), # [ TEST SETTINGS -1 ]
+        runtime=autobump(120)
+    input:
+        orientation=lambda w: expand('results/lrom/{contig}_{{comp}}.tar.gz', contig=get_contigs())
+    output:
+        lrom='results/lrom/artifact-prior-{comp}.tar.gz'
+    log:
+        'logs/lrom_{comp}.log'
+    # threads: 1 # [ TEST SETTINGS ]
+    run:
+        java_opts = set_java_opts(resources)
+        lrom_names = ' '.join('-I {}'.format(i) for i in input.orientation)
+
+        shell(
+            'gatk --java-options {java_opts} LearnReadOrientationModel {lrom_names} '
+            '-O {output.lrom} &> {log}'
+        )
+
+
+rule merge_mutect2_variants:
+    """
+    After individual contigs are called via mutect2, we merge them together here.
+    """
+    resources:
+        disk_mb=gb(20),
+        mem_mb=gb(32),
+        # mem_mb=gb(4), # [ TEST SETTINGS -1 ]
+        runtime=autobump(120)
+    input:
+        vcfs=lambda w: expand(
+            "results/mutect2_called/raw.{{comp}}.{contig}.vcf.gz", contig=get_contigs()
+        ),
+    output:
+        temp("results/somatic/merged.{comp}.vcf.gz")
+    log:
+        "logs/merge_mutect2.{comp}.log",
+    # threads: 1 # [ TEST SETTINGS ]
+    run:
+        inputs = " ".join("-INPUT {}".format(f) for f in input.vcfs)
+        java_opts = set_java_opts(resources)
+        shell(
+            'picard'
+            ' MergeVcfs'
+            ' {java_opts}'
+            ' {inputs}'
+            ' -OUTPUT {output}'
+            ' &> {log}'
+        )
+
+
+rule merge_mutect2_stats:
+    """
+    Just like merging VCFs for Mutect2, we also need to merge stats for filtering.
+    """
+    resources:
+        disk_mb=gb(20),
+        mem_mb=gb(16),
+        # mem_mb=gb(4), # [ TEST SETTINGS -1 ]
+        runtime=autobump(120)
+    input:
+        stats=lambda w: expand(
+            "results/mutect2_called/raw.{{comp}}.{contig}.vcf.gz.stats", contig=get_contigs()
+        ),
+    output:
+        temp("results/somatic/merged.{comp}.vcf.gz.stats")
+    log:
+        "logs/merge_mutect2_stats.{comp}.log"
+    # threads: 1 # [ TEST SETTINGS ]
+    run:
+        java_opts = set_java_opts(resources)
+        inputs = " ".join(" -stats {} ".format(f) for f in input.stats)
+        shell(
+            "gatk --java-options {java_opts} MergeMutectStats "
+            "{inputs} "
+            "-O {output} "
+            "&> {log}"
+        )
+
+
+rule filter_mutect2_calls:
+    """
+    New versions of Mutect2 have optimized defaults for filtering; we can just use those.
+    """
+    resources:
+        disk_mb=gb(20),
+        mem_mb=gb(16),
+        # mem_mb=gb(4), # [ TEST SETTINGS -1 ]
+        runtime=autobump(120)
+    input:
+        ref=reference,
+        unfiltered="results/somatic/merged.{comp}.vcf.gz",
+        stats="results/somatic/merged.{comp}.vcf.gz.stats",
+        lrom='results/lrom/artifact-prior-{comp}.tar.gz'
+    output:
+        "results/somatic_filtered/filtered.{comp}.vcf.gz"
+    log:
+        "logs/{comp}.vcf.gz.log"
+    # threads: 1 # [ TEST SETTINGS ]
+    run:
+        java_opts = set_java_opts(resources)
+        shell(
+            "gatk --java-options {java_opts} FilterMutectCalls "
+            "-stats {input.stats} "
+            "--orientation-bias-artifact-priors {input.lrom} "
+            "-R {input.ref} "
+            "-V {input.unfiltered} "
+            "-O {output} 2> {log}"
+        )
+
+
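The merge rules above rely on snakemake's expand() with a doubled {{comp}}: the comparison stays a wildcard while the contig list is filled in, so each comparison is scattered per contig and then gathered. A minimal sketch of that behavior; the contig names are made up here and would normally come from get_contigs():

    from snakemake.io import expand

    vcfs = expand(
        "results/mutect2_called/raw.{{comp}}.{contig}.vcf.gz",
        contig=["6", "20"],  # stand-in for get_contigs()
    )
    print(vcfs)
    # ['results/mutect2_called/raw.{comp}.6.vcf.gz',
    #  'results/mutect2_called/raw.{comp}.20.vcf.gz']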
+rule mutect2_norm:
+    """
+    Split multiallelic variants into multiple biallelic ones.
+    """
+    resources:
+        mem_mb=gb(16),
+        runtime=autobump(120)
+    input:
+        ref=reference,
+        vcf="results/somatic_filtered/filtered.{comp}.vcf.gz"
+    output:
+        "results/somatic_filtered/normed.{comp}.vcf.gz"
+    log:
+        "logs/norm-{comp}-vcf.log"
+    shell:
+        "bcftools norm -f {input.ref} "
+        "-m- "
+        "{input.vcf} "
+        "--output-type z "
+        "--output {output} 2> {log}"
+
+
+rule snpeff_cancer:
+    """
+    Annotate somatic variants with SnpEff Cancer
+    """
+    resources:
+        disk_mb=gb(20),
+        mem_mb=gb(16),
+        # mem_mb=gb(4), # [ TEST SETTINGS -1 ]
+        runtime=autobump(120)
+    input:
+        unpack(snpeff_cancer_input),
+    output:
+        vcf='results/mutect2_annotated/snpeff.{comp}.vcf.gz',
+        stats='results/qc/snpEff_{comp}_summary.csv',
+        html='results/qc/snpEff_{comp}.html'
+    log:
+        'logs/cancer_snpeff_{comp}.log'
+    params:
+        snpeff_genome=config['snpeff']['genome']
+    # threads: 2 # [ TEST SETTINGS ]
+    run:
+        java_opts = '''"-Xmx{}g"'''.format(int(resources.mem_mb * 0.75 / 1024))
+        shell(
+            'snpEff {java_opts} '
+            '-v -o vcf -cancer '
+            '-csvStats {output.stats} '
+            '-stats {output.html} '
+            '{params.snpeff_genome} {input.vcf} '
+            '| bcftools view -Oz > {output.vcf} 2> {log}'
+        )
+
+# vim: ft=python
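Two optional keys in the config file below, the top-level PON and processing.pcr, feed the mutect2 rule's params shown earlier. A sketch of how blank versus filled-in values turn into command-line flags; the dict literal stands in for the parsed YAML, with values chosen only to exercise both branches:

    # Stand-in for the parsed config; only the relevant keys are shown.
    config = {"PON": None, "processing": {"pcr": "NONE"}}

    pon = '--panel-of-normals ' + config['PON'] if config['PON'] else ''
    pcr = '--pcr-indel-model ' + config['processing']['pcr'] if config['processing']['pcr'] else ''

    print(repr(pon), repr(pcr))
    # ''  '--pcr-indel-model NONE'   (a blank PON emits no flag at all)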
diff --git a/workflows/variant-calling/config/config.yaml b/workflows/variant-calling/config/config.yaml
new file mode 100644
index 00000000..3b2ce530
--- /dev/null
+++ b/workflows/variant-calling/config/config.yaml
@@ -0,0 +1,79 @@
+samples: config/samples.tsv
+units: config/units.tsv
+ref:
+  # Set to true only if you want your references to come from the lcdb-wf references workflow
+  use_references_workflow: false
+  # Match these to the reference config that is included at the bottom of this file.
+  # Only configure this section if use_references_workflow is set to true.
+  organism: 'human'
+  genome:
+    tag: 'ensembl-104'
+    build: 'GRCh38'
+  aligner:
+    index: 'bwa'
+    tag: 'ensembl-104'
+  faidx:
+    index: 'faidx'
+    tag: 'ensembl-104'
+  variation:
+    # Fill these keys in with the name of the variation database that matches the value in the reference config.
+    # Alternatively, you can provide an ABSOLUTE local path to these files (paths MUST start with '/').
+    # If so, edit the lcdb-wf references config so that these jobs are not run unnecessarily.
+    known: 'known'
+    dbnsfp: 'dbNSFPv4.4'
+  # If you are providing your own references, include their paths here.
+  paths:
+    # When using BWA, do not use a top-level genome assembly for human; see http://lh3.github.io/2017/11/13/which-human-reference-genome-to-use
+    ref: 'references/GRCh38.6.20.fa.gz'
+    known: 'references/known_variation_noiupac.vcf.gz'
+    index:
+    dbnsfp: 'references/dbnsfp_6_20.vcf.gz'
+processing:
+  remove-duplicates: true
+  remove-mitochondrial: true
+  # See https://gatk.broadinstitute.org/hc/en-us/articles/360036465912-HaplotypeCaller#--pcr-indel-model for pcr.
+  # If you know there was no PCR used to generate your sequencing data, set this to NONE.
+  pcr:
+  # Point to a bed file, e.g. captured regions
+  restrict-regions:
+    #'references/exons_subset.bed'
+  # If regions are restricted, optionally enlarge them by a given value
+  region-padding:
+filtering:
+  # Set to true in order to apply machine learning based recalibration of
+  # quality scores instead of hard filtering.
+  bqsr: true
+  hard:
+    # Hard filtering as outlined in the GATK docs
+    # (https://gatkforums.broadinstitute.org/gatk/discussion/2806/howto-apply-hard-filters-to-a-call-set)
+    snvs:
+      "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0"
+    indels:
+      "QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0"
+snpeff:
+  # The MultiQC rule needs these set in order to collect summary files from the snpeff and snpeff_cancer rules.
+  # Set each to true if you plan on generating annotations for somatic or germline data, respectively.
+  somatic: true
+  germline: true
+  # Run 'snpEff databases' to list available databases (https://pcingola.github.io/SnpEff/se_commandline/)
+  # See https://pcingola.github.io/SnpEff/se_build_db/ for docs on building your own database
+  genome: 'GRCh38.p14'
+  # Comma-separated string of dbNSFP fields for SnpSift to attach as annotations.
+  # These annotations should be column names in the dbNSFP file.
+  # Leave this blank if you are not using dbNSFP.
+  annotations: 'FATHMM_pred,SIFT_pred'
+# Supply a panel of normals file if you have one for your genome. It is OK to leave this blank.
+# See https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON- for details on the PON file.
+PON:
+mutect2:
+  # See the docs on how to configure this section
+  tumor-normal:
+    tumor:
+      - 'tumor'
+    normal:
+      - 'normal'
+
+
+
+include_references:
+  - '../../include/reference_configs/variant-calling.yaml'
diff --git a/workflows/variant-calling/config/samples.tsv b/workflows/variant-calling/config/samples.tsv
new file mode 100644
index 00000000..21c6b82e
--- /dev/null
+++ b/workflows/variant-calling/config/samples.tsv
@@ -0,0 +1,3 @@
+sample
+tumor
+normal
diff --git a/workflows/variant-calling/config/units.tsv b/workflows/variant-calling/config/units.tsv
new file mode 100644
index 00000000..d336d4be
--- /dev/null
+++ b/workflows/variant-calling/config/units.tsv
@@ -0,0 +1,3 @@
+sample unit platform fq1 fq2
+tumor 1 Illumina data/example_data/tumor_R1.fq.gz data/example_data/tumor_R2.fq.gz
+normal 1 Illumina data/example_data/normal_R1.fq.gz data/example_data/normal_R2.fq.gz
diff --git a/workflows/variant-calling/run_test.sh b/workflows/variant-calling/run_test.sh
new file mode 100755
index 00000000..7aacb413
--- /dev/null
+++ b/workflows/variant-calling/run_test.sh
@@ -0,0 +1,3 @@
+set -e
+python -m doctest ../../ci/preprocessor.py
+python ../../ci/preprocessor.py Snakefile > Snakefile.test && snakemake -s Snakefile.test "$@"
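run_test.sh pipes the Snakefile through ci/preprocessor.py before calling snakemake; that script is what interprets the paired lines such as mem_mb=gb(32), followed by # mem_mb=gb(4), # [ TEST SETTINGS -1 ] scattered through the Snakefile above. The real implementation lives in ci/preprocessor.py and is not part of this diff; the sketch below only illustrates the kind of substitution those markers appear to imply (uncomment the test value and overwrite the line it points at) and should not be read as the actual preprocessor:

    import re
    import sys

    def preprocess(lines):
        """Hypothetical illustration: a '# [ TEST SETTINGS -1 ]' comment replaces
        the line directly above it with its own uncommented content."""
        out = []
        for line in lines:
            m = re.match(r"(\s*)# (.*?)\s*# \[ TEST SETTINGS -1 \]\s*$", line)
            if m and out:
                out[-1] = m.group(1) + m.group(2) + "\n"  # swap in the test setting
            else:
                out.append(line)
        return out

    if __name__ == "__main__":
        sys.stdout.writelines(preprocess(open(sys.argv[1]).readlines()))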