From 4d1222c72a25a6d84f8f27543742d4fddbf51e3b Mon Sep 17 00:00:00 2001 From: fridellsa Date: Thu, 1 Jun 2023 18:15:22 -0400 Subject: [PATCH 01/32] Initial variant calling workflow --- env.yml | 204 ++-- .../reference_configs/variant-calling.yaml | 30 + include/requirements.txt | 11 +- lib/aligners.py | 25 + lib/common.py | 34 +- lib/helpers.smk | 335 +++++++ workflows/references/Snakefile | 185 +++- workflows/references/config/config.yaml | 6 +- workflows/variant-calling/Snakefile | 911 ++++++++++++++++++ workflows/variant-calling/config/config.yaml | 74 ++ workflows/variant-calling/config/samples.tsv | 3 + workflows/variant-calling/config/units.tsv | 4 + 12 files changed, 1707 insertions(+), 115 deletions(-) create mode 100644 include/reference_configs/variant-calling.yaml create mode 100644 lib/helpers.smk create mode 100644 workflows/variant-calling/Snakefile create mode 100644 workflows/variant-calling/config/config.yaml create mode 100644 workflows/variant-calling/config/samples.tsv create mode 100644 workflows/variant-calling/config/units.tsv diff --git a/env.yml b/env.yml index 10696a3a6..ee82a16ed 100644 --- a/env.yml +++ b/env.yml @@ -1,4 +1,4 @@ -name: null +name: /gpfs/gsfs10/users/NICHD-core0/test/fridellsa/v1.10-lcdbwf-vc/lcdb-wf/env channels: - conda-forge - bioconda @@ -6,7 +6,7 @@ dependencies: - _libgcc_mutex=0.1 - _openmp_mutex=4.5 - _r-mutex=1.0.1 - - alsa-lib=1.2.3.2 + - alsa-lib=1.2.8 - amply=0.1.5 - appdirs=1.4.4 - argcomplete=3.0.8 @@ -17,20 +17,20 @@ dependencies: - backcall=0.2.0 - backports=1.0 - backports.functools_lru_cache=1.6.4 + - bcftools=1.17 - bedtools=2.31.0 - - binutils_impl_linux-64=2.39 - - binutils_linux-64=2.39 + - binutils_impl_linux-64=2.40 - biopython=1.81 - - boost-cpp=1.74.0 + - boost-cpp=1.78.0 - bowtie=1.3.1 - bowtie2=2.5.1 - brotli=1.0.9 - brotli-bin=1.0.9 - - brotlipy=0.7.0 + - bwa=0.7.17 - bwidget=1.9.14 - bx-python=0.9.0 - bzip2=1.0.8 - - c-ares=1.18.1 + - c-ares=1.19.1 - ca-certificates=2023.5.7 - cairo=1.16.0 - certifi=2023.5.7 @@ -49,8 +49,7 @@ dependencies: - configargparse=1.5.3 - connection_pool=0.0.3 - contourpy=1.0.7 - - cryptography=39.0.0 - - curl=7.86.0 + - curl=7.87.0 - cutadapt=4.4 - cycler=0.11.0 - datrie=0.8.2 @@ -59,8 +58,8 @@ dependencies: - deeptools=3.5.2 - deeptoolsintervals=0.1.9 - dnaio=0.10.0 - - docutils=0.20 - - dpath=2.1.5 + - docutils=0.20.1 + - dpath=2.1.6 - exceptiongroup=1.1.1 - execnet=1.9.0 - executing=1.2.0 @@ -81,30 +80,30 @@ dependencies: - fribidi=1.0.10 - future=0.18.3 - gat=1.3.6 - - gcc_impl_linux-64=10.4.0 - - gcc_linux-64=10.4.0 + - gatk4=4.4.0.0 + - gcc_impl_linux-64=12.2.0 - gettext=0.21.1 - gffread=0.12.7 - gffutils=0.11.1 - - gfortran_impl_linux-64=10.4.0 - - gfortran_linux-64=10.4.0 + - gfortran_impl_linux-64=12.2.0 - giflib=5.2.1 - gitdb=4.0.10 - gitpython=3.1.31 - - glib=2.74.1 - - glib-tools=2.74.1 + - glib=2.76.3 + - glib-tools=2.76.3 + - gmp=6.2.1 - graphite2=1.3.13 - gsl=2.7 - - gst-plugins-base=1.18.5 - - gstreamer=1.20.3 - - gxx_impl_linux-64=10.4.0 - - gxx_linux-64=10.4.0 - - harfbuzz=4.2.0 + - gst-plugins-base=1.21.3 + - gstreamer=1.21.3 + - gstreamer-orc=0.4.33 + - gxx_impl_linux-64=12.2.0 + - harfbuzz=6.0.0 - hdf5=1.12.1 - hisat2=2.2.1 - - htslib=1.16 + - htslib=1.17 - humanfriendly=10.0 - - icu=69.1 + - icu=70.1 - idna=3.4 - importlib-metadata=6.6.0 - importlib_resources=5.12.0 @@ -112,7 +111,7 @@ dependencies: - intervalstats=1.01 - ipython=8.13.2 - isa-l=2.30.0 - - jack=1.9.18 + - jack=1.9.22 - jedi=0.18.2 - jinja2=3.1.2 - jpeg=9e @@ -122,19 +121,21 @@ dependencies: - 
kernel-headers_linux-64=2.6.32 - keyutils=1.6.1 - kiwisolver=1.4.4 - - krb5=1.19.3 + - krb5=1.20.1 + - lame=3.100 - lcms2=2.14 - - ld_impl_linux-64=2.39 + - ld_impl_linux-64=2.40 - lerc=4.0.0 - libblas=3.9.0 - libbrotlicommon=1.0.9 - libbrotlidec=1.0.9 - libbrotlienc=1.0.9 - - libcap=2.64 + - libcap=2.66 - libcblas=3.9.0 - - libclang=13.0.1 + - libclang=15.0.7 + - libclang13=15.0.7 - libcups=2.3.3 - - libcurl=7.86.0 + - libcurl=7.87.0 - libdb=6.2.32 - libdeflate=1.13 - libedit=3.1.20191231 @@ -142,33 +143,36 @@ dependencies: - libevent=2.1.10 - libexpat=2.5.0 - libffi=3.4.2 - - libflac=1.3.4 - - libgcc-devel_linux-64=10.4.0 + - libflac=1.4.2 + - libgcc-devel_linux-64=12.2.0 - libgcc-ng=12.2.0 + - libgcrypt=1.10.1 - libgd=2.3.3 - libgfortran-ng=12.2.0 - libgfortran5=12.2.0 - - libglib=2.74.1 + - libglib=2.76.3 - libgomp=12.2.0 - - libhwloc=2.8.0 + - libgpg-error=1.46 + - libhwloc=2.9.1 - libiconv=1.17 - libjemalloc=5.3.0 - liblapack=3.9.0 - liblapacke=3.9.0 - - libllvm13=13.0.1 + - libllvm15=15.0.7 - libnghttp2=1.51.0 - libnsl=2.0.0 - libogg=1.3.4 - libopenblas=0.3.21 - libopus=1.3.1 - libpng=1.6.39 - - libpq=14.5 - - libsanitizer=10.4.0 - - libsndfile=1.0.31 - - libsqlite=3.41.2 + - libpq=15.1 + - libsanitizer=12.2.0 + - libsndfile=1.2.0 + - libsqlite=3.42.0 - libssh2=1.10.0 - - libstdcxx-devel_linux-64=10.4.0 + - libstdcxx-devel_linux-64=12.2.0 - libstdcxx-ng=12.2.0 + - libsystemd0=252 - libtiff=4.4.0 - libtool=2.4.7 - libudev1=253 @@ -177,9 +181,10 @@ dependencies: - libwebp=1.2.4 - libwebp-base=1.2.4 - libxcb=1.13 - - libxkbcommon=1.0.3 - - libxml2=2.9.14 + - libxkbcommon=1.5.0 + - libxml2=2.10.3 - libzlib=1.2.13 + - lz4-c=1.9.4 - lzo=2.10 - lzstring=1.0.4 - make=4.3 @@ -190,65 +195,33 @@ dependencies: - matplotlib-base=3.7.1 - matplotlib-inline=0.1.6 - mdurl=0.1.0 + - mpg123=1.31.3 - multiqc=1.14 - munkres=1.1.4 - mysql-common=8.0.32 - mysql-connector-c=6.1.11 - mysql-libs=8.0.32 - nbformat=5.8.0 - - ncbi-vdb=3.0.2 - ncurses=6.3 - networkx=3.1 - nspr=4.35 - nss=3.89 - numpy=1.23.5 - - openjdk=11.0.1 + - openjdk=17.0.3 - openjpeg=2.5.0 - openssl=1.1.1t - - ossuuid=1.6.2 - packaging=23.1 - pandas=2.0.1 - pandoc=3.1.2 - - pango=1.50.7 + - pango=1.50.14 - parso=0.8.3 - patsy=0.5.3 - pbzip2=1.1.13 - - pcre2=10.37 + - pcre2=10.40 - perl=5.32.1 - - perl-alien-build=2.48 - - perl-alien-libxml2=0.17 - - perl-business-isbn=3.007 - - perl-business-isbn-data=20210112.006 - - perl-capture-tiny=0.48 - - perl-carp=1.50 - - perl-constant=1.33 - - perl-data-dumper=2.183 - - perl-encode=3.19 - - perl-exporter=5.74 - - perl-extutils-makemaker=7.70 - - perl-ffi-checklib=0.28 - - perl-file-chdir=0.1011 - - perl-file-path=2.18 - - perl-file-temp=0.2304 - - perl-file-which=1.24 - perl-gd=2.76 - perl-gdgraph=1.54 - perl-gdtextutil=0.86 - - perl-importer=0.026 - - perl-mime-base64=3.16 - - perl-parent=0.241 - - perl-path-tiny=0.124 - - perl-pathtools=3.75 - - perl-scope-guard=0.21 - - perl-storable=3.15 - - perl-sub-info=0.002 - - perl-term-table=0.016 - - perl-test2-suite=0.000145 - - perl-uri=5.12 - - perl-xml-libxml=2.0207 - - perl-xml-namespacesupport=1.12 - - perl-xml-sax=1.02 - - perl-xml-sax-base=1.09 - pexpect=4.8.0 - picard=2.27.5 - pickleshare=0.7.5 @@ -261,6 +234,7 @@ dependencies: - platformdirs=3.5.1 - plotly=5.14.1 - pluggy=1.0.0 + - ply=3.11 - pooch=1.7.0 - preseq=3.2.0 - prompt-toolkit=3.0.38 @@ -269,7 +243,7 @@ dependencies: - pthread-stubs=0.4 - ptyprocess=0.7.0 - pulp=2.7.0 - - pulseaudio=14.0 + - pulseaudio=16.1 - pure_eval=0.2.2 - py2bit=0.3.0 - pybedtools=0.9.0 @@ -277,57 +251,59 @@ 
dependencies: - pycparser=2.21 - pyfaidx=0.7.2.1 - pygments=2.15.1 - - pyopenssl=23.1.1 - pyparsing=3.0.9 - - pyqt=5.15.4 - - pyqt5-sip=12.9.0 + - pyqt=5.15.7 + - pyqt5-sip=12.11.0 - pyrsistent=0.19.3 - - pysam=0.20.0 + - pysam=0.21.0 - pysocks=1.7.1 - pytest=7.3.1 - - pytest-xdist=3.2.1 + - pytest-xdist=3.3.1 - python=3.10.8 - python-dateutil=2.8.2 - - python-fastjsonschema=2.16.3 + - python-fastjsonschema=2.17.1 - python-isal=1.1.0 - - python-lzo=1.14 + - python-lzo=1.15 - python-tzdata=2023.3 - python_abi=3.10 - pytz=2023.3 - pyvcf3=1.0.3 - pyyaml=6.0 - - qt-main=5.15.2 - - r-base=4.1.3 + - qt-main=5.15.6 + - r-base=4.2.2 - readline=8.2 - - requests=2.29.0 + - requests=2.31.0 - reretry=0.11.8 - rich=13.3.5 - rich-click=1.6.1 - rseqc=5.0.1 + - rust-bio-tools=0.42.0 - salmon=1.10.1 - - samtools=1.16.1 + - samtools=1.17 - scipy=1.10.1 - seaborn=0.12.2 - seaborn-base=0.12.2 - sed=4.8 - setuptools=67.7.2 - simplejson=3.19.1 - - sip=6.5.1 + - sip=6.7.9 - six=1.16.0 - smart_open=6.3.0 - smmap=3.0.5 - - snakemake-minimal=7.25.3 + - snakemake-minimal=7.26.0 + - snpeff=5.1 + - snpsift=5.1 - spectra=0.0.11 - - sqlite=3.41.2 - - sra-tools=3.0.3 + - sra-tools=2.9.6 - stack_data=0.6.2 - star=2.7.10b + - starcode=1.4 - statsmodels=0.14.0 - stopit=1.1.2 - subread=2.0.3 - sysroot_linux-64=2.12 - tabulate=0.9.0 - - tbb=2021.7.0 + - tbb=2021.9.0 - tenacity=8.2.2 - throttler=1.2.1 - tk=8.6.12 @@ -335,38 +311,50 @@ dependencies: - toml=0.10.2 - tomli=2.0.1 - toposort=1.10 - - tornado=6.3 + - tornado=6.3.2 - trackhub=0.2.4 - traitlets=5.9.0 - - typing-extensions=4.5.0 - - typing_extensions=4.5.0 + - typing-extensions=4.6.1 + - typing_extensions=4.6.1 - tzdata=2023c - - ucsc-bedgraphtobigwig=377 + - ucsc-bedgraphtobigwig=445 - ucsc-bedsort=377 - - ucsc-bedtobigbed=377 + - ucsc-bedtobigbed=447 - ucsc-bigwigmerge=377 - ucsc-fetchchromsizes=377 - - ucsc-genepredtobed=377 - - ucsc-gtftogenepred=377 - - ucsc-liftover=377 + - ucsc-genepredtobed=447 + - ucsc-gtftogenepred=447 + - ucsc-liftover=447 - ucsc-oligomatch=377 - - ucsc-twobittofa=377 - - ucsc-wigtobigwig=377 + - ucsc-twobittofa=447 + - ucsc-wigtobigwig=447 - unicodedata2=15.0.0 - - urllib3=1.26.15 + - urllib3=2.0.2 - wcwidth=0.2.6 - wheel=0.40.0 - wrapt=1.15.0 + - xcb-util=0.4.0 + - xcb-util-image=0.4.0 + - xcb-util-keysyms=0.4.0 + - xcb-util-renderutil=0.3.9 + - xcb-util-wm=0.4.1 + - xkeyboard-config=2.38 - xopen=1.7.0 + - xorg-fixesproto=5.0 + - xorg-inputproto=2.3.2 - xorg-kbproto=1.0.7 - xorg-libice=1.0.10 - xorg-libsm=1.2.3 - xorg-libx11=1.8.4 - - xorg-libxau=1.0.9 + - xorg-libxau=1.0.11 - xorg-libxdmcp=1.1.3 - xorg-libxext=1.3.4 + - xorg-libxfixes=5.0.3 + - xorg-libxi=1.7.10 - xorg-libxrender=0.9.10 - xorg-libxt=1.2.1 + - xorg-libxtst=1.2.3 + - xorg-recordproto=1.14.2 - xorg-renderproto=0.11.1 - xorg-xextproto=7.3.0 - xorg-xproto=7.0.31 @@ -377,4 +365,4 @@ dependencies: - zlib=1.2.13 - zstandard=0.19.0 - zstd=1.5.2 -prefix: /gpfs/gsfs10/users/NICHD-core0/test/dalerr/lcdb-wf/env +prefix: /gpfs/gsfs10/users/NICHD-core0/test/fridellsa/v1.10-lcdbwf-vc/lcdb-wf/env diff --git a/include/reference_configs/variant-calling.yaml b/include/reference_configs/variant-calling.yaml new file mode 100644 index 000000000..6ad0f9b48 --- /dev/null +++ b/include/reference_configs/variant-calling.yaml @@ -0,0 +1,30 @@ +references: + human: + ensembl-104: + metadata: + build: 'GRCh38' + release: 104 + species: 'homo_sapiens' + genome: + url: 'ftp://ftp.ensembl.org/pub/release-104/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz' + # URL format is 
'ftp://ftp.ensembl.org/pub/{branch}release-{release}/fasta/{species}/{datatype}/{species_capitalized}.{build}.{datatype}.{assembly}.{suffix}'
+      # When using GRCh37, branch changes to "grch37/release-{release}"
+      # Always use primary_assembly for human; NEVER use top_level for the human assembly
+      indexes:
+        - 'bwa'
+        - 'faidx'
+      known:
+        # You can download structural_variations, somatic, or "all", which corresponds to germline known variation for all chromosomes
+        type: 'all'
+      # Comment the variation key out if not requiring dbnsfp
+      #variation:
+        # Download of variation databases will be handled by a unique rule in the Snakefile
+        # ONLY include keys like 'dbnsfp' IF you intend to download them; comment out these keys if not.
+        #dbnsfp:
+          # The version of the database should be formatted like this
+          #version: 'dbNSFPv4.4'
+          # The url is found by copying the link address of the latest version found here: https://sites.google.com/site/jpopgen/dbNSFP
+          #url: 'https://usf.box.com/shared/static/bvfzmkpgtphvbmmrvb2iyl2jl21o49kc'
+          # Match the build to the metadata block above
+          #build: 'GRCh38'
+
diff --git a/include/requirements.txt b/include/requirements.txt
index 5f1b4a8e3..188dbe289 100644
--- a/include/requirements.txt
+++ b/include/requirements.txt
@@ -1,13 +1,17 @@
+bcftools>=1.15.1
bedtools
biopython
bowtie
bowtie2
-cutadapt
+bwa
+curl
+cutadapt>=3.0
deeptools
fastq-screen
fastqc
font-ttf-dejavu-sans-mono
gat
+gatk4
gffread
gffutils
hisat2
@@ -27,14 +31,17 @@
pyfaidx
pysam
pytest
pytest-xdist
+python>=3.10
rseqc
+rust-bio-tools
# earlier versions of salmon can segfault on Slurm
salmon>=1.10.1
-
samtools
seaborn
snakemake-minimal
+snpeff
+snpsift
sra-tools
star
subread
diff --git a/lib/aligners.py b/lib/aligners.py
index 62fe58a57..629a3eb17 100644
--- a/lib/aligners.py
+++ b/lib/aligners.py
@@ -83,3 +83,28 @@ def fastq_arg_from_input(fastqs):
        fastqs = '-1 {0} -2 {1} '.format(*fastqs)
    return fastqs

+def bwa_index_from_prefix(prefix):
+    """
+    Given a prefix, return a list of the corresponding bwa index files.
+    """
+    ext_list = ["amb", "ann", "bwt", "pac", "sa"]
+    return ['{prefix}.{ext}'.format(prefix=prefix, ext=ext) for ext in ext_list]
+
+def bwa_prefix_from_index(index_files):
+    """
+    Given a list of index files (or a single filename), return the corresponding prefix.
+    """
+    if isinstance(index_files, str):
+        return '.'.join(index_files.split('.')[:-1])
+    prefixes = list(
+        set(
+            '.'.join(f.split('.')[:-1]) for f in index_files
+        )
+    )
+    if len(prefixes) != 1:
+        raise ValueError(
+            "More than one prefix detected from '{0}'".format(prefixes)
+        )
+    return prefixes[0]
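
A quick round-trip check of the two helpers added above; the prefix here is
hypothetical, and both functions only manipulate strings, so any path works:

    from lib.aligners import bwa_index_from_prefix, bwa_prefix_from_index

    prefix = "references/GRCh38_ensembl-104"  # hypothetical prefix
    index_files = bwa_index_from_prefix(prefix)
    # ['references/GRCh38_ensembl-104.amb', ..., 'references/GRCh38_ensembl-104.sa']
    assert bwa_prefix_from_index(index_files) == prefix
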
diff --git a/lib/common.py b/lib/common.py
index 829cc1298..653dc2f6c 100644
--- a/lib/common.py
+++ b/lib/common.py
@@ -419,6 +419,9 @@ def references_dict(config):
        'bowtie2': aligners.bowtie2_index_from_prefix('')[0],
        'hisat2': aligners.hisat2_index_from_prefix('')[0],
        'star': '/Genome',
+        # Add BWA and samtools faidx indices
+        'bwa': aligners.bwa_index_from_prefix('')[0],
+        'faidx': '.fai',

        # Notes on salmon indexing:
        # - pre-1.0 versions had hash.bin
@@ -451,13 +454,40 @@
    type_extensions = {
        'genome': 'fasta',
        'annotation': 'gtf',
-        'transcriptome': 'fasta'
+        'transcriptome': 'fasta',
+        'known': 'vcf.gz'
    }

    for organism in merged_references.keys():
        d[organism] = {}
        for tag in merged_references[organism].keys():
            e = {}
+            # Add support for variation databases. They should be the keys of
+            # a dictionary containing a URL and postprocess block.
+            if tag == 'variation':
+                for type_ in merged_references[organism][tag].keys():
+                    ext = '.vcf.gz'
+                    if type_ == 'dbnsfp':
+                        # dbNSFP entries are keyed by version and build, e.g. 'dbNSFPv4.4_GRCh38'
+                        type_ = merged_references[organism][tag][type_]['version'] + '_' + merged_references[organism][tag][type_]['build']
+                    e[type_] = (
+                        '{references_dir}/'
+                        '{organism}/'
+                        '{tag}/'
+                        '{type_}/'
+                        '{organism}_{tag}{ext}'.format(**locals())
+                    )
+                d[organism][tag] = e
+                # Variation entries are fully handled here; skip the generic handling below
+                continue
            for type_, block in merged_references[organism][tag].items():
                if type_ == 'metadata':
                    continue
@@ -546,6 +576,7 @@ def references_dict(config):
                    '{type_}/'
                    '{organism}_{tag}.chromsizes'.format(**locals())
                )
+
            d[organism][tag] = e
    return d, conversion_kwargs
@@ -912,3 +943,4 @@ def gff2gtf(gff, gtf):
        shell('gzip -d -S .gz.0.tmp {gff} -c | gffread - -T -o- | gzip -c > {gtf}')
    else:
        shell('gffread {gff} -T -o- | gzip -c > {gtf}')
+
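
For orientation, with the variant-calling reference config above, the
dictionary returned by references_dict() is shaped roughly like this (paths
abbreviated; the exact locations follow the reference rule outputs introduced
below):

    refdict = {
        "human": {
            "ensembl-104": {
                "genome": ".../human/ensembl-104/genome/human_ensembl-104.fasta",
                "bwa":    ".../human/ensembl-104/genome/bwa/human_ensembl-104.amb",
                "faidx":  ".../human/ensembl-104/genome/faidx/human_ensembl-104.fai",
                "known":  ".../human/ensembl-104/known/human_ensembl-104.vcf.gz",
            },
            # only present when a 'variation' block is configured
            "variation": {
                "dbNSFPv4.4_GRCh38": ".../human/variation/dbNSFPv4.4_GRCh38/human_variation.vcf.gz",
            },
        },
    }
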
diff --git a/lib/helpers.smk b/lib/helpers.smk
new file mode 100644
index 000000000..63e717932
--- /dev/null
+++ b/lib/helpers.smk
@@ -0,0 +1,335 @@
+import pandas as pd
+import yaml
+import os
+
+# Read sample and unit tables
+samples = pd.read_table(config["samples"], dtype=str).set_index("sample", drop=False)
+units = pd.read_table(config["units"], dtype=str).set_index(["sample", "unit"], drop=False)
+units.index = units.index.set_levels([i for i in units.index.levels])
+
+def preflight():
+    """
+    This helper function gets called at the top of the main Snakefile. It
+    handles reading the config to see if references are provided externally or
+    if we are relying on lcdb-wf references. Returns variables containing
+    filepaths of references to be used in rules. It also performs some checks
+    to make sure the config does not contradict itself under certain
+    configurations.
+    """
+    aln_index = []
+    dbnsfp = []
+    dictionary = []
+    indexed = []
+    known_sites = []
+    reference = []
+
+    # Handle reference names if the lcdb-wf references workflow is run
+    if config['ref']['use_references_workflow']:
+        include: '../references/Snakefile'
+        refdict = common.references_dict(config)
+        reference = refdict[config['ref']['organism']][config['ref']['genome']['tag']]['genome']
+        aln = refdict[config['ref']['organism']][config['ref']['aligner']['tag']]['bwa']
+        aln_index = multiext(os.path.splitext(aln)[0], ".amb", ".ann", ".bwt", ".pac", ".sa")
+        indexed = refdict[config['ref']['organism']][config['ref']['faidx']['tag']]['faidx']
+        if config['ref']['variation']['dbnsfp']:
+            dbnsfp = refdict[config['ref']['organism']]['variation'][config['ref']['variation']['dbnsfp'] + '_' + config['ref']['genome']['build']]
+        else:
+            dbnsfp = []
+        if config['ref']['variation']['known']:
+            known_sites = refdict[config['ref']['organism']][config['ref']['genome']['tag']][config['ref']['variation']['known']]
+        else:
+            known_sites = []
+    else:
+        known_sites = (
+            config['ref']['paths']['known']
+            if config['ref']['paths']['known']
+            else []
+        )
+        reference = config['ref']['paths']['ref']
+        indexed = (
+            config['ref']['paths']['index']
+            if config['ref']['paths']['index']
+            else reference + '.fai'
+        )
+        dbnsfp = (
+            config['ref']['paths']['dbnsfp']
+            if config['ref']['paths']['dbnsfp']
+            else []
+        )
+        aln_index = []
+
+    # Handle the dictionary name. Stop the workflow if the fasta file is not
+    # named properly, and stop the workflow if there is no reference.
+    if reference == []:
+        raise ValueError("You must supply a reference file to the workflow.")
+    if reference.endswith('.gz'):
+        dictionary = '.'.join(reference.split('.')[:-2]) + '.dict'
+    else:
+        try:
+            dictionary = '.'.join(reference.split('.')[:-1]) + '.dict'
+        except TypeError:
+            # A reference without a usable extension ends up trying to concatenate an empty list with a str
+            raise ValueError("There is something wrong with your reference extension. "
+                             "Please make sure your reference has an extension")
+    # Stop the workflow early if there is no known variation but bqsr is set in the config
+    if config['filtering']['bqsr'] == True:
+        assert known_sites != [], 'Check your config.yaml. You are requiring that bqsr be run, but there is no known sites vcf'
+
+    return aln_index, dbnsfp, dictionary, indexed, known_sites, reference
+
+
+def get_contigs():
+    """
+    Helper function to read the contigs from the fasta index checkpoint rule.
+    These contigs define the regions that variant calling is split over for joint-calling.
+    """
+    with checkpoints.genome_index.get().output[0].open() as fai:
+        ser = pd.read_table(fai, header=None, usecols=[0], dtype=str)
+        ser = ser.squeeze()
+        # TODO: make this less brittle, and better support non-Ensembl organisms
+        # Remove all contigs that don't correspond to a chromosome
+        ser = ser[ser.apply(lambda x: len(x)) <= 2]
+        # Remove mitochondrial if specified in the config
+        if config["processing"]["remove-mitochondrial"]:
+            return ser[ser != "MT"]
+        else:
+            return ser
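
The length-based filter in get_contigs() keeps Ensembl-style chromosome names
and drops scaffolds and patches, for example:

    import pandas as pd

    ser = pd.Series(["1", "2", "X", "MT", "KI270728.1", "GL000009.2"])
    print(ser[ser.apply(len) <= 2].tolist())
    # ['1', '2', 'X', 'MT']
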
+
+
+def get_fastq(wildcards):
+    """
+    Get fastq files of a given sample-unit. The sample-unit structure is how
+    technical replicates are handled; this is defined in the sample table.
+    """
+    fastqs = units.loc[(wildcards.sample, wildcards.unit), ["fq1", "fq2"]].dropna()
+    if len(fastqs) == 2:
+        return {"r1": fastqs.fq1, "r2": fastqs.fq2}
+    return {"r1": fastqs.fq1}
+
+
+def get_read_group(wildcards):
+    """Denote sample name and platform in read group."""
+    return "-R '@RG\\tID:{sample}\\tSM:{sample}\\tPL:{platform}'".format(
+        sample=wildcards.sample,
+        platform=units.loc[(wildcards.sample, wildcards.unit), "platform"],
+    )
+
+
+def get_recal_input(bai=False):
+    """
+    Handle providing bams as the input of the bqsr rules.
+    Read config options to determine the appropriate bam and bam index files.
+    If we don't remove duplicates, return the sorted bams from the map reads rule.
+    If duplicates are removed, return the deduplicated bams from the mark duplicates rule.
+    If a bed file is used in variant calling, an index is also needed because
+    random access is required.
+    """
+    # Case 1: no duplicate removal
+    f = "results/mapped/{sample}-{unit}.sorted.bam"
+    if config["processing"]["remove-duplicates"]:
+        # Case 2: remove duplicates
+        f = "results/dedup/{sample}-{unit}.bam"
+    if bai:
+        if config["processing"]["restrict-regions"]:
+            # Case 3: need an index because random access is required
+            f += ".bai"
+            return f
+        else:
+            # Case 4: no index needed
+            return []
+    else:
+        return f
+
+
+def get_sample_bams(wildcards):
+    """
+    Get all aligned reads of a given sample. Return the recal bams if bqsr is
+    run, otherwise return the dedup bams. We return all units for a given
+    sample because we want to provide technical replicates to the variant
+    calling rule where this is called.
+    """
+    unitlist = units.loc[wildcards.sample].unit
+    reslist = []
+    if config['filtering']['bqsr']:
+        reslist.extend(
+            [
+                "results/recal/{}-{}.bam".format(wildcards.sample, unit) for unit in unitlist
+            ]
+        )
+    else:
+        reslist.extend(
+            [
+                "results/dedup/{}-{}.bam".format(wildcards.sample, unit) for unit in unitlist
+            ]
+        )
+
+    return reslist
+
+
+def get_sample_unit_bams(wildcards):
+    """
+    Get aligned reads for a given sample. Unlike the function above, we return a
+    single sample-unit combination per call. This function is used in QC rules
+    like samtools-stats, where we do not want to combine technical replicates.
+    Return the recal bam if bqsr is run, otherwise return the dedup bam.
+    """
+    reslist = ''
+    if config['filtering']['bqsr']:
+        reslist = "results/recal/{sample}-{unit}.bam".format(sample=wildcards.sample, unit=wildcards.unit)
+    else:
+        reslist = "results/dedup/{sample}-{unit}.bam".format(sample=wildcards.sample, unit=wildcards.unit)
+    return reslist
+
+
+def get_regions_param(regions=config["processing"]["restrict-regions"], default=""):
+    """
+    If a captured-regions bedfile is present, split the variant calling up into
+    regions following GATK best practices.
+    """
+    if regions:
+        params = "--intervals '{}' ".format(regions)
+        padding = config["processing"].get("region-padding")
+        if padding:
+            params += "--interval-padding {}".format(padding)
+        return params
+    return default
+
+
+def get_call_variants_params(wildcards, input):
+    """
+    Calls the previous function to assemble the regions into interval lists,
+    along with any specified parameters for variant calling in the config.
+    """
+    return (
+        get_regions_param(
+            regions=input.regions, default="--intervals {}".format(wildcards.contig)
+        )
+    )
+
+
+def set_java_opts(resources):
+    """
+    Using the resources directive from the snakemake rule,
+    set the heap size. Request 75 percent of the requested
+    mem_mb. The remaining 25 percent should be enough for the
+    OS and other system processes that occur outside the shell command.
+    """
+    heap = int(resources.mem_mb * 0.75)
+    heap = int(heap / 1024)
+    java_temp = '''"-Xmx{}g -Djava.io.tmpdir=$TMPDIR"'''
+    java_opts = java_temp.format(heap)
+    return java_opts
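
As a quick check of the heap arithmetic (assuming helpers.smk is loaded;
SimpleNamespace stands in for the rule's resources object):

    from types import SimpleNamespace

    # 32768 MB requested -> 75% is 24576 MB -> 24 GB heap
    print(set_java_opts(SimpleNamespace(mem_mb=32768)))
    # "-Xmx24g -Djava.io.tmpdir=$TMPDIR"
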
+
+def all_input_mutect2():
+    """
+    Format the input for the all rule for mutect2.
+    """
+    comparisons = config['mutect2'].keys()
+    return expand("results/mutect2_annotated_normed/{comp}.vcf.gz", comp=comparisons)
+
+
+def names_for_somatic(wildcards):
+    """
+    Format the sample names into arguments to pass to mutect2.
+    Mutect2 requires you to specify the names of the "normal" samples.
+    There can be multiple normal samples in a single mutect2 call.
+    Tumor samples do not need to be named. This is done by reading
+    from the config.
+    """
+    comp = wildcards.comp
+    normals = config['mutect2'][comp]['normal']
+    if not isinstance(normals, list):
+        normals = [normals]
+    return normals
+
+
+def input_for_somatic(wildcards):
+    """
+    Format the bam input for mutect2 by reading from the config.
+    Technical replicates are separated and grouped. Returns a dictionary
+    containing the reference genome, sequence dictionary, and input bams.
+    """
+    comp = wildcards.comp
+    normals = config['mutect2'][comp]['normal']
+    if not isinstance(normals, list):
+        normals = [normals]
+    tumors = config['mutect2'][comp]['tumor']
+    if not isinstance(tumors, list):
+        tumors = [tumors]
+    # Fill these lists with paths to tumor and normal files
+    t_files = []
+    n_files = []
+    for tumor in tumors:
+        # Get the units for each tumor sample
+        unitlist = units.loc[tumor].unit
+        if config['filtering']['bqsr']:
+            t_files.extend(
+                ["results/recal/{}-{}.bam".format(tumor, unit) for unit in unitlist]
+            )
+        else:
+            t_files.extend(
+                ["results/dedup/{}-{}.bam".format(tumor, unit) for unit in unitlist]
+            )
+    # Do the same for normals
+    for normal in normals:
+        unitlist = units.loc[normal].unit
+        if config['filtering']['bqsr']:
+            n_files.extend(
+                ["results/recal/{}-{}.bam".format(normal, unit) for unit in unitlist]
+            )
+        else:
+            n_files.extend(
+                ["results/dedup/{}-{}.bam".format(normal, unit) for unit in unitlist]
+            )
+
+    # Put all the input files needed into a dictionary to pass to the rule
+    d = dict(
+        ref=reference,
+        normals=n_files,
+        tumors=t_files,
+        dict=dictionary,
+        regions=(
+            "results/called/{contig}.regions.bed".format(contig=wildcards.contig)
+            if config["processing"]["restrict-regions"]
+            else []
+        ),
+    )
+    return d
+
+
+def get_fai_nomenclature():
+    """
+    Helper function to get the nomenclature of the fasta index.
+    Returns True if the chr prefix is present, and False if it is absent.
+    """
+    nom = False
+    with checkpoints.genome_index.get().output[0].open() as fai:
+        for line in fai:
+            if line.startswith('chr'):
+                nom = True
+                break
+    return nom
+
+
+def get_bed_nomenclature(input):
+    """
+    Helper function to get the nomenclature of the bedfile.
+    Returns True if the chr prefix is present, and False if it is absent.
+    """
+    nom = False
+    with open(input.bed, 'r') as f:
+        for line in f:
+            if line.startswith('browser') or line.startswith('track'):
+                continue
+            if line.startswith('chr'):
+                nom = True
+                break
+    return nom
+
+
+# vim: ft=python
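
A minimal sketch of how the bed nomenclature helper behaves, using a
hypothetical UCSC-style file; the helper expects an object with a .bed
attribute, which rule inputs provide:

    from types import SimpleNamespace

    with open("capture.bed", "w") as fh:
        fh.write("track name=capture\nchr1\t100\t200\n")
    print(get_bed_nomenclature(SimpleNamespace(bed="capture.bed")))
    # True: the 'track' header is skipped and the first data line starts with 'chr'
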
diff --git a/workflows/references/Snakefile b/workflows/references/Snakefile
index d6bc9d0f6..c1438675d 100644
--- a/workflows/references/Snakefile
+++ b/workflows/references/Snakefile
@@ -5,7 +5,8 @@ import gzip
import yaml
import importlib
import tempfile
-import pandas
+from tempfile import TemporaryDirectory
+import pandas as pd
from snakemake.utils import makedirs
from lib.imports import resolve_name
from lib import utils
@@ -61,6 +62,29 @@ rule unzip:
        '{references_dir}/logs/{organism}/{tag}/{_type}/{organism}_{tag}.{_ext}.log'
    shell:
        'gunzip -c {input} > {output}'

+rule bwa_index:
+    """
+    Build bwa index
+    """
+    input:
+        '{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fasta'
+    output:
+        protected(aligners.bwa_index_from_prefix('{references_dir}/{organism}/{tag}/genome/bwa/{organism}_{tag}'))
+    log:
+        '{references_dir}/logs/{organism}/{tag}/genome/bwa/{organism}_{tag}.log'
+    resources:
+        runtime=autobump(hours=3),
+        mem_mb=gb(24),
+        disk_mb=gb(24)
+    run:
+        prefix = aligners.bwa_prefix_from_index(output)
+        shell(
+            'bwa index '
+            '-p {prefix} '
+            '-a bwtsw '
+            '{input} '
+            '&> {log}')

rule bowtie2_index:
    """
@@ -353,7 +377,7 @@ rule mappings:
                    d['__featuretype__'] = ft
                    res.append(d)

-    df = pandas.DataFrame(res)
+    df = pd.DataFrame(res)

    # Depending on how many attributes there were and the
    # include_featuretypes settings, this may take a while.
@@ -364,4 +388,161 @@

    # Restore original setting
    gffutils.constants.always_return_list = orig_setting

+
+checkpoint genome_index:
+    """
+    Build the fasta index. GATK uses this file for rapid fasta access, and the
+    index is also used to identify the chromosomes and contigs in the genome so
+    that the appropriate known variation files can be downloaded.
+    """
+    input:
+        '{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fasta'
+    output:
+        protected('{references_dir}/{organism}/{tag}/genome/faidx/{organism}_{tag}.fai')
+    log:
+        '{references_dir}/logs/{organism}/{tag}/genome/faidx/{organism}_{tag}.fai.log'
+    resources:
+        runtime=hours(1),
+        mem_mb=gb(4)
+    run:
+        shell(
+            'samtools '
+            'faidx '
+            '-o {output} {input} '
+            '&> {log}'
+        )
+
+
+rule known_variation:
+    """
+    Download all the chromosomes from the Ensembl FTP site and combine them to
+    create a known variation vcf.
+    """
+    input:
+        #fai='{references_dir}/{organism}/{tag}/genome/faidx/{organism}_{tag}.fai'
+        # can't do it this way since the {tag} wildcard is not congruous between input and output
+        fai = lambda w: checkpoints.genome_index.get(**w).output[0]
+
+    output:
+        protected('{references_dir}/{organism}/{tag}/known/{organism}_{tag}.vcf.gz')
+    log:
+        '{references_dir}/{organism}/{tag}/known/{organism}_{tag}.known.log'
+    resources:
+        runtime=hours(4),
+        mem_mb=gb(16),
+        disk_mb=gb(32)
+    run:
+        # Get the configuration options in the metadata chunk using wildcards
+        release = int(config['references'][wildcards.organism][wildcards.tag]['metadata']['release'])
+        species = config['references'][wildcards.organism][wildcards.tag]['metadata']['species'].lower()
+        build = config['references'][wildcards.organism][wildcards.tag]['metadata']['build']
+        typ = config['references'][wildcards.organism][wildcards.tag]['known']['type']
+        def get_contigs():
+            with open(input[0], 'r') as fai:
+                ser = pd.read_table(fai, header=None, usecols=[0], dtype=str)
+                ser = ser.squeeze()
+                ser = ser[ser.apply(lambda x: len(x)) <= 2]
+                return ser
+        contigs = get_contigs()
+        branch = ""
+        if release >= 81 and build == "GRCh37":
+            branch = "grch37/"
+        if typ == "all":
+            if species == "homo_sapiens" and release >= 93:
+                suffixes = [
+                    "-chr{}".format(chrom) for chrom in contigs
+                ]
+            else:
+                suffixes = [""]
+        elif typ == "somatic":
+            suffixes = ["_somatic"]
+        elif typ == "structural_variations":
+            suffixes = ["_structural_variations"]
+        species_filename = species if release >= 91 else species.capitalize()
+        urls = [
+            "ftp://ftp.ensembl.org/pub/{branch}release-{release}/variation/vcf/{species}/{species_filename}{suffix}.{ext}".format(
+                release=release,
+                species=species,
+                suffix=suffix,
+                species_filename=species_filename,
+                branch=branch,
+                ext=ext,
+            )
+            for suffix in suffixes
+            for ext in ["vcf.gz", "vcf.gz.csi"]
+        ]
+        names = [os.path.basename(url) for url in urls if url.endswith(".gz")]
+        gather = "curl {urls}".format(urls=" ".join(map("-O {}".format, urls)))
+        with tempfile.TemporaryDirectory() as tmpdir:
+            if input.get("fai"):
+                shell(
+                    "(cd {tmpdir}; {gather} && "
+                    "bcftools concat -Oz --naive-force {names} > concat.vcf.gz && "
+                    "bcftools reheader --fai {input.fai} concat.vcf.gz "
+                    "> {output}) && "
+                    "tabix -p vcf {output} 2> {log} "
+                )
+
+if config['references']['human'].get('variation'):
+    rule dbnsfp:
+        """
+        Download and process the dbNSFP database. This involves downloading and
+        extracting the zip file, then combining the chromosomes to create
+        a single file. 
For genome builds like hg19 and GRCh37, some processing + needs to be done to make them compatible with dbNSFP version > 3.X + dbNSFP is only for human genomes. + """ + output: + protected( + '{{references_dir}}/{{organism}}/{{tag}}/{dbnsfp_version}_{build}/{{organism}}_{{tag}}.vcf.gz'.format( + dbnsfp_version=config['references']['human']['variation']['dbnsfp']['version'], + build=config['references']['human']['variation']['dbnsfp']['build'] + ) + ) + log: + '{{references_dir}}/{{organism}}/{{tag}}/{dbnsfp_version}_{build}/{{organism}}_{{tag}}.log'.format( + dbnsfp_version=config['references']['human']['variation']['dbnsfp']['version'], + build=config['references']['human']['variation']['dbnsfp']['build'] + ) + resources: + disk_mb=gb(500), + mem_mb=gb(500), + runtime=hours(8) + threads: 16 + run: + version = config['references']['human'][wildcards.tag]['dbnsfp']['version'] + URL = config['references']['human'][wildcards.tag]['dbnsfp']['url'] + build = config['references']['human'][wildcards.tag]['dbnsfp']['build'] + workdir = wildcards.references_dir + if build == 'GRCh37': + # We need to process the dbNSFP file to make it compatible with older genomes + with tempfile.TemporaryDirectory() as tmpdir: + shell( + '''(cd {tmpdir}; wget -O- {URL} > dbnsfp.zip && ''' + '''unzip dbnsfp.zip && zcat dbNSFP*_variant.chr1* | awk "NR<=1" > h && ''' + '''zgrep -v "^#" dbNSFP*_variant.chr* > all_chrs && ''' + '''awk '$8 != "." ' all_chrs > all_chrs_filtered && ''' + '''sort -S 50% --parallel=12 all_chrs_filtered -k8,8 -k9,9n > all_chrs_filtered_sorted && ''' + '''cat h all_chrs_filtered_sorted > all_chrs_filtered_sorted_header && ''' + '''bgzip -c all_chrs_filtered_sorted_header > {output}) && ''' + '''tabix -s 1 -b 2 -e 2 {output} ''' + ) + if build == 'GRCh38': + with tempfile.TemporaryDirectory() as tmpdir: + # No need for processing and we can use the first 2 columns for coordinates + shell( + '''(cd {tmpdir}; wget -O- {URL} > dbnsfp.zip && ''' + '''unzip dbnsfp.zip && zcat dbNSFP*_variant.chr1* | awk "NR<=1" > h && ''' + '''zgrep -v "^#" dbNSFP*_variant.chr* > all_chrs && ''' + '''sort -S 50% --parallel=24 all_chrs -k1,1 -k2,2n > all_chrs_sorted && ''' + '''cat h all_chrs_sorted > all_chrs_sorted_header && ''' + '''bgzip -c all_chrs_sorted_header > {output}) && ''' + '''tabix -s 1 -b 2 -e 2 {output} ''' + ) + + # vim: ft=python diff --git a/workflows/references/config/config.yaml b/workflows/references/config/config.yaml index 49618dcd0..5f8f2c664 100644 --- a/workflows/references/config/config.yaml +++ b/workflows/references/config/config.yaml @@ -1,6 +1,8 @@ -references_dir: 'references_dir' +references_dir: '/data/NICHD-core0/references' + # See the reference config files in the top level of the repo, # include/reference_configs, for inspiration for more species. 
include_references: - - '../../include/reference_configs/test.yaml' + # - '../../include/reference_configs/test.yaml' + - '../../include/reference_configs/variant-calling.yaml' diff --git a/workflows/variant-calling/Snakefile b/workflows/variant-calling/Snakefile new file mode 100644 index 000000000..2496fcfe7 --- /dev/null +++ b/workflows/variant-calling/Snakefile @@ -0,0 +1,911 @@ +import sys +sys.path.insert(0, srcdir('.')) +import pandas as pd +import tempfile +import os +from os import path +import re +from tempfile import TemporaryDirectory,NamedTemporaryFile +from snakemake.shell import shell +import yaml +from textwrap import dedent +from pathlib import Path +from urllib.request import urlretrieve +from zipfile import ZipFile +sys.path.append('../..') +from lib import common, utils, helpers, aligners +from lib.utils import autobump, gb, hours + +configfile: "config/config.yaml" + +include: '../../lib/helpers.smk' + +aln_index, dbnsfp, dictionary, indexed, known_sites, reference = preflight() + +wildcard_constraints: + vartype="snvs|indels", + sample="|".join(samples.index), + unit="|".join(units["unit"]), + comp="|".join(config['mutect2'].keys()) + + +rule all: + input: + "results/annotated/ann.vcf.gz", + "results/qc/multiqc.html", + "results/filtered/all.normed.vcf.gz", + expand("results/somatic_filtered/normed.{comp}.vcf.gz", comp = config['mutect2'].keys()), + expand("results/mutect2_annotated/snpeff.{comp}.vcf.gz", comp = config['mutect2'].keys()), + + + +checkpoint genome_index: + threads: 2 + resources: + mem_mb=gb(4), + runtime=autobump(60) + input: + reference + output: + indexed + log: + 'logs/fasta_index.log' + shell: + "samtools " + "faidx " + "{input} > {output} 2> {log} " + + +rule fasta_dict: + threads: 2 + resources: + mem_mb=gb(4), + # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + disk_mb=gb(4), + runtime=autobump(60) + input: + ref=reference + output: dictionary + log: "logs/sequence_dictionary.log" + run: + java_opts = set_java_opts(resources) + shell( + 'picard CreateSequenceDictionary {java_opts} \ + -R {input.ref} \ + -O {output} &> {log} ' + ) + + +if not aln_index: + rule bwa_index: + """ + Generate BWA index for the reference genome if we are not using lcdb-wf references workflow + """ + threads: 8 + resources: + disk_mb=gb(24), + mem_mb=gb(24), + runtime=autobump(180) + input: + reference + output: + multiext(reference, ".amb", ".ann", ".bwt", ".pac", ".sa") + log: + "logs/bwa_index.log" + params: + "bwtsw " + shell: + "bwa index -a {params}" + "{input} " + " &> {log}" + + +rule trim_adapters: + threads: 8 + resources: + mem_mb=gb(32), + runtime=autobump(360) + input: unpack(get_fastq), + output: + r1="results/trimmed/{sample}-{unit}.1.fastq.gz", + r2="results/trimmed/{sample}-{unit}.2.fastq.gz", + log: + "logs/{sample}-{unit}_trimming.log" + shell: + 'cutadapt ' + '-o {output.r1} ' + '-p {output.r2} ' + '-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA -A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT ' + '--nextseq-trim 20 ' + '--overlap 6 ' + '-j {threads} ' + '--minimum-length 25 ' + '{input.r1} ' + '{input.r2} ' + ' &> {log} ' + + +rule map_reads: + threads: 32 + resources: + disk_mb=gb(40), + mem_mb=gb(48), + runtime=autobump(1920) + input: + reads=["results/trimmed/{sample}-{unit}.1.fastq.gz","results/trimmed/{sample}-{unit}.2.fastq.gz"], + idx=multiext(reference, ".amb", ".ann", ".bwt", ".pac", ".sa"), + output: + bam=temp("results/mapped/{sample}-{unit}.sorted.bam"), + params: + extra=get_read_group, + index=lambda w, input: os.path.splitext(input.idx[0])[0], + log: + 
"logs/{sample}-{unit}_bwamem.log" + shell: + "bwa mem " + "-t {threads} " + "{params.extra} " + "{params.index} " + "{input.reads} | " + "samtools view -bh | samtools sort -o {output} -O BAM " + "2> {log}" + + +rule mark_duplicates: + """ + If we run bqsr, then we do not need to save the output of mark duplicates, since those bams will + be recalibrated. However, if we don't recalibrate, then we need to save the bams from mark duplicates + and we don't want to mark them as temporary + """ + threads: 4 + resources: + disk_mb=gb(40), + mem_mb=gb(32), + # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + runtime=autobump(720) + input: + bam = "results/mapped/{sample}-{unit}.sorted.bam", + output: + metrics="results/qc/picard/markdups/{sample}-{unit}_marked_dup_metrics.txt", + bam=( + temp("results/dedup/{sample}-{unit}.bam") if config['filtering']['bqsr'] + else "results/dedup/{sample}-{unit}.bam" + ) + + log: + "logs/{sample}-{unit}_mark_dup.log" + params: + rm = ('-REMOVE_DUPLICATES true ' + if config['processing']['remove-duplicates'] + else '') + run: + java_opts = set_java_opts(resources) + shell( + 'picard MarkDuplicates ' + '{java_opts} ' + '-INPUT {input.bam} ' + '-OUTPUT {output.bam} ' + '-ASSUME_SORT_ORDER coordinate ' + '{params.rm} ' + '-METRICS_FILE {output.metrics} ' + ' 2> {log} ' + ) + + +if config["filtering"]['bqsr']: + rule base_recalibrator: + threads: 4 + resources: + mem_mb=gb(8), + # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + disk_mb=gb(40), + runtime=autobump(960) + input: + bam=get_recal_input(bai=False), + bai=get_recal_input(bai=True), + ref=reference, + dict=dictionary, + known=known_sites, + known_idx=known_sites + '.tbi' + output: + recal_table="results/recal/{sample}-{unit}.grp" + log: + "logs/{sample}-{unit}_base_recalibrator.log" + run: + java_opts = set_java_opts(resources) + shell( + 'gatk --java-options {java_opts} BaseRecalibrator \ + -R {input.ref} \ + -I {input.bam} \ + -O {output.recal_table} \ + --known-sites {input.known} 2> {log}' + ) + + + rule apply_bqsr: + threads: 8 + resources: + mem_mb=gb(32), + # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + disk_mb=gb(40), + runtime=autobump(960) + input: + bam=get_recal_input(bai=False), + bai=get_recal_input(bai=True), + ref=reference, + dict=dictionary, + recal_table="results/recal/{sample}-{unit}.grp", + output: + bam=protected("results/recal/{sample}-{unit}.bam") + log: + "logs/{sample}-{unit}_apply_bsqr.log" + run: + java_opts = set_java_opts(resources) + shell( + 'gatk --java-options {java_opts} ApplyBQSR \ + -R {input.ref} \ + -I {input.bam} \ + --bqsr-recal-file {input.recal_table} \ + -O {output.bam} 2> {log}' + ) + + +rule build_bam_index: + resources: + mem_mb=gb(2), + disk_mb=gb(2), + runtime=autobump(30) + input: + bam ="{prefix}.bam" + output: + "{prefix}.bam.bai" + run: + basename = os.path.basename(input.bam) + log = 'logs/' + os.path.splitext(basename)[0] + '_buildbamindex.log' + shell("samtools index {input.bam} > {output} 2> {log}") + + +if config["processing"]["restrict-regions"]: + rule compose_regions: + """ + This command will ONLY work if the chromosome nomeclature matches the format in the reference genome + That is, chromosomes DO NOT have the 'chr' prefix. + + .bed files are formatted like so: + + Some bed files have header lines that can start with the word 'browser' or 'track' per UCSC + + To check this, we will read the lines of the .bed file and compare them to what is in the fasta index. 
+
+
+rule call_variants:
+    """
+    Call variants per sample and contig with HaplotypeCaller, emitting gVCFs.
+    """
+    input:
+        bam=get_sample_bams,
+        ref=reference,
+        dict=dictionary,
+        known=known_sites,
+        tbi=(
+            known_sites + '.tbi' if known_sites else []
+        ),
+        regions=(
+            "results/called/{contig}.regions.bed"
+            if config["processing"]["restrict-regions"]
+            else []
+        ),
+    output:
+        gvcf=protected("results/called/{sample}.{contig}.g.vcf.gz"),
+    log: "logs/{sample}_{contig}_call_variants.log"
+    resources:
+        disk_mb=gb(16),
+        mem_mb=gb(40),
+        # mem_mb = gb(4), # [ TEST SETTINGS -1 ]
+        runtime=autobump(hours=8)
+    threads: 8
+    params:
+        extra=get_call_variants_params,
+        pcr='--pcr-indel-model ' + config['processing']['pcr']
+    run:
+        java_opts = set_java_opts(resources)
+        known = input.known
+        if known:
+            known = "--dbsnp " + str(known)
+        regions = params.extra
+        bams = input.bam
+        if isinstance(bams, str):
+            bams = [bams]
+        bams = list(map("-I {}".format, bams))
+        shell(
+            'gatk --java-options {java_opts} HaplotypeCaller {regions} \
+            -R {input.ref} \
+            {bams} \
+            -ERC GVCF \
+            {params.pcr} \
+            -O {output.gvcf} {known} 2> {log}'
+        )
+
+
+rule combine_calls:
+    """
+    Combine the per-sample gVCFs for a contig into a single multi-sample gVCF.
+    """
+    threads: 4
+    resources:
+        disk_mb=gb(10),
+        mem_mb=gb(4),
+        # mem_mb = gb(4), # [ TEST SETTINGS -1 ]
+        runtime=autobump(720)
+    input:
+        ref=reference,
+        gvcfs=expand("results/called/{sample}.{{contig}}.g.vcf.gz", sample=samples.index),
+    output:
+        gvcf="results/called/all.{contig}.g.vcf.gz",
+    log: "logs/{contig}_combine_calls.log"
+    run:
+        java_opts = set_java_opts(resources)
+        gvcfs = list(map("-V {}".format, input.gvcfs))
+        shell(
+            'gatk --java-options {java_opts} CombineGVCFs \
+            {gvcfs} \
+            -R {input.ref} \
+            -O {output.gvcf} 2> {log} '
+        )
+
+
+rule genotype_variants:
+    """
+    Joint-genotype the combined gVCF for a contig.
+    """
+    threads: 4
+    resources:
+        disk_mb=gb(10),
+        mem_mb=gb(8),
+        # mem_mb = gb(4), # [ TEST SETTINGS -1 ]
+        runtime=autobump(480)
+    input:
+        ref=reference,
+        idx="results/called/all.{contig}.g.vcf.gz.tbi",
+        gvcf="results/called/all.{contig}.g.vcf.gz",
+    output:
+        vcf=temp("results/genotyped/all.{contig}.vcf.gz"),
+    log:
+        "logs/genotypegvcfs.{contig}.log",
+    run:
+        java_opts = set_java_opts(resources)
+        shell(
+            'gatk --java-options {java_opts} GenotypeGVCFs '
+            '-V {input.gvcf} '
+            '-R {input.ref} '
+            '-O {output.vcf} 2> {log}'
+        )
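
The joint-calling here is scattered per contig: every sample's gVCF for a
contig is combined and genotyped, and only then are contigs merged. With the
example sample table, combine_calls for one contig receives:

    from snakemake.io import expand

    gvcfs = expand("results/called/{sample}.{{contig}}.g.vcf.gz", sample=["tumor", "normal"])
    print(gvcfs)
    # ['results/called/tumor.{contig}.g.vcf.gz', 'results/called/normal.{contig}.g.vcf.gz']
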
+
+
+rule merge_variants:
+    threads: 4
+    resources:
+        disk_mb=gb(10),
+        mem_mb=gb(8),
+        # mem_mb = gb(4), # [ TEST SETTINGS -1 ]
+        runtime=autobump(480)
+    input:
+        vcfs=lambda w: expand(
+            "results/genotyped/all.{contig}.vcf.gz", contig=get_contigs()
+        ),
+    output:
+        vcf="results/genotyped/all.vcf.gz",
+    log:
+        "logs/merge-genotyped.log",
+    run:
+        inputs = " ".join("-INPUT {}".format(f) for f in input.vcfs)
+        java_opts = set_java_opts(resources)
+        shell(
+            'picard'
+            ' MergeVcfs'
+            ' {java_opts}'
+            ' {inputs}'
+            ' -OUTPUT {output}'
+            ' 2> {log}'
+        )
+
+
+rule tabix_variants:
+    threads: 2
+    resources:
+        disk_mb = gb(2),
+        mem_mb = gb(2),
+        runtime=autobump(30)
+    input:
+        vcf="{prefix}.vcf.gz",
+    output:
+        "{prefix}.vcf.gz.tbi",
+    run:
+        basename = os.path.basename(input.vcf)
+        log = 'logs/' + os.path.splitext(basename)[0] + '_tabix.log'
+        shell("tabix -p vcf {input.vcf} 2> {log} ")
+
+
+rule select_calls:
+    threads: 4
+    resources:
+        disk_mb=gb(10),
+        mem_mb=gb(4),
+        # mem_mb = gb(4), # [ TEST SETTINGS -1 ]
+        runtime=autobump(480)
+    input:
+        ref=reference,
+        vcf="results/genotyped/all.vcf.gz",
+    output:
+        vcf=temp("results/filtered/all.{vartype}.vcf.gz"),
+    log:
+        "logs/selectvariants_{vartype}.log",
+    run:
+        java_opts = set_java_opts(resources)
+        # NB: no trailing comma here, or vartype_arg would become a tuple
+        vartype_arg = "--select-type-to-include {}".format(
+            "SNP" if wildcards.vartype == "snvs" else "INDEL"
+        )
+        shell(
+            'gatk --java-options {java_opts} SelectVariants '
+            '-R {input.ref} '
+            '-V {input.vcf} '
+            '{vartype_arg} '
+            '-O {output.vcf} 2> {log}'
+        )
+
+
+rule hard_filter_calls:
+    threads: 4
+    resources:
+        disk_mb=gb(10),
+        mem_mb=gb(4),
+        # mem_mb = gb(4), # [ TEST SETTINGS -1 ]
+        runtime=autobump(480)
+    input:
+        ref=reference,
+        vcf="results/filtered/all.{vartype}.vcf.gz",
+    output:
+        vcf=temp("results/filtered/all.{vartype}.hardfiltered.vcf.gz"),
+    log:
+        "logs/variantfiltration_{vartype}.log",
+    run:
+        java_opts = set_java_opts(resources)
+        filter_arg = {'{}_hard_filter'.format(wildcards.vartype): config['filtering']['hard'][wildcards.vartype]}
+        filters = [
+            "--filter-name {} --filter-expression '{}'".format(name, expr.replace("'", "\\'"))
+            for name, expr in filter_arg.items()
+        ]
+        shell(
+            'gatk --java-options {java_opts} VariantFiltration '
+            '-R {input.ref} '
+            '-V {input.vcf} '
+            '{filters} '
+            '-O {output.vcf} 2> {log}'
+        )
+
+
+rule merge_calls:
+    threads: 2
+    resources:
+        disk_mb=gb(10),
+        mem_mb=gb(8),
+        # mem_mb = gb(4), # [ TEST SETTINGS -1 ]
+        runtime=autobump(480)
+    input:
+        vcfs=expand(
+            "results/filtered/all.{vartype}.{filtertype}.vcf.gz",
+            vartype=["snvs", "indels"], filtertype='hardfiltered',
+        ),
+    output:
+        vcf=temp("results/filtered/all.final.vcf.gz"),
+    log:
+        "logs/merge-filtered.log",
+    run:
+        inputs = " ".join("-INPUT {}".format(f) for f in input.vcfs)
+        java_opts = set_java_opts(resources)
+        shell(
+            'picard '
+            ' MergeVcfs'
+            ' {java_opts}'
+            ' {inputs}'
+            ' -OUTPUT {output}'
+            ' 2> {log}'
+        )
+
+
+rule norm:
+    """
+    Split multiallelic variants into multiple biallelic ones. 
+    
+ """ + resources: + mem_mb = gb(16), + runtime = autobump(120) + input: + ref=reference, + vcf="results/filtered/all.final.vcf.gz" + output: + "results/filtered/all.normed.vcf.gz" + log: + "logs/norm-vcf.log" + shell: + "bcftools norm -f {input.ref} " + "-m- " + "{input.vcf} " + "--output-type z " + "--output {output} 2> {log}" + + +rule fastqc: + resources: + mem_mb=gb(12), + runtime=autobump(120), + threads: 8 + input: + unpack(get_fastq), + output: + html="results/qc/fastqc/data/{sample}-{unit}_fastqc.html", + zip="results/qc/fastqc/zip/{sample}-{unit}_fastqc.zip" + log: + "logs/{sample}-{unit}_fastqc.log" + run: + def base_file(file_path): + baseName = Path(path.basename(file_path)) + while baseName.suffix in {'.gz','.bz2','.txt','.fastq','.fq','.sam','.bam'}: + baseName = baseName.with_suffix('') + return str(baseName) + with TemporaryDirectory() as tempdir: + shell( + "fastqc " + "--threads {threads} " + "--noextract " + "--quiet " + "--outdir {tempdir:q} " + "{input:q} " + "&> {log} " + ) + output_base = base_file(input[0]) + html_path = path.join(tempdir, output_base + "_fastqc.html") + zip_path = path.join(tempdir, output_base + "_fastqc.zip") + if output.html != html_path: + shell("mv {html_path:q} {output.html:q}") + if output.zip != zip_path: + shell("mv {zip_path:q} {output.zip:q}") + + +rule samtools_stats: + """ + Run samtools stats + """ + resources: + mem_mb = gb(16), + runtime = autobump(120) + input: + get_sample_unit_bams + output: + "results/qc/samtools-stats/{sample}-{unit}.txt" + log: + "logs/samtools-stats_{sample}-{unit}.log" + shell: + "samtools stats {input} 1> {output} 2> {log} " + + +snpeff_input_for_multiqc = [] +if config['snpeff']['germline']: + snpeff_input_for_multiqc.append('results/qc/snpEff_summary.csv') +if config['snpeff']['somatic']: + soms = expand('results/qc/snpEff_{comp}_summary.csv', comp = config['mutect2'].keys()) + snpeff_input_for_multiqc.extend(soms) + + + +rule multiqc: + """ + Gather qc metrics and run MultiQC + Get the html output from somatic and germline VCF annotation if specified in the config. 
+ """ + resources: + mem_mb=gb(4), + runtime=autobump(60) + input: + fastqc=expand("results/qc/fastqc/zip/{u.sample}-{u.unit}_fastqc.zip", u=units.itertuples()), + markdup=expand("results/qc/picard/markdups/{u.sample}-{u.unit}_marked_dup_metrics.txt", u=units.itertuples()), + samstats=expand("results/qc/samtools-stats/{u.sample}-{u.unit}.txt", u=units.itertuples()), + snpeff=snpeff_input_for_multiqc + output: + "results/qc/multiqc.html", + params: + dirname="results/qc/", + name="multiqc.html", + log: + "logs/multiqc.log", + run: + input_dirs=params.dirname + shell( + "multiqc " + "--force " + "-o {params.dirname} " + "-n {params.name} " + "{input_dirs} " + " &> {log} " + ) + + +rule snpeff: + """ + Annotate variants with SnpEff + """ + resources: + disk_mb=gb(20), + mem_mb=gb(16), + # mem_mb = gb(5.5) # [ TEST SETTINGS -1 ] + runtime=autobump(120) + input: + vcf='results/filtered/all.normed.vcf.gz', + log: 'logs/snpeff.log' + output: + ann='results/annotated/ann.vcf.gz', + stats='results/qc/snpEff_summary.csv', + html='results/qc/snpEff_summary.html' + params: + annotations = config['snpeff']['annotations'], + gen = config['snpeff']['genome'] + run: + java_opts = '''"-Xmx{}g"'''.format(int(resources.mem_mb * 0.75 /1024)) + shell( + "snpEff {java_opts} " + "-o vcf " + "-csvStats {output.stats} " + "-stats {output.html} " + "{params.gen} {input.vcf} " + "| bcftools view -Oz > {output.ann} 2> {log} " + ) + dbnsfp_arg = [] + if dbnsfp: + dbnsfp_arg = "DbNsfp -db {}".format(dbnsfp) + if dbnsfp_arg: + sift_output = 'results/annotated/dbnsfp.ann.vcf.gz' + field_arg = ( + "-f '{}'".format(params.annotations) + if params.annotations + else '' + ) + + shell( + "SnpSift {java_opts} " + "{dbnsfp_arg} " + "{field_arg} {output.ann} " + "| bcftools view -Oz > {sift_output} 2>> {log} " + ) + + +rule mutect2: + """ + Use Mutect2 to call variants on individual samples, one per contig + """ + resources: + disk_mb = gb(40), + mem_mb = gb(32), + # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + runtime = autobump(720) + input: + unpack(input_for_somatic) + output: + vcf="results/mutect2_called/raw.{comp}.{contig}.vcf.gz", + stats='results/mutect2_called/raw.{comp}.{contig}.vcf.gz.stats', + orientation='results/lrom/{contig}_{comp}.tar.gz' + log: + "logs/{comp}_{contig}_mutect2_call_variants.log" + params: + pon = ( + '--panel-of-normals ' + config['mutect2']['PON'] + if config['PON'] + else [] + ), + extra=get_call_variants_params, + pcr='--pcr-indel-model ' + config['processing']['pcr'] + run: + java_opts = set_java_opts(resources) + normals = " ".join("-I {} ".format(n) for n in input.normals) + tumors = " ".join("-I {} ".format(t) for t in input.tumors) + names = names_for_somatic(wildcards) + formatted_names = " ".join('-normal {} '.format(name) for name in names) + shell( + "gatk Mutect2 " + "--java-options {java_opts} " + "-R {input.ref} " + "{normals} " + "{tumors} " + "{params.extra} " + "{formatted_names} " + "{params.pcr} " + "--f1r2-tar-gz {output.orientation} " + "{params.pon} " + "-O {output.vcf} 2> {log}" + ) + + +rule lrom: + """ + Run LearnReadOrientationModel to get the maximum likelihood estimates of artifact prior probabilities + in the orientation bias mixture model filter + """ + resources: + disk_mb = gb(20), + mem_mb = gb(32), + # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + runtime = autobump(120) + input: + orientation=lambda w: expand('results/lrom/{contig}_{{comp}}.tar.gz', contig=get_contigs()) + output: + lrom='results/lrom/artifact-prior-{comp}.tar.gz' + log: + 'logs/lrom_{comp}.log' + run: 
+
+rule lrom:
+    """
+    Run LearnReadOrientationModel to get the maximum likelihood estimates of artifact prior probabilities
+    in the orientation bias mixture model filter
+    """
+    resources:
+        disk_mb = gb(20),
+        mem_mb = gb(32),
+        # mem_mb = gb(4), # [ TEST SETTINGS -1 ]
+        runtime = autobump(120)
+    input:
+        orientation=lambda w: expand('results/lrom/{contig}_{{comp}}.tar.gz', contig=get_contigs())
+    output:
+        lrom='results/lrom/artifact-prior-{comp}.tar.gz'
+    log:
+        'logs/lrom_{comp}.log'
+    run:
+        java_opts = set_java_opts(resources)
+        lrom_names = ' '.join('-I {} '.format(i) for i in input.orientation)
+        shell(
+            'gatk --java-options {java_opts} LearnReadOrientationModel {lrom_names} '
+            '-O {output.lrom} &> {log}'
+        )
+
+
+rule merge_mutect2_variants:
+    """
+    After individual contigs are called via mutect2, we merge them together here.
+    """
+    resources:
+        disk_mb = gb(20),
+        mem_mb = gb(32),
+        # mem_mb = gb(4), # [ TEST SETTINGS -1 ]
+        runtime = autobump(120)
+    input:
+        vcfs=lambda w: expand(
+            "results/mutect2_called/raw.{{comp}}.{contig}.vcf.gz", contig=get_contigs()
+        ),
+    output:
+        temp("results/somatic/merged.{comp}.vcf.gz")
+    log:
+        "logs/merge_mutect2.{comp}.log",
+    run:
+        inputs = " ".join("-INPUT {}".format(f) for f in input.vcfs)
+        java_opts = set_java_opts(resources)
+        shell(
+            'picard'
+            ' MergeVcfs'
+            ' {java_opts}'
+            ' {inputs}'
+            ' -OUTPUT {output}'
+            ' &> {log}'
+        )
+
+
+rule merge_mutect2_stats:
+    """
+    Just like merging VCFs for Mutect2, we also need to merge stats for filtering.
+    """
+    resources:
+        disk_mb = gb(20),
+        mem_mb = gb(16),
+        # mem_mb = gb(4), # [ TEST SETTINGS -1 ]
+        runtime = autobump(120)
+    input:
+        stats=lambda w: expand(
+            "results/mutect2_called/raw.{{comp}}.{contig}.vcf.gz.stats", contig=get_contigs()
+        ),
+    output:
+        temp("results/somatic/merged.{comp}.vcf.gz.stats")
+    log:
+        "logs/merge_mutect2_stats.{comp}.log"
+    run:
+        java_opts = set_java_opts(resources)
+        inputs = " ".join(" -stats {} ".format(f) for f in input.stats)
+        shell(
+            "gatk --java-options {java_opts} MergeMutectStats "
+            "{inputs} "
+            "-O {output} "
+            "&> {log}"
+        )
+
+
+rule filter_mutect2_calls:
+    """
+    New versions of Mutect2 have optimized defaults for filtering; we can just use those.
+    """
+    resources:
+        disk_mb = gb(20),
+        mem_mb = gb(16),
+        # mem_mb = gb(4), # [ TEST SETTINGS -1 ]
+        runtime = autobump(120)
+    input:
+        ref=reference,
+        unfiltered="results/somatic/merged.{comp}.vcf.gz",
+        stats="results/somatic/merged.{comp}.vcf.gz.stats",
+        lrom='results/lrom/artifact-prior-{comp}.tar.gz'
+    output:
+        "results/somatic_filtered/filtered.{comp}.vcf.gz"
+    log:
+        "logs/{comp}.vcf.gz.log"
+    run:
+        java_opts = set_java_opts(resources)
+        shell(
+            "gatk --java-options {java_opts} FilterMutectCalls "
+            "-stats {input.stats} "
+            "--orientation-bias-artifact-priors {input.lrom} "
+            "-R {input.ref} "
+            "-V {input.unfiltered} "
+            "-O {output} 2> {log}"
+        )
+
+
+rule mutect2_norm:
+    """
+    Split multiallelic variants into multiple biallelic ones. 
+    
+ """ + resources: + mem_mb = gb(16), + runtime = autobump(120) + input: + ref=reference, + vcf="results/somatic_filtered/filtered.{comp}.vcf.gz" + output: + "results/somatic_filtered/normed.{comp}.vcf.gz" + log: + "logs/norm-{comp}-vcf.log" + shell: + "bcftools norm -f {input.ref} " + "-m- " + "{input.vcf} " + "--output-type z " + "--output {output} 2> {log}" + + +rule snpeff_cancer: + """ + Annotate somatic variants with SnpEff Cancer + """ + resources: + disk_mb=gb(20), + mem_mb=gb(16), + # mem_mb = gb(5.5) # [ TEST SETTINGS -1 ] + runtime=autobump(120) + input: + vcf='results/somatic_filtered/normed.{comp}.vcf.gz', + output: + vcf='results/mutect2_annotated/snpeff.{comp}.vcf.gz', + stats='results/qc/snpEff_{comp}_summary.csv', + html='results/qc/snpEff_{comp}.html' + log: + 'logs/cancer_snpeff_{comp}.log' + params: + snpeff_genome = config['snpeff']['genome'] + run: + java_opts = '''"-Xmx{}g"'''.format(int(resources.mem_mb * 0.75 /1024)) + shell( + 'snpEff {java_opts} -v -o vcf -cancer -csvStats {output.stats} \ + -stats {output.html} {params.snpeff_genome} {input.vcf} | bcftools view -Oz > {output.vcf}' + ) + + +# vim: ft=python diff --git a/workflows/variant-calling/config/config.yaml b/workflows/variant-calling/config/config.yaml new file mode 100644 index 000000000..4b3e71bd2 --- /dev/null +++ b/workflows/variant-calling/config/config.yaml @@ -0,0 +1,74 @@ +samples: config/samples.tsv +units: config/units.tsv +ref: + # Set to true only if you want your references to come from lcdb-wf references + use_references_workflow: false + # Match these to the reference config that is included at the bottom of this file + # Only configure this section if use_references_workflow is set to true + organism: 'human' + genome: + tag: 'ensembl-104' + build: 'GRCh38' + aligner: + index: 'bwa' + tag: 'ensembl-104' + faidx: + index: 'faidx' + tag: 'ensembl-104' + variation: + known: 'known' + dbnsfp: 'dbNSFPv4.4' + # If you are providing your own references, include their paths here. + paths: + # When using BWA, you should not use a top level genome assembly for human, see http://lh3.github.io/2017/11/13/which-human-reference-genome-to-use + ref: 'references/GRCh38.6.20.fa.gz' + known: 'references/known_variation_noiupac.vcf.gz' + index: + dbnsfp: 'references/dbnsfp_6_20.vcf.gz' +processing: + remove-duplicates: true + remove-mitochondrial: true + # See https://gatk.broadinstitute.org/hc/en-us/articles/360036465912-HaplotypeCaller#--pcr-indel-model for pcr + pcr: "NONE" + # Point to a bed file, e.g. captured regions + restrict-regions: 'references/exons_subset.bed' + # If regions are restricted, optionally enlarge them by a given value + region-padding: +filtering: + # Set to true in order to apply machine learning based recalibration of + # quality scores instead of hard filtering. 
+  bqsr: true
+  hard:
+    # hard filtering as outlined in GATK docs
+    # (https://gatkforums.broadinstitute.org/gatk/discussion/2806/howto-apply-hard-filters-to-a-call-set)
+    snvs:
+      "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0"
+    indels:
+      "QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0"
+snpeff:
+  # The MultiQC rule needs these set in order to collect summary files from the
+  # snpeff and snpeff_cancer rules. Set each to true if you plan on generating
+  # annotations for somatic or germline data, respectively.
+  somatic: true
+  germline: true
+  # Run `snpEff databases` to see the available databases (https://pcingola.github.io/SnpEff/se_commandline/)
+  # See https://pcingola.github.io/SnpEff/se_build_db/ for docs on building your own database
+  genome: 'GRCh38.p14'
+  # Comma-separated string of dbNSFP column names for SnpSift to attach as annotations.
+  # These annotations should be column names in the dbNSFP file.
+  # Leave this blank if you are not using dbNSFP.
+  annotations: 'FATHMM_pred,SIFT_pred'
+# Supply a panel of normals file if you have one for your genome. It is OK to leave this blank.
+# See https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON- for details on the PON file.
+PON:
+mutect2:
+  # See the docs on how to configure this section
+  tumor-normal:
+    tumor:
+      - 'tumor'
+    normal:
+      - 'normal'
+
+
+
+include_references:
+  - '../../include/reference_configs/variant-calling.yaml'
diff --git a/workflows/variant-calling/config/samples.tsv b/workflows/variant-calling/config/samples.tsv
new file mode 100644
index 000000000..21c6b82e8
--- /dev/null
+++ b/workflows/variant-calling/config/samples.tsv
@@ -0,0 +1,3 @@
+sample
+tumor
+normal
diff --git a/workflows/variant-calling/config/units.tsv b/workflows/variant-calling/config/units.tsv
new file mode 100644
index 000000000..3d6d261d8
--- /dev/null
+++ b/workflows/variant-calling/config/units.tsv
@@ -0,0 +1,4 @@
+sample	unit	platform	fq1	fq2
+tumor	1	Illumina	data/example_data/tumor_R1.fq.gz	data/example_data/tumor_R2.fq.gz
+tumor	2	Illumina	data/example_data/test_unit_R1.fq.gz	data/example_data/test_unit_R2.fq.gz
+normal	1	Illumina	data/example_data/normal_R1.fq.gz	data/example_data/normal_R2.fq.gz

From a34c9d3f81fe427bdf4a9d1e54b8a33493ea446e Mon Sep 17 00:00:00 2001
From: fridellsa
Date: Thu, 1 Jun 2023 18:18:11 -0400
Subject: [PATCH 02/32] initial testing infrastructure for VC

---
 .circleci/config.yml                  | 27 ++++++++
 test/lcdb-wf-test                     | 97 +++++++++++++++++++++++++--
 workflows/variant-calling/run_test.sh |  3 +
 3 files changed, 123 insertions(+), 4 deletions(-)
 create mode 100755 workflows/variant-calling/run_test.sh

diff --git a/.circleci/config.yml b/.circleci/config.yml
index e0e743334..667077b22 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -289,6 +289,22 @@ variables:
         conda activate $LCDBWF_ENV
         $DEPLOY/test/lcdb-wf-test colocalization --run-workflow -k -r -p -j2 --use-conda --orig $ORIG
+
+
+  # --------------------------------------------------------------------------
+  # REVIEW: not RNA-seq
+  # Standard RNA-seq workflow
+  variantcalling-step: &variantcalling-step
+    run:
+      name: variantcalling workflow
+      command: |
+        cd $DEPLOY
+        source /opt/mambaforge/etc/profile.d/conda.sh
+        conda activate $LCDBWF_ENV
+        $DEPLOY/test/lcdb-wf-test variantcalling --run-workflow -n
+        $DEPLOY/test/lcdb-wf-test variantcalling --run-workflow --use-conda -j2
+
+        tar -zcf variantcalling.tar.gz workflows/variant-calling/results
+
  # 
-------------------------------------------------------------------------- # Syntax note: All of the steps above, with their "&step-name" labels, can be # referred to by a corresponding "*step-name" below. The "<<: *defaults" @@ -387,6 +403,17 @@ jobs: destination: gene-patterns.html + variantcalling: + <<: *defaults + steps: + - checkout + - *restore_cache + - *set-path + - *get-data + - *variantcalling-step + - store_artifacts: + path: /tmp/lcdb-wf-test/workflows/variant-calling/variantcalling.tar.gz + rnaseq-misc: <<: *defaults steps: diff --git a/test/lcdb-wf-test b/test/lcdb-wf-test index 7f92cb2b4..f215433d5 100755 --- a/test/lcdb-wf-test +++ b/test/lcdb-wf-test @@ -67,6 +67,8 @@ class Runner(object): %(prog)s rnaseq --downstream %(prog)s chipseq --run-workflow %(prog)s references --run-workflow --configfile=config/config.yaml + %(prog)s variantcalling --run-workflow + DATA ---- @@ -130,7 +132,7 @@ class Runner(object): parser.add_argument( "--kind", default="all", - choices=["all", "rnaseq", "chipseq"], + choices=["all", "rnaseq", "chipseq", "variantcalling"], help="Kind of data to download", ) parser.add_argument( @@ -144,8 +146,22 @@ class Runner(object): args = parser.parse_args(sys.argv[2:]) - repo = "lcdb-test-data" - URL = f"https://github.com/lcdb/{repo}/blob/{args.branch}/data/{{}}?raw=true" + # Create a repo lookup for the different assays + # For variantcalling, the `args.branch` should be "main" instead of "master", unless we can fix this + repo_lookup = { + 'rnaseq': { + 'repo': "lcdb-test-data", + 'URL': f"https://github.com/lcdb/{{repo}}/blob/{args.branch}/data/{{}}?raw=true" + }, + 'chipseq': { + 'repo': "lcdb-test-data", + 'URL': f"https://github.com/lcdb/{{repo}}/blob/{args.branch}/data/{{}}?raw=true" + }, + 'variantcalling': { + 'repo': 'lcdb-wf-variant-calling-test-data', + 'URL': f"https://github.com/lcdb/{{repo}}/blob/{args.branch}/data/{{}}?raw=true" + } + } # This dict maps files in the `data` directory of the repo to a local # path to which it should be downloaded. @@ -214,6 +230,41 @@ class Runner(object): "workflows/chipseq/data/example_data/chipseq_ip4.fq.gz", ), ], + "variantcalling": [ + ( + "GRCh38.6.20.fa.gz", + "workflows/variant-calling/references/GRCh38.6.20.fa.gz", + ), + ( + "known_variation_noiupac.vcf.gz", + "workflows/variant-calling/references/known_variation_noiupac.vcf.gz" + + ), + ( + "normal_R1.6.20.fq.gz", + "workflows/variant-calling/data/example_data/normal_R1.fq.gz" + ), + ( + "normal_R2.6.20.fq.gz", + "workflows/variant-calling/data/example_data/normal_R2.fq.gz" + ), + ( + "tumor_R1.6.20.fq.gz", + "workflows/variant-calling/data/example_data/tumor_R1.fq.gz" + ), + ( + "tumor_R2.6.20.fq.gz", + "workflows/variant-calling/data/example_data/tumor_R2.fq.gz" + ), + ( + "dbnsfp_6_20.vcf.gz", + "workflows/variant-calling/references/dbnsfp_6_20.vcf.gz" + ), + ( + "dbnsfp_6_20.vcf.gz.tbi", + "workflows/variant-calling/references/dbnsfp_6_20.vcf.gz.tbi" + ), + ] } if args.kind == "all": @@ -222,7 +273,7 @@ class Runner(object): kinds = [args.kind] for kind in kinds: for fn, dest in data_files[kind]: - url = URL.format(fn) + url = repo_lookup[kind]['URL'].format(fn, repo=repo_lookup[kind]['repo']) if args.verbose: print(f"downloading {url}") if dest is None: @@ -504,6 +555,44 @@ class Runner(object): executable="/bin/bash" ) + def _cmd_variantcalling(self): + """ + This function handles the "variantcalling" subcommand. 
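+
+        Example invocations (the same ones exercised by the CircleCI
+        variantcalling step):
+
+            ./test/lcdb-wf-test variantcalling --run-workflow -n
+            ./test/lcdb-wf-test variantcalling --run-workflow --use-conda -j2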
+        """
+
+        parser = argparse.ArgumentParser(
+            description="Run variant calling workflow and downstream tests",
+            parents=[self.global_parser],
+        )
+        parser.add_argument(
+            "--run-workflow",
+            action="store_true",
+            help="""Run variant calling workflow using run_test.sh, which runs
+            ci/preprocessor.py on the Snakefile, converting it to a test file to be
+            run.""",
+        )
+
+
+        workflow_prefix = "bash run_test.sh"
+        workflow_dir = TOPLEVEL / "workflows/variant-calling"
+        args, extra = parser.parse_known_args(sys.argv[2:])
+
+        if args.run_workflow:
+            print(args)
+            extra = [i.replace("__ORIG__", args.orig) for i in extra]
+            strargs = " ".join(extra)
+            cmd = (
+                'eval "$(conda shell.bash hook)" '
+                f"&& conda activate {args.env} "
+                f"&& (cd {workflow_dir} && {workflow_prefix} {strargs})"
+            )
+            print_header(f"Running the following command:\n{cmd}")
+            sp.run(
+                cmd,
+                check=True,
+                shell=True,
+                executable="/bin/bash"
+            )
+

 if __name__ == "__main__":
     Runner()
diff --git a/workflows/variant-calling/run_test.sh b/workflows/variant-calling/run_test.sh
new file mode 100755
index 000000000..7aacb413c
--- /dev/null
+++ b/workflows/variant-calling/run_test.sh
@@ -0,0 +1,3 @@
+set -e
+python -m doctest ../../ci/preprocessor.py
+python ../../ci/preprocessor.py Snakefile > Snakefile.test && snakemake -s Snakefile.test "$@"

From 3fa096ca2a05ba324d380bb4b5db584d11bd69fe Mon Sep 17 00:00:00 2001
From: fridellsa
Date: Thu, 1 Jun 2023 18:35:13 -0400
Subject: [PATCH 03/32] 'typo'

---
 workflows/variant-calling/Snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflows/variant-calling/Snakefile b/workflows/variant-calling/Snakefile
index 2496fcfe7..a2d5b04c6 100644
--- a/workflows/variant-calling/Snakefile
+++ b/workflows/variant-calling/Snakefile
@@ -652,7 +652,7 @@ rule snpeff:
     resources:
         disk_mb=gb(20),
         mem_mb=gb(16),
-        # mem_mb = gb(5.5) # [ TEST SETTINGS -1 ]
+        # mem_mb = gb(5.5), # [ TEST SETTINGS -1 ]
         runtime=autobump(120)
     input:
         vcf='results/filtered/all.normed.vcf.gz',
@@ -888,7 +888,7 @@ rule snpeff_cancer:
     resources:
         disk_mb=gb(20),
         mem_mb=gb(16),
-        # mem_mb = gb(5.5) # [ TEST SETTINGS -1 ]
+        # mem_mb = gb(5.5), # [ TEST SETTINGS -1 ]
         runtime=autobump(120)
     input:
         vcf='results/somatic_filtered/normed.{comp}.vcf.gz',

From 3f23aa07b44bcae7f918516207fdff34ccfd939f Mon Sep 17 00:00:00 2001
From: fridellsa
Date: Thu, 1 Jun 2023 18:36:40 -0400
Subject: [PATCH 04/32] 'typo in test setting'

---
 workflows/variant-calling/Snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflows/variant-calling/Snakefile b/workflows/variant-calling/Snakefile
index a2d5b04c6..e608dc7a8 100644
--- a/workflows/variant-calling/Snakefile
+++ b/workflows/variant-calling/Snakefile
@@ -652,7 +652,7 @@ rule snpeff:
     resources:
         disk_mb=gb(20),
         mem_mb=gb(16),
-        # mem_mb = gb(5.5), # [ TEST SETTINGS -1 ]
+        # mem_mb = gb(5), # [ TEST SETTINGS -1 ]
         runtime=autobump(120)
     input:
         vcf='results/filtered/all.normed.vcf.gz',
@@ -888,7 +888,7 @@ rule snpeff_cancer:
     resources:
         disk_mb=gb(20),
         mem_mb=gb(16),
-        # mem_mb = gb(5.5), # [ TEST SETTINGS -1 ]
+        # mem_mb = gb(5), # [ TEST SETTINGS -1 ]
         runtime=autobump(120)
     input:
         vcf='results/somatic_filtered/normed.{comp}.vcf.gz',

From 880a8a084005d41aa13d4bfc36acf01b87899ffb Mon Sep 17 00:00:00 2001
From: fridellsa
Date: Thu, 1 Jun 2023 20:07:35 -0400
Subject: [PATCH 05/32] removed bed from config

---
 workflows/variant-calling/config/config.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/workflows/variant-calling/config/config.yaml 
b/workflows/variant-calling/config/config.yaml index 4b3e71bd2..50505c627 100644 --- a/workflows/variant-calling/config/config.yaml +++ b/workflows/variant-calling/config/config.yaml @@ -31,7 +31,8 @@ processing: # See https://gatk.broadinstitute.org/hc/en-us/articles/360036465912-HaplotypeCaller#--pcr-indel-model for pcr pcr: "NONE" # Point to a bed file, e.g. captured regions - restrict-regions: 'references/exons_subset.bed' + restrict-regions: + #'references/exons_subset.bed' # If regions are restricted, optionally enlarge them by a given value region-padding: filtering: From e0793b505905f6713c3b1d1be0d94924ec4191f6 Mon Sep 17 00:00:00 2001 From: fridellsa Date: Thu, 1 Jun 2023 20:25:16 -0400 Subject: [PATCH 06/32] fixed typo in helpers checking chr nomenclature --- lib/helpers.smk | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/helpers.smk b/lib/helpers.smk index 63e717932..4188b0654 100644 --- a/lib/helpers.smk +++ b/lib/helpers.smk @@ -324,9 +324,11 @@ def get_bed_nomenclature(input): nom = False with open(input.bed, 'r') as f: for line in f: - if line.startswith('browser') or line.startswith('track'): + if line.startswith('browser'): continue - if f.startswith('chr'): + if line.startswith('track'): + continue + if line.startswith('chr'): nom = True break return nom From 5bb3726ec16802dc588cd4d8d0221c66c316f404 Mon Sep 17 00:00:00 2001 From: fridellsa Date: Thu, 1 Jun 2023 20:29:00 -0400 Subject: [PATCH 07/32] commenting out dbnsfp rule --- workflows/references/Snakefile | 114 ++++++++++++++++----------------- 1 file changed, 57 insertions(+), 57 deletions(-) diff --git a/workflows/references/Snakefile b/workflows/references/Snakefile index c1438675d..35583251c 100644 --- a/workflows/references/Snakefile +++ b/workflows/references/Snakefile @@ -487,62 +487,62 @@ rule known_variation: "tabix -p vcf {output} 2> {log} " ) -if config['references']['human'].get('variation'): - rule dbnsfp: - """ - Download and process dbNSFP database. This involves downloading and - extracting the zip file, then combining the chromosomes to create - a single file. For genome builds like hg19 and GRCh37, some processing - needs to be done to make them compatible with dbNSFP version > 3.X - dbNSFP is only for human genomes. - """ - output: - protected( - '{{references_dir}}/{{organism}}/{{tag}}/{dbnsfp_version}_{build}/{{organism}}_{{tag}}.vcf.gz'.format( - dbnsfp_version=config['references']['human']['variation']['dbnsfp']['version'], - build=config['references']['human']['variation']['dbnsfp']['build'] - ) - ) - log: - '{{references_dir}}/{{organism}}/{{tag}}/{dbnsfp_version}_{build}/{{organism}}_{{tag}}.log'.format( - dbnsfp_version=config['references']['human']['variation']['dbnsfp']['version'], - build=config['references']['human']['variation']['dbnsfp']['build'] - ) - resources: - disk_mb=gb(500), - mem_mb=gb(500), - runtime=hours(8) - threads: 16 - run: - version = config['references']['human'][wildcards.tag]['dbnsfp']['version'] - URL = config['references']['human'][wildcards.tag]['dbnsfp']['url'] - build = config['references']['human'][wildcards.tag]['dbnsfp']['build'] - workdir = wildcards.references_dir - if build == 'GRCh37': - # We need to process the dbNSFP file to make it compatible with older genomes - with tempfile.TemporaryDirectory() as tmpdir: - shell( - '''(cd {tmpdir}; wget -O- {URL} > dbnsfp.zip && ''' - '''unzip dbnsfp.zip && zcat dbNSFP*_variant.chr1* | awk "NR<=1" > h && ''' - '''zgrep -v "^#" dbNSFP*_variant.chr* > all_chrs && ''' - '''awk '$8 != "." 
' all_chrs > all_chrs_filtered && ''' - '''sort -S 50% --parallel=12 all_chrs_filtered -k8,8 -k9,9n > all_chrs_filtered_sorted && ''' - '''cat h all_chrs_filtered_sorted > all_chrs_filtered_sorted_header && ''' - '''bgzip -c all_chrs_filtered_sorted_header > {output}) && ''' - '''tabix -s 1 -b 2 -e 2 {output} ''' - ) - if build == 'GRCh38': - with tempfile.TemporaryDirectory() as tmpdir: - # No need for processing and we can use the first 2 columns for coordinates - shell( - '''(cd {tmpdir}; wget -O- {URL} > dbnsfp.zip && ''' - '''unzip dbnsfp.zip && zcat dbNSFP*_variant.chr1* | awk "NR<=1" > h && ''' - '''zgrep -v "^#" dbNSFP*_variant.chr* > all_chrs && ''' - '''sort -S 50% --parallel=24 all_chrs -k1,1 -k2,2n > all_chrs_sorted && ''' - '''cat h all_chrs_sorted > all_chrs_sorted_header && ''' - '''bgzip -c all_chrs_sorted_header > {output}) && ''' - '''tabix -s 1 -b 2 -e 2 {output} ''' - ) - +#if config['references']['human'].get('variation'): +# rule dbnsfp: +# """ +# Download and process dbNSFP database. This involves downloading and +# extracting the zip file, then combining the chromosomes to create +# a single file. For genome builds like hg19 and GRCh37, some processing +# needs to be done to make them compatible with dbNSFP version > 3.X +# dbNSFP is only for human genomes. +# """ +# output: +# protected( +# '{{references_dir}}/{{organism}}/{{tag}}/{dbnsfp_version}_{build}/{{organism}}_{{tag}}.vcf.gz'.format( +# dbnsfp_version=config['references']['human']['variation']['dbnsfp']['version'], +# build=config['references']['human']['variation']['dbnsfp']['build'] +# ) +# ) +# log: +# '{{references_dir}}/{{organism}}/{{tag}}/{dbnsfp_version}_{build}/{{organism}}_{{tag}}.log'.format( +# dbnsfp_version=config['references']['human']['variation']['dbnsfp']['version'], +# build=config['references']['human']['variation']['dbnsfp']['build'] +# ) +# resources: +# disk_mb=gb(500), +# mem_mb=gb(500), +# runtime=hours(8) +# threads: 16 +# run: +# version = config['references']['human'][wildcards.tag]['dbnsfp']['version'] +# URL = config['references']['human'][wildcards.tag]['dbnsfp']['url'] +# build = config['references']['human'][wildcards.tag]['dbnsfp']['build'] +# workdir = wildcards.references_dir +# if build == 'GRCh37': +# # We need to process the dbNSFP file to make it compatible with older genomes +# with tempfile.TemporaryDirectory() as tmpdir: +# shell( +# '''(cd {tmpdir}; wget -O- {URL} > dbnsfp.zip && ''' +# '''unzip dbnsfp.zip && zcat dbNSFP*_variant.chr1* | awk "NR<=1" > h && ''' +# '''zgrep -v "^#" dbNSFP*_variant.chr* > all_chrs && ''' +# '''awk '$8 != "." 
' all_chrs > all_chrs_filtered && ''' +# '''sort -S 50% --parallel=12 all_chrs_filtered -k8,8 -k9,9n > all_chrs_filtered_sorted && ''' +# '''cat h all_chrs_filtered_sorted > all_chrs_filtered_sorted_header && ''' +# '''bgzip -c all_chrs_filtered_sorted_header > {output}) && ''' +# '''tabix -s 1 -b 2 -e 2 {output} ''' +# ) +# if build == 'GRCh38': +# with tempfile.TemporaryDirectory() as tmpdir: +# # No need for processing and we can use the first 2 columns for coordinates +# shell( +# '''(cd {tmpdir}; wget -O- {URL} > dbnsfp.zip && ''' +# '''unzip dbnsfp.zip && zcat dbNSFP*_variant.chr1* | awk "NR<=1" > h && ''' +# '''zgrep -v "^#" dbNSFP*_variant.chr* > all_chrs && ''' +# '''sort -S 50% --parallel=24 all_chrs -k1,1 -k2,2n > all_chrs_sorted && ''' +# '''cat h all_chrs_sorted > all_chrs_sorted_header && ''' +# '''bgzip -c all_chrs_sorted_header > {output}) && ''' +# '''tabix -s 1 -b 2 -e 2 {output} ''' +# ) +# # vim: ft=python From a1f69ed9c1836d6d6f5bb73ea633b5d4533d17ae Mon Sep 17 00:00:00 2001 From: daler Date: Fri, 2 Jun 2023 10:16:37 -0400 Subject: [PATCH 08/32] disable references test for now --- .circleci/config.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 667077b22..47fe05359 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -513,10 +513,10 @@ workflows: requires: - initial-setup - pytest - - references: - requires: - - initial-setup - - pytest + # - references: + # requires: + # - initial-setup + # - pytest - colocalization: requires: - initial-setup From ac1d3dd5c1a50c1b6e66a9b23b84e7430028379b Mon Sep 17 00:00:00 2001 From: daler Date: Fri, 2 Jun 2023 10:32:15 -0400 Subject: [PATCH 09/32] temporarily disable all but initial setup & variant calling tests --- .circleci/config.yml | 65 +++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 47fe05359..389b43ea1 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -497,38 +497,41 @@ workflows: - pytest: requires: - initial-setup - - chipseq: - requires: - - initial-setup - - pytest - - chipseq-misc: - requires: - - initial-setup - - pytest - - rnaseq: - requires: - - initial-setup - - pytest - - rnaseq-misc: - requires: - - initial-setup - - pytest + # - chipseq: + # requires: + # - initial-setup + # - pytest + # - chipseq-misc: + # requires: + # - initial-setup + # - pytest + # - rnaseq: + # requires: + # - initial-setup + # - pytest + # - rnaseq-misc: + # requires: + # - initial-setup + # - pytest # - references: + # requires: + # - initial-setup + # - pytest + # - colocalization: # requires: # - initial-setup # - pytest - - colocalization: - requires: - - initial-setup - - pytest - - build-docs: - requires: - - initial-setup - - report-env: - requires: - - rnaseq - - rnaseq-misc - - chipseq - - chipseq-misc - - references - - colocalization + # - build-docs: + # requires: + # - initial-setup + # - report-env: + # requires: + # - rnaseq + # - rnaseq-misc + # - chipseq + # - chipseq-misc + # - references + # - colocalization + - variantcalling: + requires: + - initial-setup From 9a8d8effbf201bd9dee09a50d687cc7a5cbe2628 Mon Sep 17 00:00:00 2001 From: daler Date: Fri, 2 Jun 2023 10:33:19 -0400 Subject: [PATCH 10/32] fix circleci yaml config syntax --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 389b43ea1..906a7bd00 100644 --- 
a/.circleci/config.yml +++ b/.circleci/config.yml @@ -533,5 +533,5 @@ workflows: # - references # - colocalization - variantcalling: - requires: - - initial-setup + requires: + - initial-setup From 4d4a261a6a25c32f1fd106d27bb9bc7060cb9881 Mon Sep 17 00:00:00 2001 From: daler Date: Fri, 2 Jun 2023 10:44:30 -0400 Subject: [PATCH 11/32] more syntax fixes --- .circleci/config.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 906a7bd00..064323883 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -412,7 +412,7 @@ jobs: - *get-data - *variantcalling-step - store_artifacts: - path: /tmp/lcdb-wf-test/workflows/variant-calling/variantcalling.tar.gz + path: /tmp/lcdb-wf-test/workflows/variant-calling/variantcalling.tar.gz rnaseq-misc: <<: *defaults @@ -494,9 +494,9 @@ workflows: test-suite: jobs: - initial-setup - - pytest: - requires: - - initial-setup + # - pytest: + # requires: + # - initial-setup # - chipseq: # requires: # - initial-setup From 26aaf19b72df14742abfd724a693e51e61b7e4dc Mon Sep 17 00:00:00 2001 From: daler Date: Fri, 2 Jun 2023 10:51:04 -0400 Subject: [PATCH 12/32] ensure run_test.sh is copied over to deploy location for variant-calling --- .circleci/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 064323883..82bf4e068 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -139,6 +139,7 @@ variables: cp workflows/rnaseq/run_downstream_test.sh $DEPLOY/workflows/rnaseq cp workflows/colocalization/run_test.sh $DEPLOY/workflows/references cp workflows/colocalization/run_test.sh $DEPLOY/workflows/colocalization + cp workflows/variant-calling/run_test.sh $DEPLOY/workflows/variant-calling mkdir $DEPLOY/ci mkdir $DEPLOY/test cp test/lcdb-wf-test $DEPLOY/test From 25e6a611d4d850dea6e49eb008ada2cd94a9ac96 Mon Sep 17 00:00:00 2001 From: daler Date: Fri, 2 Jun 2023 10:55:55 -0400 Subject: [PATCH 13/32] fix comments in ci config --- .circleci/config.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 82bf4e068..68e2ed78a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -292,8 +292,7 @@ variables: # -------------------------------------------------------------------------- - # REVIEW: not RNA-seq - # Standard RNA-seq workflow + # Variant-calling workflow variantcalling-step: &variantcalling-step run: name: variantcalling workflow From 29c3c82f0d04122aaa1661fa8d61e18ea6f6452c Mon Sep 17 00:00:00 2001 From: daler Date: Fri, 2 Jun 2023 11:04:54 -0400 Subject: [PATCH 14/32] try resolving path --- test/lcdb-wf-test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/lcdb-wf-test b/test/lcdb-wf-test index f215433d5..d7ce97dc8 100755 --- a/test/lcdb-wf-test +++ b/test/lcdb-wf-test @@ -278,7 +278,7 @@ class Runner(object): print(f"downloading {url}") if dest is None: dest = fn - dest = Path(dest) + dest = Path(dest).resolve() dest.parent.mkdir(parents=True, exist_ok=True) sp.run( f"wget -q -O- {url} > {dest}", shell=True, check=True, cwd=TOPLEVEL From 6a1da2bfe3ce43de1c5b8755894367347f5a4cfa Mon Sep 17 00:00:00 2001 From: daler Date: Fri, 2 Jun 2023 11:20:33 -0400 Subject: [PATCH 15/32] support deploying variant-calling workflow --- deploy.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/deploy.py b/deploy.py index 51b9e0b59..cdeec6d11 100755 --- a/deploy.py +++ b/deploy.py @@ -87,6 +87,10 @@ def 
write_include_file(flavor=None): 'recursive-include workflows/chipseq/config *', 'include workflows/chipseq/chipseq_trackhub.py', ], + 'variant-calling': [ + 'include workflows/variant-calling/Snakefile', + 'recursive-include workflows/variant-calling/config *', + ], 'all': [ 'recursive-include wrappers *', 'recursive-include include *', @@ -104,6 +108,7 @@ def write_include_file(flavor=None): 'recursive-include workflows/external *', ] + } patterns = [] @@ -111,6 +116,8 @@ def write_include_file(flavor=None): patterns.extend(PATTERN_DICT['rnaseq']) if flavor is None or 'chipseq': patterns.extend(PATTERN_DICT['chipseq']) + if flavor is None or 'variant-calling': + patterns.extend(PATTERN_DICT['variant-calling']) if flavor is None or 'full': patterns.extend(PATTERN_DICT['full']) patterns.extend(PATTERN_DICT['all']) @@ -324,7 +331,7 @@ def build_envs(dest, conda_frontend="mamba"): ap.add_argument( "--flavor", default="full", - help="""Options are {0}. Default is full.""".format(['full', 'rnaseq', 'chipseq']), + help="""Options are {0}. Default is full.""".format(['full', 'rnaseq', 'chipseq', 'variant-calling']), ) ap.add_argument( "--dest", help="""Destination directory in which to copy files""", required=True From 79ee0f539d91c1653725680d6bc9ba2b0aee2494 Mon Sep 17 00:00:00 2001 From: daler Date: Fri, 2 Jun 2023 12:14:46 -0400 Subject: [PATCH 16/32] remove windows line-endings and unused unit --- workflows/variant-calling/config/units.tsv | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/workflows/variant-calling/config/units.tsv b/workflows/variant-calling/config/units.tsv index 3d6d261d8..d336d4be9 100644 --- a/workflows/variant-calling/config/units.tsv +++ b/workflows/variant-calling/config/units.tsv @@ -1,4 +1,3 @@ -sample unit platform fq1 fq2 -tumor 1 Illumina data/example_data/tumor_R1.fq.gz data/example_data/tumor_R2.fq.gz -tumor 2 Illumina data/example_data/test_unit_R1.fq.gz data/example_data/test_unit_R2.fq.gz -normal 1 Illumina data/example_data/normal_R1.fq.gz data/example_data/normal_R2.fq.gz +sample unit platform fq1 fq2 +tumor 1 Illumina data/example_data/tumor_R1.fq.gz data/example_data/tumor_R2.fq.gz +normal 1 Illumina data/example_data/normal_R1.fq.gz data/example_data/normal_R2.fq.gz From 8ad4d36831d4dbf6c674819226f75f17af34360e Mon Sep 17 00:00:00 2001 From: fridellsa Date: Fri, 2 Jun 2023 13:06:55 -0400 Subject: [PATCH 17/32] 'resource formatting, threads test settings, snpeff adjustments' --- workflows/variant-calling/Snakefile | 112 +++++++++++++++++----------- 1 file changed, 67 insertions(+), 45 deletions(-) diff --git a/workflows/variant-calling/Snakefile b/workflows/variant-calling/Snakefile index e608dc7a8..4466406b6 100644 --- a/workflows/variant-calling/Snakefile +++ b/workflows/variant-calling/Snakefile @@ -57,10 +57,10 @@ checkpoint genome_index: rule fasta_dict: - threads: 2 + threads: 1 resources: mem_mb=gb(4), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] disk_mb=gb(4), runtime=autobump(60) input: @@ -158,10 +158,11 @@ rule mark_duplicates: and we don't want to mark them as temporary """ threads: 4 + # threads: 1 # [ TEST SETTINGS -1 ] resources: disk_mb=gb(40), mem_mb=gb(32), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] runtime=autobump(720) input: bam = "results/mapped/{sample}-{unit}.sorted.bam", @@ -195,9 +196,10 @@ rule mark_duplicates: if config["filtering"]['bqsr']: rule base_recalibrator: threads: 4 + # threads: 1 # [ TEST SETTINGS -1 ] resources: 
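+            # Note: lines tagged "[ TEST SETTINGS ]" are commented-out
+            # alternatives that ci/preprocessor.py (invoked by run_test.sh)
+            # appears to swap in when generating Snakefile.test, so CI runs
+            # with smaller resource requests.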
mem_mb=gb(8), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] disk_mb=gb(40), runtime=autobump(960) input: @@ -224,9 +226,10 @@ if config["filtering"]['bqsr']: rule apply_bqsr: threads: 8 + # threads: 1 # [ TEST SETTINGS -1 ] resources: mem_mb=gb(32), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] disk_mb=gb(40), runtime=autobump(960) input: @@ -322,9 +325,10 @@ rule call_variants: resources: disk_mb=gb(16), mem_mb=gb(40), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] runtime=autobump(hours=8) threads: 8 + # threads: 1 # [ TEST SETTINGS -1 ] params: extra=get_call_variants_params, pcr='--pcr-indel-model ' + config['processing']['pcr'] @@ -350,10 +354,11 @@ rule call_variants: rule combine_calls: threads: 4 + # threads: 1 # [ TEST SETTINGS -1 ] resources: disk_mb=gb(10), mem_mb=gb(4), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] runtime=autobump(720) input: ref=reference, @@ -374,10 +379,11 @@ rule combine_calls: rule genotype_variants: threads: 4 + # threads: 1 # [ TEST SETTINGS -1 ] resources: disk_mb=gb(10), mem_mb=gb(8), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] runtime=autobump(480) input: ref=reference, @@ -399,10 +405,11 @@ rule genotype_variants: rule merge_variants: threads: 4 + # threads: 1 # [ TEST SETTINGS -1 ] resources: disk_mb=gb(10), mem_mb=gb(8), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] runtime=autobump(480) input: vcfs=lambda w: expand( @@ -428,8 +435,8 @@ rule merge_variants: rule tabix_variants: threads: 2 resources: - disk_mb = gb(2), - mem_mb = gb(2), + disk_mb=gb(2), + mem_mb=gb(2), runtime=autobump(30) input: vcf="{prefix}.vcf.gz", @@ -443,10 +450,11 @@ rule tabix_variants: rule select_calls: threads: 4 + # threads: 1 # [ TEST SETTINGS -1 ] resources: disk_mb=gb(10), mem_mb=gb(4), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] runtime=autobump(480) input: ref=reference, @@ -471,10 +479,11 @@ rule select_calls: rule hard_filter_calls: threads: 4 + # threads: 1 # [ TEST SETTINGS -1 ] resources: disk_mb=gb(10), mem_mb=gb(4), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] runtime=autobump(480) input: ref=reference, @@ -504,7 +513,7 @@ rule merge_calls: resources: disk_mb=gb(10), mem_mb=gb(8), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] runtime=autobump(480) input: vcfs=expand( @@ -514,7 +523,7 @@ rule merge_calls: output: vcf=temp("results/filtered/all.final.vcf.gz"), log: - "logs/merge-filtered.log", + "logs/merge-filtered.log", run: inputs = " ".join("-INPUT {}".format(f) for f in input.vcfs) java_opts = set_java_opts(resources) @@ -533,8 +542,8 @@ rule norm: Split multiallielic variants into multiple biallelic ones. 
""" resources: - mem_mb = gb(16), - runtime = autobump(120) + mem_mb=gb(16), + runtime=autobump(120) input: ref=reference, vcf="results/filtered/all.final.vcf.gz" @@ -592,8 +601,8 @@ rule samtools_stats: Run samtools stats """ resources: - mem_mb = gb(16), - runtime = autobump(120) + mem_mb=gb(16), + runtime=autobump(120) input: get_sample_unit_bams output: @@ -652,7 +661,7 @@ rule snpeff: resources: disk_mb=gb(20), mem_mb=gb(16), - # mem_mb = gb(5), # [ TEST SETTINGS -1 ] + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] runtime=autobump(120) input: vcf='results/filtered/all.normed.vcf.gz', @@ -664,9 +673,11 @@ rule snpeff: params: annotations = config['snpeff']['annotations'], gen = config['snpeff']['genome'] + # threads: 2 # [ TEST SETTINGS ] run: java_opts = '''"-Xmx{}g"'''.format(int(resources.mem_mb * 0.75 /1024)) shell( + # 'mkdir -p $(dirname {output.ann}) && ' # [ TEST SETTINGS ] "snpEff {java_opts} " "-o vcf " "-csvStats {output.stats} " @@ -698,10 +709,10 @@ rule mutect2: Use Mutect2 to call variants on individual samples, one per contig """ resources: - disk_mb = gb(40), - mem_mb = gb(32), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] - runtime = autobump(720) + disk_mb=gb(40), + mem_mb=gb(32), + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + runtime=autobump(720) input: unpack(input_for_somatic) output: @@ -718,6 +729,7 @@ rule mutect2: ), extra=get_call_variants_params, pcr='--pcr-indel-model ' + config['processing']['pcr'] + # threads: 1 # [ TEST SETTINGS ] run: java_opts = set_java_opts(resources) normals = " ".join("-I {} ".format(n) for n in input.normals) @@ -745,16 +757,17 @@ rule lrom: in the orientation bias mixture model filter """ resources: - disk_mb = gb(20), - mem_mb = gb(32), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] - runtime = autobump(120) + disk_mb=gb(20), + mem_mb=gb(32), + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + runtime=autobump(120) input: orientation=lambda w: expand('results/lrom/{contig}_{{comp}}.tar.gz', contig=get_contigs()) output: lrom='results/lrom/artifact-prior-{comp}.tar.gz' log: 'logs/lrom_{comp}.log' + # threads: 1 # [ TEST SETTINGS ] run: java_opts = set_java_opts(resources) def get_format_lrom(): @@ -774,10 +787,10 @@ rule merge_mutect2_variants: After individual contigs are called via mutect2, we merge them together here. """ resources: - disk_mb = gb(20), - mem_mb = gb(32), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] - runtime = autobump(120) + disk_mb=gb(20), + mem_mb=gb(32), + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + runtime=autobump(120) input: vcfs=lambda w: expand( "results/mutect2_called/raw.{{comp}}.{contig}.vcf.gz", contig=get_contigs() @@ -786,6 +799,7 @@ rule merge_mutect2_variants: temp("results/somatic/merged.{comp}.vcf.gz") log: "logs/merge_mutect2.{comp}.log", + # threads: 1 # [ TEST SETTINGS ] run: inputs = " ".join("-INPUT {}".format(f) for f in input.vcfs) java_opts = set_java_opts(resources) @@ -804,10 +818,10 @@ rule merge_mutect2_stats: Just like merging VCFs for Mutect2, we also need to merge stats for filtering. 
""" resources: - disk_mb = gb(20), - mem_mb = gb(16), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] - runtime = autobump(120) + disk_mb=gb(20), + mem_mb=gb(16), + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + runtime=autobump(120) input: stats=lambda w: expand( "results/mutect2_called/raw.{{comp}}.{contig}.vcf.gz.stats", contig=get_contigs() @@ -816,6 +830,7 @@ rule merge_mutect2_stats: temp("results/somatic/merged.{comp}.vcf.gz.stats") log: "logs/merge_mutect2_stats.{comp}.log" + # threads: 1 # [ TEST SETTINGS ] run: java_opts = set_java_opts(resources) inputs = " ".join(" -stats {} ".format(f) for f in input.stats) @@ -833,10 +848,10 @@ rule filter_mutect2_calls: New versions of Mutect2 have optimized defaults for filtering; we can just use those. """ resources: - disk_mb = gb(20), - mem_mb = gb(16), - # mem_mb = gb(4), # [ TEST SETTINGS -1 ] - runtime = autobump(120) + disk_mb=gb(20), + mem_mb=gb(16), + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] + runtime=autobump(120) input: ref=reference, unfiltered="results/somatic/merged.{comp}.vcf.gz", @@ -846,6 +861,7 @@ rule filter_mutect2_calls: "results/somatic_filtered/filtered.{comp}.vcf.gz" log: "logs/{comp}.vcf.gz.log" + # threads: 1 # [ TEST SETTINGS ] run: java_opts = set_java_opts(resources) shell( @@ -864,8 +880,8 @@ rule mutect2_norm: Split multiallielic variants into multiple biallelic ones. """ resources: - mem_mb = gb(16), - runtime = autobump(120) + mem_mb=gb(16), + runtime=autobump(120) input: ref=reference, vcf="results/somatic_filtered/filtered.{comp}.vcf.gz" @@ -888,7 +904,7 @@ rule snpeff_cancer: resources: disk_mb=gb(20), mem_mb=gb(16), - # mem_mb = gb(5), # [ TEST SETTINGS -1 ] + # mem_mb=gb(4), # [ TEST SETTINGS -1 ] runtime=autobump(120) input: vcf='results/somatic_filtered/normed.{comp}.vcf.gz', @@ -900,11 +916,17 @@ rule snpeff_cancer: 'logs/cancer_snpeff_{comp}.log' params: snpeff_genome = config['snpeff']['genome'] + # threads: 2 # [ TEST SETTINGS ] run: java_opts = '''"-Xmx{}g"'''.format(int(resources.mem_mb * 0.75 /1024)) shell( - 'snpEff {java_opts} -v -o vcf -cancer -csvStats {output.stats} \ - -stats {output.html} {params.snpeff_genome} {input.vcf} | bcftools view -Oz > {output.vcf}' + # 'mkdir -p $(dirname {output.vcf}) && ' # [ TEST SETTINGS ] + 'snpEff {java_opts} ' + '-v -o vcf -cancer ' + '-csvStats {output.stats} ' + '-stats {output.html} ' + '{params.snpeff_genome} {input.vcf} ' + '| bcftools view -Oz > {output.vcf} 2> {log}' ) From c151dde17191bbf0331c452e8dc5ff0d3f2cba3b Mon Sep 17 00:00:00 2001 From: fridellsa Date: Fri, 2 Jun 2023 14:10:49 -0400 Subject: [PATCH 18/32] add option to provide path for variation dbs when using ref wf --- lib/helpers.smk | 13 +++++++++++-- workflows/variant-calling/config/config.yaml | 3 +++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/lib/helpers.smk b/lib/helpers.smk index 4188b0654..de902d6e7 100644 --- a/lib/helpers.smk +++ b/lib/helpers.smk @@ -32,11 +32,20 @@ def preflight(): aln_index = multiext(os.path.splitext(aln)[0], ".amb", ".ann", ".bwt", ".pac", ".sa") indexed = refdict[config['ref']['organism'][config['ref']['faidx']['tag']]['faidx']] if config['ref']['variation']['dbnsfp']: - dbnsfp = refdict[config['ref']['organism']]['variation'][str(config['ref']['variation']['dbnsfp'] + '_' + config['ref']['genome']['build'])] + # The config can supply a path to a local file in the variation slots + if config['ref']['variation']['dbnsfp'].startswith('/'): + dbnsfp = config['ref']['variation']['dbnsfp'] + else: + dbnsfp = 
refdict[config['ref']['organism']]['variation'][str( + config['ref']['variation']['dbnsfp'] + '_' + config['ref']['genome']['build'] + )] else: dbnsfp = [] if config['ref']['variation']['known']: - known_sites = refdict[config['ref']['organism']][config['ref']['genome']['tag']][config['ref']['variation']['known']] + if config['ref']['variation']['known'].startswith('/'): + known_sites = config['ref']['variation']['known'] + else: + known_sites = refdict[config['ref']['organism']][config['ref']['genome']['tag']][config['ref']['variation']['known']] else: known_sites = [] else: diff --git a/workflows/variant-calling/config/config.yaml b/workflows/variant-calling/config/config.yaml index 50505c627..c59bda5f0 100644 --- a/workflows/variant-calling/config/config.yaml +++ b/workflows/variant-calling/config/config.yaml @@ -16,6 +16,9 @@ ref: index: 'faidx' tag: 'ensembl-104' variation: + # Fill these keys in with the name of the variation database that matches the value in the reference config + # Or alternatively, you can provide an ABSOLUTE path to these files locally (paths MUST start with '/') + # If this is the case, you should go edit the lcdb-wf references config to make sure these jobs are not run for no reason. known: 'known' dbnsfp: 'dbNSFPv4.4' # If you are providing your own references, include their paths here. From cd4a49bd92df9dc9fee02c730cc6574ec39ccd52 Mon Sep 17 00:00:00 2001 From: daler Date: Fri, 2 Jun 2023 14:11:23 -0400 Subject: [PATCH 19/32] create tarball artifact in the right place --- .circleci/config.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 68e2ed78a..91a68ea40 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -303,7 +303,7 @@ variables: $DEPLOY/test/lcdb-wf-test variantcalling --run-workflow -n $DEPLOY/test/lcdb-wf-test variantcalling --run-workflow --use-conda -j2 - tar -zcf variantcalling.tar.gz workflows/variant-calling/results + tar -zcf /tmp/variantcalling.tar.gz workflows/variant-calling/results # -------------------------------------------------------------------------- # Syntax note: All of the steps above, with their "&step-name" labels, can be @@ -412,7 +412,8 @@ jobs: - *get-data - *variantcalling-step - store_artifacts: - path: /tmp/lcdb-wf-test/workflows/variant-calling/variantcalling.tar.gz + path: /tmp/variantcalling.tar.gz + destination: variantcalling.tar.gz rnaseq-misc: <<: *defaults From da845917a5d65892dda61d0b123784164497f8c4 Mon Sep 17 00:00:00 2001 From: daler Date: Fri, 2 Jun 2023 14:12:12 -0400 Subject: [PATCH 20/32] re-enable all tests (except for references) --- .circleci/config.yml | 69 ++++++++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 34 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 91a68ea40..4c41a50d4 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -495,44 +495,45 @@ workflows: test-suite: jobs: - initial-setup - # - pytest: - # requires: - # - initial-setup - # - chipseq: - # requires: - # - initial-setup - # - pytest - # - chipseq-misc: - # requires: - # - initial-setup - # - pytest - # - rnaseq: - # requires: - # - initial-setup - # - pytest - # - rnaseq-misc: - # requires: - # - initial-setup - # - pytest + - pytest: + requires: + - initial-setup + - chipseq: + requires: + - initial-setup + - pytest + - chipseq-misc: + requires: + - initial-setup + - pytest + - rnaseq: + requires: + - initial-setup + - pytest + - rnaseq-misc: + requires: + - initial-setup + - 
pytest # - references: # requires: # - initial-setup # - pytest - # - colocalization: - # requires: - # - initial-setup - # - pytest - # - build-docs: - # requires: - # - initial-setup - # - report-env: - # requires: - # - rnaseq - # - rnaseq-misc - # - chipseq - # - chipseq-misc - # - references - # - colocalization + - colocalization: + requires: + - initial-setup + - pytest - variantcalling: requires: - initial-setup + - build-docs: + requires: + - initial-setup + - report-env: + requires: + - rnaseq + - rnaseq-misc + - chipseq + - chipseq-misc + # - references + - colocalization + - variantcalling From 0192e0253e947040b921d964d98bd807ec5200cb Mon Sep 17 00:00:00 2001 From: fridellsa Date: Fri, 2 Jun 2023 15:35:29 -0400 Subject: [PATCH 21/32] hide government addresses and minor fixes --- docs/toc.rst | 1 + env.yml | 2 -- workflows/references/Snakefile | 2 +- workflows/references/config/config.yaml | 2 +- workflows/variant-calling/Snakefile | 2 -- 5 files changed, 3 insertions(+), 6 deletions(-) diff --git a/docs/toc.rst b/docs/toc.rst index 1180c8cd7..0a1cbb5ff 100644 --- a/docs/toc.rst +++ b/docs/toc.rst @@ -13,6 +13,7 @@ Table of Contents rnaseq downstream-rnaseq chipseq + variant-calling integrative conda tests diff --git a/env.yml b/env.yml index ee82a16ed..5f31f28fa 100644 --- a/env.yml +++ b/env.yml @@ -1,4 +1,3 @@ -name: /gpfs/gsfs10/users/NICHD-core0/test/fridellsa/v1.10-lcdbwf-vc/lcdb-wf/env channels: - conda-forge - bioconda @@ -365,4 +364,3 @@ dependencies: - zlib=1.2.13 - zstandard=0.19.0 - zstd=1.5.2 -prefix: /gpfs/gsfs10/users/NICHD-core0/test/fridellsa/v1.10-lcdbwf-vc/lcdb-wf/env diff --git a/workflows/references/Snakefile b/workflows/references/Snakefile index 35583251c..de7328f1f 100644 --- a/workflows/references/Snakefile +++ b/workflows/references/Snakefile @@ -529,7 +529,7 @@ rule known_variation: # '''sort -S 50% --parallel=12 all_chrs_filtered -k8,8 -k9,9n > all_chrs_filtered_sorted && ''' # '''cat h all_chrs_filtered_sorted > all_chrs_filtered_sorted_header && ''' # '''bgzip -c all_chrs_filtered_sorted_header > {output}) && ''' -# '''tabix -s 1 -b 2 -e 2 {output} ''' +# '''tabix -s 8 -b 9 -e 9 {output} ''' # ) # if build == 'GRCh38': # with tempfile.TemporaryDirectory() as tmpdir: diff --git a/workflows/references/config/config.yaml b/workflows/references/config/config.yaml index 5f8f2c664..bd9bd9392 100644 --- a/workflows/references/config/config.yaml +++ b/workflows/references/config/config.yaml @@ -1,4 +1,4 @@ -references_dir: '/data/NICHD-core0/references' +references_dir: 'references' # See the reference config files in the top level of the repo, diff --git a/workflows/variant-calling/Snakefile b/workflows/variant-calling/Snakefile index 4466406b6..8b6a61c8e 100644 --- a/workflows/variant-calling/Snakefile +++ b/workflows/variant-calling/Snakefile @@ -677,7 +677,6 @@ rule snpeff: run: java_opts = '''"-Xmx{}g"'''.format(int(resources.mem_mb * 0.75 /1024)) shell( - # 'mkdir -p $(dirname {output.ann}) && ' # [ TEST SETTINGS ] "snpEff {java_opts} " "-o vcf " "-csvStats {output.stats} " @@ -920,7 +919,6 @@ rule snpeff_cancer: run: java_opts = '''"-Xmx{}g"'''.format(int(resources.mem_mb * 0.75 /1024)) shell( - # 'mkdir -p $(dirname {output.vcf}) && ' # [ TEST SETTINGS ] 'snpEff {java_opts} ' '-v -o vcf -cancer ' '-csvStats {output.stats} ' From 7a9fb721cc78fbfddf486c0cd764022810a3d921 Mon Sep 17 00:00:00 2001 From: fridellsa Date: Mon, 5 Jun 2023 11:00:01 -0400 Subject: [PATCH 22/32] Adding docs and tweaking pcr-indel-model arg in workflow --- 
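Note (illustrative; `pcr_arg` below is a made-up name, not from this commit):
the conditional `pcr` param introduced here collapses to an empty string when
config['processing']['pcr'] is unset, so --pcr-indel-model is omitted from the
gatk command line entirely. A minimal Python sketch of the pattern:

    pcr = config['processing']['pcr']                     # e.g. "NONE", or empty/None
    pcr_arg = '--pcr-indel-model ' + pcr if pcr else ''   # flag dropped when falsy
    # the rule interpolates this into the shell command, where an empty
    # string is harmless:  'gatk ... HaplotypeCaller ... {pcr_arg} ...'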
workflows/variant-calling/Snakefile | 14 +++++++++++--- workflows/variant-calling/config/config.yaml | 3 ++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/workflows/variant-calling/Snakefile b/workflows/variant-calling/Snakefile index 8b6a61c8e..f570c1f5f 100644 --- a/workflows/variant-calling/Snakefile +++ b/workflows/variant-calling/Snakefile @@ -331,8 +331,12 @@ rule call_variants: # threads: 1 # [ TEST SETTINGS -1 ] params: extra=get_call_variants_params, - pcr='--pcr-indel-model ' + config['processing']['pcr'] - run: + pcr=( + '--pcr-indel-model ' + config['processing']['pcr'] + if config['processing']['pcr'] + else '' + ) + run: java_opts = set_java_opts(resources) known = input.known if known: @@ -727,7 +731,11 @@ rule mutect2: else [] ), extra=get_call_variants_params, - pcr='--pcr-indel-model ' + config['processing']['pcr'] + pcr=( + '--pcr-indel-model ' + config['processing']['pcr'] + if config['processing']['pcr'] + else '' + ) # threads: 1 # [ TEST SETTINGS ] run: java_opts = set_java_opts(resources) diff --git a/workflows/variant-calling/config/config.yaml b/workflows/variant-calling/config/config.yaml index c59bda5f0..3b2ce5305 100644 --- a/workflows/variant-calling/config/config.yaml +++ b/workflows/variant-calling/config/config.yaml @@ -32,7 +32,8 @@ processing: remove-duplicates: true remove-mitochondrial: true # See https://gatk.broadinstitute.org/hc/en-us/articles/360036465912-HaplotypeCaller#--pcr-indel-model for pcr - pcr: "NONE" + # If you know there was no PCR used to generate your sequencing data, set this to NONE + pcr: # Point to a bed file, e.g. captured regions restrict-regions: #'references/exons_subset.bed' From 34f34869acb1811ad909c6ca17bbefe9053ef55c Mon Sep 17 00:00:00 2001 From: fridellsa Date: Mon, 5 Jun 2023 11:56:07 -0400 Subject: [PATCH 23/32] fixed syntax err --- workflows/variant-calling/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/variant-calling/Snakefile b/workflows/variant-calling/Snakefile index f570c1f5f..0cc4be94f 100644 --- a/workflows/variant-calling/Snakefile +++ b/workflows/variant-calling/Snakefile @@ -336,7 +336,7 @@ rule call_variants: if config['processing']['pcr'] else '' ) - run: + run: java_opts = set_java_opts(resources) known = input.known if known: From 3574c69ab0b48673d22c18de30b97a98c8b0183a Mon Sep 17 00:00:00 2001 From: daler Date: Tue, 13 Jun 2023 20:17:30 -0400 Subject: [PATCH 24/32] fix known_variation --- workflows/references/Snakefile | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/workflows/references/Snakefile b/workflows/references/Snakefile index de7328f1f..0bb6f00a0 100644 --- a/workflows/references/Snakefile +++ b/workflows/references/Snakefile @@ -422,7 +422,6 @@ rule known_variation: #fai='{references_dir}/{organism}/{tag}/genome/faidx/{organism}_{tag}.fai' # can't do it this way since the {tag} wildcard is not congruous between input and output fai = lambda w: checkpoints.genome_index.get(**w).output[0] - output: protected('{references_dir}/{organism}/{tag}/known/{organism}_{tag}.vcf.gz') log: @@ -476,15 +475,25 @@ rule known_variation: for ext in ["vcf.gz", "vcf.gz.csi"] ] names=[os.path.basename(url) for url in urls if url.endswith(".gz")] + + # Compose a command that downloads all vcf.gz files for all contigs in use gather = "curl {urls}".format(urls=" ".join(map("-O {}".format, urls))) + + # Absolute paths needed because we'll be downloading in a temp dir + fai = os.path.abspath(input.fai) + log = 
os.path.abspath(str(log)) with tempfile.TemporaryDirectory() as tmpdir: if input.get("fai"): shell( - "(cd {tmpdir}; {gather} && " - "bcftools concat -Oz --naive-force {names} > concat.vcf.gz && " - "bcftools reheader --fai {input.fai} concat.vcf.gz " - "> {output}) && " - "tabix -p vcf {output} 2> {log} " + "( cd {tmpdir}; " + "{gather} && " + "bcftools concat -Oz --naive-force {names} " + "> concat.vcf.gz 2> {log} )" + ) + shell( + "bcftools reheader --fai {fai} {tmpdir}/concat.vcf.gz " + "> {output} 2>> {log} && " + "tabix -p vcf {output} 2>> {log} " ) #if config['references']['human'].get('variation'): From d02110eba8de107ce2af734474fc78b48bca414d Mon Sep 17 00:00:00 2001 From: daler Date: Tue, 13 Jun 2023 20:18:09 -0400 Subject: [PATCH 25/32] re-enable references test --- .circleci/config.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 4c41a50d4..5d423b029 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -514,10 +514,10 @@ workflows: requires: - initial-setup - pytest - # - references: - # requires: - # - initial-setup - # - pytest + - references: + requires: + - initial-setup + - pytest - colocalization: requires: - initial-setup From 51b458653e1fa6a65a072296bf4ef7b07fc1a3cd Mon Sep 17 00:00:00 2001 From: daler Date: Thu, 15 Jun 2023 15:31:55 -0400 Subject: [PATCH 26/32] reorganize test config for variant calling --- .../test_configs}/variant-calling.yaml | 4 ++-- workflows/references/config/config.yaml | 8 -------- 2 files changed, 2 insertions(+), 10 deletions(-) rename {include/reference_configs => test/test_configs}/variant-calling.yaml (90%) delete mode 100644 workflows/references/config/config.yaml diff --git a/include/reference_configs/variant-calling.yaml b/test/test_configs/variant-calling.yaml similarity index 90% rename from include/reference_configs/variant-calling.yaml rename to test/test_configs/variant-calling.yaml index 6ad0f9b48..d768eb05e 100644 --- a/include/reference_configs/variant-calling.yaml +++ b/test/test_configs/variant-calling.yaml @@ -1,3 +1,4 @@ +references_dir: "references" references: human: ensembl-104: @@ -6,13 +7,12 @@ references: release: 104 species: 'homo_sapiens' genome: - url: 'ftp://ftp.ensembl.org/pub/release-104/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz' + url: 'https://github.com/lcdb/lcdb-wf-variant-calling-test-data/raw/master/data/GRCh38.6.20.fa.gz' # URL format is 'ftp://ftp.ensembl.org/pub/{branch}release-{release}/fasta/{species}/{datatype}/{species_capitalized}.{build}.{datatype}.{assembly}.{suffix}' # When using GRCh37, branch changes to "grch37/release-{release}" # always use primary_assembly for human, NEVER use top_level for assembly for human indexes: - 'bwa' - - 'faidx' known: # You can download structural_variations, somatic, or "all" which corresponds to germline known variation for all chromosomes type: 'all' diff --git a/workflows/references/config/config.yaml b/workflows/references/config/config.yaml deleted file mode 100644 index bd9bd9392..000000000 --- a/workflows/references/config/config.yaml +++ /dev/null @@ -1,8 +0,0 @@ -references_dir: 'references' - - -# See the reference config files in the top level of the repo, -# include/reference_configs, for inspiration for more species. 
-include_references:
-  # - '../../include/reference_configs/test.yaml'
-  - '../../include/reference_configs/variant-calling.yaml'

From 0ffe62decfb6a862ac4e3a95c6d986fb8674fe32 Mon Sep 17 00:00:00 2001
From: daler
Date: Thu, 15 Jun 2023 15:32:47 -0400
Subject: [PATCH 27/32] treat fai just like chromsizes

---
 lib/common.py                  | 19 +++++++++++++------
 workflows/references/Snakefile |  8 +++-----
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/lib/common.py b/lib/common.py
index 653dc2f6c..7ed03cc73 100644
--- a/lib/common.py
+++ b/lib/common.py
@@ -419,9 +419,7 @@ def references_dict(config):
        'bowtie2': aligners.bowtie2_index_from_prefix('')[0],
        'hisat2': aligners.hisat2_index_from_prefix('')[0],
        'star': '/Genome',
-       # Add BWA and samtools faidx indices
        'bwa': aligners.bwa_index_from_prefix('')[0],
-       'faidx': '.fai',

        # Notes on salmon indexing:
        #  - pre-1.0 versions had hash.bin
@@ -462,10 +460,9 @@ def references_dict(config):
        d[organism] = {}
        for tag in merged_references[organism].keys():
            e = {}
-            # add support for variation databases
            if tag == 'variation':
-                # get the variation databases
-                # they should be the the keys of a dictionary containing a URL and postprocess block
+                # variation databases should be the keys of a dictionary
+                # containing a URL and postprocess block
                for type_ in merged_references[organism][tag].keys():
                    ext = '.vcf.gz'
                    if type_ == 'dbnsfp':
@@ -567,7 +564,8 @@ def references_dict(config):
                        .format(**locals())
                    )

-                    # Only makes sense to have chromsizes for genome fasta, not transcriptome.
+                    # Only makes sense to have chromsizes and faidx for genome
+                    # fasta, not transcriptome.
                    if type_ == 'genome':
                        e['chromsizes'] = (
                            '{references_dir}/'
                            '{organism}/'
                            '{tag}/'
                            '{type_}/'
                            '{organism}_{tag}.chromsizes'.format(**locals())
                        )
+                        e['faidx'] = (
+                            '{references_dir}/'
+                            '{organism}/'
+                            '{tag}/'
+                            '{type_}/'
+                            '{organism}_{tag}.fai'.format(**locals())
+                        )
+
+
            d[organism][tag] = e
    return d, conversion_kwargs

diff --git a/workflows/references/Snakefile b/workflows/references/Snakefile
index 0bb6f00a0..291587ffd 100644
--- a/workflows/references/Snakefile
+++ b/workflows/references/Snakefile
@@ -398,9 +398,9 @@ checkpoint genome_index:
    input:
        '{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fasta'
    output:
-        protected('{references_dir}/{organism}/{tag}/genome/faidx/{organism}_{tag}.fai')
+        protected('{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fai')
    log:
-        '{references_dir}/logs/{organism}/{tag}/genome/faidx/{organism}_{tag}.fai.log'
+        '{references_dir}/logs/{organism}/{tag}/genome/{organism}_{tag}.fai.log'
    resources:
        runtime=hours(1),
        mem_mb=gb(4)
@@ -419,9 +419,7 @@ rule known_variation:
    input:
-        #fai='{references_dir}/{organism}/{tag}/genome/faidx/{organism}_{tag}.fai'
-        # can't do it this way since the {tag} wildcard is not congruous between input and output
-        fai = lambda w: checkpoints.genome_index.get(**w).output[0]
+        fai='{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fai'
    output:
        protected('{references_dir}/{organism}/{tag}/known/{organism}_{tag}.vcf.gz')
    log:

From b9e7a6fcf817045dbc3fc5f13171549e6c097a6c Mon Sep 17 00:00:00 2001
From: daler
Date: Thu, 15 Jun 2023 15:33:14 -0400
Subject: [PATCH 28/32] clean up known variation rule

- add info about other species
- point out human-specific parts
- improve comments
- more canonical code
---
 workflows/references/Snakefile | 84 +++++++++++++++++++---------------
 1 file changed, 46 insertions(+), 38 deletions(-)

diff 
--git a/workflows/references/Snakefile b/workflows/references/Snakefile index 291587ffd..0f956d855 100644 --- a/workflows/references/Snakefile +++ b/workflows/references/Snakefile @@ -429,28 +429,38 @@ rule known_variation: mem_mb=gb(16), disk_mb=gb(32) run: - # Get the configuration options in the metadata chunk using wildcards release = int(config['references'][wildcards.organism][wildcards.tag]['metadata']['release']) species = config['references'][wildcards.organism][wildcards.tag]['metadata']['species'] build = config['references'][wildcards.organism][wildcards.tag]['metadata']['build'] typ = config['references'][wildcards.organism][wildcards.tag]['known']['type'] - def get_contigs(): - with open(input[0], 'r') as fai: - ser = pd.read_table(fai, header=None, usecols=[0], dtype=str) - ser = ser.squeeze() - ser = ser[ser.apply(lambda x: len(x)) <= 2] - return ser - contigs = get_contigs() - branch="" + + # ---------------------------------------------------------------------- + # NOTE: species-specific configuration may be required + # ---------------------------------------------------------------------- + # + # Ensembl has many species available, but the code below is designed to + # work for human. + # + # For available species, see https://ftp.ensembl.org/pub/release-109/variation/vcf/ + # + + # Human-specific patching to deal with GRCh37 filenames and directory + # structure on Ensembl FTP + branch = "" if release >= 81 and build == "GRCh37": branch="grch37/" if typ == "all": + suffixes = [""] + + # Starting in release 93, human germline VCFs are split by chrom if species == "homo_sapiens" and release >= 93: - suffixes = [ - "-chr{}".format(chrom) for chrom in contigs - ] - else: - suffixes = [""] + ser = ( + pd.read_table(input.fai, header=None, usecols=[0], dtype=str) + .squeeze() + ) + contigs = ser[ser.apply(lambda x: len(x)) <= 2] + suffixes = expand("-chr{chrom}", chrom=contigs) + elif typ == "somatic": suffixes=["_somatic"] elif typ == "structural_variations": @@ -459,39 +469,37 @@ rule known_variation: release = int(release) build = build typ = typ - species_filename=species if release >= 91 else species.capitalize() - urls=[ - "ftp://ftp.ensembl.org/pub/{branch}release-{release}/variation/vcf/{species}/{species_filename}{suffix}.{ext}".format( - release=release, - species=species, - suffix=suffix, - species_filename=species_filename, - branch=branch, - ext=ext, - ) - for suffix in suffixes - for ext in ["vcf.gz", "vcf.gz.csi"] - ] - names=[os.path.basename(url) for url in urls if url.endswith(".gz")] - - # Compose a command that downloads all vcf.gz files for all contigs in use + + # Prior to release 91, species names started with a capital letter + species_filename = species if release >= 91 else species.capitalize() + + urls = expand( + "ftp://ftp.ensembl.org/pub/{branch}release-{release}/variation/vcf/{species}/{species_filename}{suffix}.{ext}", + release=release, + species=species, + suffix=suffixes, + species_filename=species_filename, + branch=branch, + ext=["vcf.gz", "vcf.gz.csi"] + ) + vcfs = [os.path.basename(url) for url in urls if url.endswith(".gz")] + + # Compose a command that downloads all vcf.gz files (possibly one for + # each contig in human Ensembl releases >=93) gather = "curl {urls}".format(urls=" ".join(map("-O {}".format, urls))) - # Absolute paths needed because we'll be downloading in a temp dir + # We'll be downloading in a temp dir, so get absolute paths to make things easier fai = os.path.abspath(input.fai) log = os.path.abspath(str(log)) with 
From 0b5a9b5d160a06ceba4b4d7b8f0c2d92a0d43586 Mon Sep 17 00:00:00 2001
From: daler
Date: Thu, 15 Jun 2023 16:18:05 -0400
Subject: [PATCH 29/32] add back in the original references config

---
 workflows/references/config/config.yaml | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 workflows/references/config/config.yaml

diff --git a/workflows/references/config/config.yaml b/workflows/references/config/config.yaml
new file mode 100644
index 000000000..49618dcd0
--- /dev/null
+++ b/workflows/references/config/config.yaml
@@ -0,0 +1,6 @@
+references_dir: 'references_dir'
+
+# See the reference config files in the top level of the repo,
+# include/reference_configs, for inspiration for more species.
+include_references:
+  - '../../include/reference_configs/test.yaml'

From fd3c58e5e2441b719b35e3e0fd5a6fee7f908914 Mon Sep 17 00:00:00 2001
From: daler
Date: Thu, 15 Jun 2023 16:18:22 -0400
Subject: [PATCH 30/32] split variant-calling and non-variant-calling configs

---
 .circleci/config.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 5d423b029..e7ca4c5e5 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -226,8 +226,14 @@ variables:
       command: |
         source /opt/mambaforge/etc/profile.d/conda.sh
         conda activate $LCDBWF_ENV
+        # RNA-seq/ChIP-seq references
         $DEPLOY/test/lcdb-wf-test references --run-workflow --configfile=config/config.yaml -j2 -p -r -k --orig $ORIG
+        # Variant-calling references
+        $DEPLOY/test/lcdb-wf-test references --run-workflow --configfile=$ORIG/test/test_configs/variant-calling.yaml -j2 -p -r -k --orig $ORIG
+
+
+
 # --------------------------------------------------------------------------
 # Standard RNA-seq workflow
 rnaseq-step: &rnaseq-step

From c519f408ce3bb01978df736ff86811520d5a89b8 Mon Sep 17 00:00:00 2001
From: Emma Smith
Date: Wed, 22 Jan 2025 09:31:09 -0500
Subject: [PATCH 31/32] match string format across rules

---
 workflows/variant-calling/Snakefile | 49 ++++++++++++++---------------
 1 file changed, 24 insertions(+), 25 deletions(-)

diff --git a/workflows/variant-calling/Snakefile b/workflows/variant-calling/Snakefile
index 0cc4be94f..9a85c2d37 100644
--- a/workflows/variant-calling/Snakefile
+++ b/workflows/variant-calling/Snakefile
@@ -70,9 +70,9 @@ rule fasta_dict:
     run:
         java_opts = set_java_opts(resources)
         shell(
-            'picard CreateSequenceDictionary {java_opts} \
-            -R {input.ref} \
-            -O {output} &> {log} '
+            'picard CreateSequenceDictionary {java_opts} '
+            '-R {input.ref} '
+            '-O {output} &> {log} '
         )
@@ -95,7 +95,7 @@ if not aln_index:
         params:
             "bwtsw "
         shell:
-            "bwa index -a {params}"
+            "bwa index -a {params} "
             "{input} "
             " &> {log}"
@@ -216,11 +216,11 @@ if config["filtering"]['bqsr']:
         run:
             java_opts = set_java_opts(resources)
             shell(
-                'gatk --java-options {java_opts} BaseRecalibrator \
-                -R {input.ref} \
-                -I {input.bam} \
-                -O {output.recal_table} \
-                --known-sites {input.known} 2> {log}'
+                'gatk --java-options {java_opts} BaseRecalibrator '
+                '-R {input.ref} '
+                '-I {input.bam} '
+                '-O {output.recal_table} '
+                '--known-sites {input.known} 2> {log}'
             )
@@ -245,11 +245,11 @@
         run:
            java_opts = set_java_opts(resources)
            shell(
-                'gatk --java-options {java_opts} ApplyBQSR \
-                -R {input.ref} \
-                -I {input.bam} \
-                --bqsr-recal-file {input.recal_table} \
-                -O {output.bam} 2> {log}'
+                'gatk --java-options {java_opts} ApplyBQSR '
+                '-R {input.ref} '
+                '-I {input.bam} '
+                '--bqsr-recal-file {input.recal_table} '
+                '-O {output.bam} 2> {log}'
            )
@@ -347,12 +347,12 @@ rule call_variants:
             bams = [bams]
         bams = list(map("-I {}".format, bams))
         shell(
-            'gatk --java-options {java_opts} HaplotypeCaller {regions} \
-            -R {input.ref} \
-            {bams} \
-            -ERC GVCF \
-            {params.pcr} \
-            -O {output.gvcf} {known} 2> {log}'
+            'gatk --java-options {java_opts} HaplotypeCaller {regions} '
+            '-R {input.ref} '
+            '{bams} '
+            '-ERC GVCF '
+            '{params.pcr} '
+            '-O {output.gvcf} {known} 2> {log} '
         )
@@ -374,10 +374,10 @@ rule combine_calls:
         java_opts = set_java_opts(resources)
         gvcfs=list(map("-V {}".format, input.gvcfs))
         shell(
-            'gatk --java-options {java_opts} CombineGVCFs \
-            {gvcfs} \
-            -R {input.ref} \
-            -O {output.gvcf} 2> {log} '
+            'gatk --java-options {java_opts} CombineGVCFs '
+            '{gvcfs} '
+            '-R {input.ref} '
+            '-O {output.gvcf} 2> {log} '
         )
@@ -935,5 +935,4 @@ rule snpeff_cancer:
             '| bcftools view -Oz > {output.vcf} 2> {log}'
         )
 
-
 # vim: ft=python
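
Note: the string style adopted here relies on Python concatenating adjacent
string literals at compile time, which is why each fragment must end with a
trailing space. The removed backslash style embedded the continuation lines'
leading indentation inside the command itself (harmless to the shell, but
ugly in logs). A standalone illustration with a made-up command:

    # Adjacent literals join cleanly when each piece ends with a space:
    cmd = (
        'picard CreateSequenceDictionary '
        '-R genome.fasta '
        '-O genome.dict'
    )
    assert cmd == 'picard CreateSequenceDictionary -R genome.fasta -O genome.dict'

    # A backslash continuation keeps the next line's indentation in the string:
    cmd2 = 'picard CreateSequenceDictionary \
        -R genome.fasta'
    assert '    -R genome.fasta' in cmd2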
From 7e952ec9e7aa803c45fc292854856910053402e2 Mon Sep 17 00:00:00 2001
From: Emma Smith
Date: Wed, 22 Jan 2025 11:58:00 -0500
Subject: [PATCH 32/32] make snpeff rules require dictionary of input file

---
 lib/helpers.smk                     | 41 +++++++++++++++++++++++++++++
 workflows/variant-calling/Snakefile |  4 +--
 2 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/lib/helpers.smk b/lib/helpers.smk
index de902d6e7..8c146ba80 100644
--- a/lib/helpers.smk
+++ b/lib/helpers.smk
@@ -343,4 +343,45 @@ def get_bed_nomenclature(input):
     return nom
 
 
+def snpeff_input(wildcards):
+    """
+    Ensure all correct input files for snpEff rule are available
+    """
+    dbnsfp = (
+        config['ref']['paths']['dbnsfp']
+        if config['ref']['paths']['dbnsfp']
+        else []
+    )
+    dbnsfp_tbi = dbnsfp + '.tbi' if dbnsfp else []
+    vcf = 'results/filtered/all.normed.vcf.gz'
+
+    # make dictionary containing the required snpEff input files
+    d = dict(
+        dbnsfp=dbnsfp,
+        dbnsfp_tbi=dbnsfp_tbi,
+        vcf=vcf,
+    )
+    return d
+
+
+def snpeff_cancer_input(wildcards):
+    """
+    Ensure all correct input files for snpEff cancer rule are available
+    """
+    dbnsfp = (
+        config['ref']['paths']['dbnsfp']
+        if config['ref']['paths']['dbnsfp']
+        else []
+    )
+    dbnsfp_tbi = dbnsfp + '.tbi' if dbnsfp else []
+    vcf = 'results/somatic_filtered/normed.{comp}.vcf.gz'.format(comp=wildcards.comp)
+
+    # make dictionary containing the required snpEff cancer input files
+    d = dict(
+        dbnsfp=dbnsfp,
+        dbnsfp_tbi=dbnsfp_tbi,
+        vcf=vcf,
+    )
+    return d
+
 # vim: ft=python

diff --git a/workflows/variant-calling/Snakefile b/workflows/variant-calling/Snakefile
index 9a85c2d37..c6850a37a 100644
--- a/workflows/variant-calling/Snakefile
+++ b/workflows/variant-calling/Snakefile
@@ -668,7 +668,7 @@ rule snpeff:
 #        mem_mb=gb(4),             # [ TEST SETTINGS -1 ]
         runtime=autobump(120)
     input:
-        vcf='results/filtered/all.normed.vcf.gz',
+        unpack(snpeff_input)
     log: 'logs/snpeff.log'
     output:
         ann='results/annotated/ann.vcf.gz',
@@ -914,7 +914,7 @@ rule snpeff_cancer:
 #        mem_mb=gb(4),             # [ TEST SETTINGS -1 ]
         runtime=autobump(120)
     input:
-        vcf='results/somatic_filtered/normed.{comp}.vcf.gz',
+        unpack(snpeff_cancer_input),
     output:
         vcf='results/mutect2_annotated/snpeff.{comp}.vcf.gz',
         stats='results/qc/snpEff_{comp}_summary.csv',
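
Note: the unpack() pattern used above is standard Snakemake: the input
function returns a dict, and unpack() splats it so the rule can refer to
each file by name (e.g. {input.vcf}, {input.dbnsfp}). Because the helpers
return an empty list for dbnsfp when it is not configured, and an empty
list expands to nothing in a rule's input, the dbNSFP files are effectively
optional. A minimal Snakefile sketch with toy rule and file names:

    def my_inputs(wildcards):
        # Keys of the returned dict become attribute names on `input`.
        return dict(
            vcf=f"results/{wildcards.sample}.vcf.gz",
            tbi=f"results/{wildcards.sample}.vcf.gz.tbi",
        )

    rule annotate:
        input:
            unpack(my_inputs)
        output:
            "results/{sample}.annotated.vcf.gz"
        shell:
            "bcftools view -Oz {input.vcf} > {output}"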