diff --git a/.circleci/config.yml b/.circleci/config.yml index da38e0592..16e5b5f06 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -5,7 +5,7 @@ variables: # default settings for all steps defaults: &defaults docker: - - image: ubuntu:20.04 + - image: ubuntu:latest # -------------------------------------------------------------------------- # The caching dramatically speeds up testing time, because we can do the @@ -28,7 +28,7 @@ variables: save_cache: key: v5-{{ checksum "env.yml" }}-{{ checksum "env-r.yml" }} paths: - - /opt/mambaforge + - /opt/miniforge # this file is created by sra-tools upon installation by conda, and so # needs to be included in the cache otherwise fastq-dump thinks it's @@ -48,6 +48,7 @@ variables: name: Set path command: | # x11-utils required to avoid R::png() segfaulting + export DEBIAN_FRONTEND=noninteractive apt update && apt install -y \ curl \ git \ @@ -73,7 +74,7 @@ variables: # Note that if we don't escape \$PATH, we'll be stuck with the exact # PATH defined here, which will break anything needing conda envs. - echo "export PATH=\$PATH:/opt/mambaforge/bin" >> $BASH_ENV + echo "export PATH=\$PATH:/opt/miniforge/bin" >> $BASH_ENV source $BASH_ENV @@ -85,28 +86,16 @@ variables: command: | source $BASH_ENV echo $PATH - # /opt/mambaforge will only exist if there was a cache restore; otherwise we'll make it here. + # /opt/miniforge will only exist if there was a cache restore; otherwise we'll make it here. # - # Use mambaforge which comes with mamba. - if [ ! -e /opt/mambaforge ]; then - curl -L https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh > mambaforge.sh - bash mambaforge.sh -b -p /opt/mambaforge - source "/opt/mambaforge/etc/profile.d/conda.sh" - source "/opt/mambaforge/etc/profile.d/mamba.sh" + if [ ! -e /opt/miniforge ]; then + curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" + bash Miniforge3-$(uname)-$(uname -m).sh -b -p /opt/miniforge + source "/opt/miniforge/etc/profile.d/conda.sh" conda activate which conda - which mamba - mamba --version - - # Note that mambaforge doesn't come with the defaults channel, but - # we're adding it here at the beginning to simulate what most users - # probably have locally (and following the bioconda docs). Using - # strict channel priority means we should [theoretically] never - # pull packages from defaults because they all exist on - # conda-forge. - conda config --system --add channels defaults - + conda --version conda config --system --add channels bioconda conda config --system --add channels conda-forge conda config --system --set channel_priority strict @@ -115,10 +104,10 @@ variables: # https://docs.conda.io/projects/conda-build/en/latest/resources/link-scripts.html, # post-link scripts should not depend on any installed or # to-be-installed conda packages...but they do. 
- mamba install -n base r-base yq + conda install -n base r-base yq - time mamba env create -n $LCDBWF_ENV --file env.yml - time mamba env create -n $LCDBWF_ENV_R --file env-r.yml + time conda env create -n $LCDBWF_ENV --file env.yml + time conda env create -n $LCDBWF_ENV_R --file env-r.yml fi # -------------------------------------------------------------------------- @@ -127,7 +116,7 @@ variables: run: name: Download example data command: | - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV conda info --envs conda config --show @@ -151,7 +140,7 @@ variables: cp $ORIG/workflows/rnaseq/run_test.sh $DEPLOY/workflows/rnaseq/run_test.sh cp $ORIG/workflows/rnaseq/run_downstream_test.sh $DEPLOY/workflows/rnaseq/run_downstream_test.sh cp $ORIG/workflows/references/run_test.sh $DEPLOY/workflows/references/run_test.sh - cp $ORIG/workflows/colocalization/run_test.sh $DEPLOY/workflows/colocalization/run_test.sh + # cp $ORIG/workflows/colocalization/run_test.sh $DEPLOY/workflows/colocalization/run_test.sh mkdir $DEPLOY/ci mkdir $DEPLOY/test @@ -172,7 +161,7 @@ variables: run: name: Run pytest suite and testthat suite command: | - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV # run unit tests and doctests for the modules in lib test/lcdb-wf-test unit_tests --pytest @@ -194,9 +183,9 @@ variables: name: chipseq workflow command: | cd $DEPLOY/workflows/chipseq - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - $DEPLOY/test/lcdb-wf-test chipseq --run-workflow --use-conda -j2 -k -p -r + $DEPLOY/test/lcdb-wf-test chipseq --run-workflow --use-conda -j2 -k -p $DEPLOY/test/lcdb-wf-test chipseq --trackhub # -------------------------------------------------------------------------- @@ -208,10 +197,10 @@ variables: name: chipseq misc command: | cd $DEPLOY/workflows/chipseq - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - ./run_test.sh --use-conda -j2 -k -p -r \ + ./run_test.sh --use-conda -j2 -k -p \ --configfile $ORIG/test/test_configs/test_chipseq_regression.yaml \ --config sampletable=$ORIG/test/test_configs/chipseq_one_run.tsv \ merged_bigwigs="{}" \ @@ -231,16 +220,6 @@ variables: --until bed_to_bigbed fi - # -------------------------------------------------------------------------- - # Standard references workflow. 
- references-step: &references-step - run: - name: references workflow - command: | - source /opt/mambaforge/etc/profile.d/conda.sh - conda activate $LCDBWF_ENV - $DEPLOY/test/lcdb-wf-test references --run-workflow --configfile=config/config.yaml -j2 -p -r -k --orig $ORIG - # -------------------------------------------------------------------------- # Standard RNA-seq workflow rnaseq-step: &rnaseq-step @@ -248,10 +227,10 @@ variables: name: rnaseq workflow command: | cd $DEPLOY - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow -n - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --use-conda -j2 -k -p -r --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --use-conda -j2 -k -p --orig $ORIG $DEPLOY/test/lcdb-wf-test rnaseq --trackhub --orig $ORIG @@ -276,19 +255,19 @@ variables: command: | ORIG=$(pwd) cd $DEPLOY - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV # Check the help for test/lcdb-wf-test to see what args these # provide; some of them use the --until argument to restrict the # rules that are run. Note the use of --orig $ORIG to use the test # configs from the original clone rather than the deployed directory. - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-pe -k -r -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-se -k -r -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --strandedness-pe -k -r -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-2pass -k -r -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-1pass -k -r -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --pe -k -r -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-pe -k -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-se -k -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --strandedness-pe -k -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-2pass -k -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-1pass -k -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --pe -k -p -j2 --use-conda --orig $ORIG @@ -299,9 +278,9 @@ variables: name: colocalization workflow command: | cd $DEPLOY/workflows/colocalization - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - $DEPLOY/test/lcdb-wf-test colocalization --run-workflow -k -r -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test colocalization --run-workflow -k -p -j2 --use-conda --orig $ORIG # -------------------------------------------------------------------------- # Syntax note: All of the steps above, with their "&step-name" labels, can be @@ -410,23 +389,23 @@ jobs: - *get-data - *rnaseq-misc-step - colocalization: - <<: *defaults - steps: - - checkout - - *restore_cache - - *set-path - - *get-data - - *colocalization-step - - references: - <<: *defaults - steps: - - checkout - - *restore_cache - - *set-path - - *get-data - - *references-step + # colocalization: + # <<: *defaults + # steps: + # - checkout + # - *restore_cache + # - *set-path + # - *get-data + # - *colocalization-step + + # 
references: + # <<: *defaults + # steps: + # - checkout + # - *restore_cache + # - *set-path + # - *get-data + # - *references-step build-docs: <<: *defaults @@ -438,9 +417,9 @@ jobs: - run: name: Install sphinx command: | - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate lcdb-wf-test - mamba install -y sphinx make yaml + conda install -y sphinx make yaml - run: name: OK for unknown github host command: mkdir -p ~/.ssh/ && echo -e "Host github.com\n\tStrictHostKeyChecking no\n" > ~/.ssh/config @@ -450,7 +429,7 @@ jobs: - run: name: Build and upload docs command: | - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate lcdb-wf-test ci/build-docs.sh - store_artifacts: @@ -466,7 +445,7 @@ jobs: - run: name: Report environment command: | - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda env export -n lcdb-wf-test > /tmp/env.yaml conda env export -n lcdb-wf-test-r > /tmp/env-r.yaml - store_artifacts: @@ -500,14 +479,14 @@ workflows: requires: - initial-setup - pytest - - references: - requires: - - initial-setup - - pytest - - colocalization: - requires: - - initial-setup - - pytest + # - references: + # requires: + # - initial-setup + # - pytest + # - colocalization: + # requires: + # - initial-setup + # - pytest - build-docs: requires: - initial-setup @@ -518,4 +497,4 @@ workflows: - chipseq - chipseq-misc - references - - colocalization + # - colocalization diff --git a/ci/preprocessor.py b/ci/preprocessor.py index 042bee332..1cd7e5dac 100644 --- a/ci/preprocessor.py +++ b/ci/preprocessor.py @@ -7,54 +7,16 @@ in production. Rather than require users edit files to remove those test-specific patterns, here we keep the test settings commented out and only un-comment when running tests. - -First, we look for any line that matches "# [test settings]" (case insensitive, -with optional surrounding spacing) and an optional signed integer. Any of these -would work: - - >>> assert matches('# [test settings]') - >>> assert matches('#[test settings]') - >>> assert matches('# [ test settings ]') - >>> assert matches('# [ test settings -1]') - >>> assert matches('# [ test settings +2]') - >>> assert matches('# [ TEST SETTINGS +2]') - >>> assert matches('# [ TeSt SeTTiNgS +2 ]') - -If a lines does not match, output it as-is. - -If a line matches, then uncomment it. Specifically, remove the first "#" in the -line; if it was followed by exactly one space, then remove that too. - -If a line matches and a signed integer was provided, then consider it -a relative location, and then comment-out the referred-to line. Example: - - >>> preprocess(''' - ... use this for production - ... # use this for tests # [test settings -1] - ... '''.splitlines(True)) - - # use this for production - use this for tests # [test settings -1] - - -If the matched special string creates the first "#" in the line, then do -nothing to that line but still respect the relative locations. Useful for just -commenting out nearby lines for tests: - - >>> preprocess(''' - ... # [TEST SETTINGS +1] - ... 
comment out for testing'''.splitlines(True)) - - # [TEST SETTINGS +1] - # comment out for testing """ + import re -regexp = re.compile(r'#\s?\[\s?test settings\s?(?P[-+]*\d)?\s*\]') +regexp = re.compile(r"#\s?\[\s?(enable|disable) for test\s?\]") -def matches(line): - return regexp.search(line.lower()) is not None + +def is_commented(line): + return line.strip().startswith("#") def comment_line(line): @@ -66,87 +28,75 @@ def comment_line(line): """ x = [] for i, character in enumerate(line): - if character == ' ': + if character == " ": x.append(character) else: break - x.append('# ') + x.append("# ") x.extend(line[i:]) - return ''.join(x) + return "".join(x) def uncomment_line(line): """ Removes the first instance of "#" from a line; if it was followed by - exactly one space then remove that too. + exactly one space then remove that too . . . UNLESS the *only* comment is the + special character that triggers this behavior, in which case we do nothing. >>> assert uncomment_line('# asdf') == 'asdf' >>> assert uncomment_line('#asdf') == 'asdf' >>> assert uncomment_line('# asdf # but this should be kept') == 'asdf # but this should be kept' >>> assert uncomment_line('# asdf') == ' asdf' >>> assert uncomment_line(' # asdf') == ' asdf' + >>> assert uncomment_line('do nothing') == 'do nothing' + >>> assert uncomment_line('do nothing # [disable for test]') == 'do nothing # [disable for test]' + >>> assert uncomment_line('#uncomment # [disable for test]') == 'uncomment # [disable for test]' """ - first = line.find('#') + first = line.find("#") - # If the first comment is the one that flag the line, then do nothing. + # If the first comment is the one that flagged the line, then do nothing. m = regexp.search(line.lower()) if m: if m.start() == first: return line - if line[first + 1] == ' ' and line[first + 2] != ' ': - pattern = '# ' + if line[first + 1] == " " and line[first + 2] != " ": + pattern = "# " else: - pattern = '#' - return line.replace(pattern, '', 1) + pattern = "#" + return line.replace(pattern, "", 1) def preprocess(lines): + result = [] if isinstance(lines, str): lines = [lines] - # These lists will keep track of whether a line should be changed. We need to - # create them ahead of time so that we can use relative indexing from line N to - # modify the state of lines N-1 or N+1 - uncomment = [False for i in range(len(lines))] - comment = [False for i in range(len(lines))] - - for i, line in enumerate(lines): + for line in lines: m = regexp.search(line.lower()) - if m: - # There as at least a "[ test settings ]", so remove comment - uncomment[i] = True - - # Figure out if there was also a relative location to uncomment, - # and keep track of it in the `comment` list. 
- rel = m.group('rel') - if rel is not None: - rel = int(rel) - comment[i + rel] = True + if not m: + result.append(line) + continue - result = [] - for (c, u, line) in zip(comment, uncomment, lines): - # E.g., in this situation, unclear what should happen: - # - # # [test settings] - # # [test settings -1] - # - if c and u: - raise ValueError("Line {0} is trying to be both commented and uncommented".format(line)) - if c: - result.append(comment_line(line)) - elif u: + action = m.group(1) + if action == "enable" and is_commented(line): result.append(uncomment_line(line)) + elif action == "disable" and not is_commented(line): + result.append(comment_line(line)) else: - result.append(line) - print(''.join(result)) + raise ValueError(f"Inconsistent commenting and action:\n{line}") + + print("".join(result)) if __name__ == "__main__": import argparse + ap = argparse.ArgumentParser(usage=__doc__) - ap.add_argument('infile', help='Input file to modify. Modified file printed to stdout.') + ap.add_argument( + "infile", help="Input file to modify. Modified file printed to stdout." + ) args = ap.parse_args() lines = open(args.infile).readlines() preprocess(lines) diff --git a/env.yml b/env.yml index 5b6567204..9bbc8a717 100644 --- a/env.yml +++ b/env.yml @@ -5,227 +5,220 @@ dependencies: - _libgcc_mutex=0.1 - _openmp_mutex=4.5 - _r-mutex=1.0.1 - - alsa-lib=1.2.3.2 - - amply=0.1.5 + - alabaster=1.0.0 + - alsa-lib=1.2.13 + - amply=0.1.6 + - annotated-types=0.7.0 - appdirs=1.4.4 - - argcomplete=3.0.8 - - argh=0.27.2 - - asttokens=2.2.1 - - attr=2.5.1 - - attrs=23.1.0 - - backcall=0.2.0 - - backports=1.0 - - backports.functools_lru_cache=1.6.4 - - bedtools=2.31.0 - - binutils_impl_linux-64=2.39 - - binutils_linux-64=2.39 - - biopython=1.81 - - boost-cpp=1.74.0 + - argcomplete=3.5.2 + - argh=0.31.3 + - argparse-dataclass=2.0.0 + - asttokens=3.0.0 + - attrs=24.3.0 + - babel=2.16.0 + - beautifulsoup4=4.12.3 + - bedtools=2.31.1 + - binutils_impl_linux-64=2.43 + - biopython=1.84 + - boost-cpp=1.85.0 - bowtie=1.3.1 - - bowtie2=2.5.1 - - brotli=1.0.9 - - brotli-bin=1.0.9 - - brotlipy=0.7.0 - - bwidget=1.9.14 - - bx-python=0.9.0 + - bowtie2=2.5.4 + - brotli=1.1.0 + - brotli-bin=1.1.0 + - brotli-python=1.1.0 + - bwidget=1.10.1 + - bx-python=0.13.0 - bzip2=1.0.8 - - c-ares=1.18.1 - - ca-certificates=2023.5.7 - - cairo=1.16.0 - - certifi=2023.5.7 - - cffi=1.15.1 - - charset-normalizer=3.1.0 - - click=8.1.3 - - coin-or-cbc=2.10.10 - - coin-or-cgl=0.60.7 - - coin-or-clp=1.17.8 - - coin-or-osi=0.108.8 - - coin-or-utils=2.11.9 - - coincbc=2.10.10 + - c-ares=1.34.4 + - ca-certificates=2024.12.14 + - cairo=1.18.2 + - certifi=2024.12.14 + - cffi=1.17.1 + - charset-normalizer=3.4.1 + - click=8.1.8 + - coin-or-cbc=2.10.12 + - coin-or-cgl=0.60.9 + - coin-or-clp=1.17.10 + - coin-or-osi=0.108.11 + - coin-or-utils=2.11.12 + - coincbc=2.10.12 - colorama=0.4.6 - coloredlogs=15.0.1 - colormath=3.0.0 - - configargparse=1.5.3 + - conda-inject=1.3.2 + - configargparse=1.7 - connection_pool=0.0.3 - - contourpy=1.0.7 - - cryptography=39.0.0 - - curl=7.86.0 - - cutadapt=4.4 - - cycler=0.11.0 + - contourpy=1.3.1 + - curl=8.11.1 + - cutadapt=5.0 + - cycler=0.12.1 - datrie=0.8.2 - - dbus=1.13.6 - decorator=5.1.1 - - deeptools=3.5.2 + - deeptools=3.5.5 - deeptoolsintervals=0.1.9 - - dnaio=0.10.0 - - docutils=0.20.1 - - dpath=2.1.5 + - dnaio=1.2.2 + - docutils=0.21.2 + - dpath=2.2.0 + - eido=0.2.4 - epic2=0.0.52 - - exceptiongroup=1.1.1 - - execnet=1.9.0 - - executing=1.2.0 - - expat=2.5.0 - - fastq-screen=0.15.3 + - et_xmlfile=2.0.0 + - 
exceptiongroup=1.2.2 + - execnet=2.1.1 + - executing=2.1.0 + - expat=2.6.4 + - fastq-screen=0.16.0 - fastqc=0.12.1 - - fftw=3.3.10 - - filelock=3.12.0 - font-ttf-dejavu-sans-mono=2.37 - font-ttf-inconsolata=3.000 - font-ttf-source-code-pro=2.038 - font-ttf-ubuntu=0.83 - - fontconfig=2.14.2 + - fontconfig=2.15.0 - fonts-conda-ecosystem=1 - fonts-conda-forge=1 - - fonttools=4.39.4 + - fonttools=4.55.3 - freetype=2.12.1 - fribidi=1.0.10 - - future=0.18.3 - - gat=1.3.6 - - gcc_impl_linux-64=10.4.0 - - gcc_linux-64=10.4.0 - - gettext=0.21.1 + - gcc_impl_linux-64=14.2.0 - gffread=0.12.7 - - gffutils=0.11.1 - - gfortran_impl_linux-64=10.4.0 - - gfortran_linux-64=10.4.0 - - giflib=5.2.1 - - gitdb=4.0.10 - - gitpython=3.1.31 - - glib=2.74.1 - - glib-tools=2.74.1 - - gmp=6.2.1 + - gffutils=0.13 + - gfortran_impl_linux-64=14.2.0 + - giflib=5.2.2 + - gitdb=4.0.12 + - gitpython=3.1.44 - graphite2=1.3.13 - - gsl=2.7 - - gst-plugins-base=1.18.5 - - gstreamer=1.20.3 - - gxx_impl_linux-64=10.4.0 - - gxx_linux-64=10.4.0 - - harfbuzz=4.2.0 - - hdf5=1.12.1 + - gsl=1.16 + - gxx_impl_linux-64=14.2.0 + - h2=4.1.0 + - harfbuzz=10.1.0 + - hdf5=1.14.3 - hisat2=2.2.1 - - htslib=1.16 + - hpack=4.0.0 + - html5lib=1.1 + - htslib=1.21 - humanfriendly=10.0 - - icu=69.1 - - idna=3.4 - - importlib-metadata=6.6.0 - - importlib_resources=5.12.0 + - humanize=4.11.0 + - hyperframe=6.0.1 + - icu=75.1 + - idna=3.10 + - imagesize=1.4.1 + - immutables=0.21 + - importlib-metadata=8.5.0 + - importlib_resources=6.5.2 - iniconfig=2.0.0 - intervalstats=1.01 - - ipython=8.13.2 - - isa-l=2.30.0 - - jack=1.9.18 - - jedi=0.18.2 - - jinja2=3.1.2 - - jpeg=9e - - jsonschema=4.17.3 - - jupyter_core=5.3.0 - - kallisto=0.48.0 - - kernel-headers_linux-64=2.6.32 + - ipython=8.31.0 + - isa-l=2.31.0 + - jedi=0.19.2 + - jinja2=3.1.5 + - jsonschema=4.23.0 + - jsonschema-specifications=2024.10.1 + - jupyter_core=5.7.2 + - kaleido-core=0.2.1 + - kallisto=0.51.1 + - kernel-headers_linux-64=3.10.0 - keyutils=1.6.1 - - kiwisolver=1.4.4 - - krb5=1.19.3 - - lcms2=2.14 - - ld_impl_linux-64=2.39 + - kiwisolver=1.4.7 + - krb5=1.21.3 + - lcms2=2.16 + - ld_impl_linux-64=2.43 - lerc=4.0.0 + - libaec=1.1.3 - libblas=3.9.0 - - libbrotlicommon=1.0.9 - - libbrotlidec=1.0.9 - - libbrotlienc=1.0.9 - - libcap=2.64 + - libboost=1.85.0 + - libboost-devel=1.85.0 + - libboost-headers=1.85.0 + - libbrotlicommon=1.1.0 + - libbrotlidec=1.1.0 + - libbrotlienc=1.1.0 - libcblas=3.9.0 - - libclang=13.0.1 - libcups=2.3.3 - - libcurl=7.86.0 - - libdb=6.2.32 - - libdeflate=1.13 - - libedit=3.1.20191231 + - libcurl=8.11.1 + - libdeflate=1.23 + - libedit=3.1.20240808 - libev=4.33 - - libevent=2.1.10 - - libexpat=2.5.0 + - libexpat=2.6.4 - libffi=3.4.2 - - libflac=1.3.4 - - libgcc-devel_linux-64=10.4.0 - - libgcc-ng=12.2.0 + - libgcc=14.2.0 + - libgcc-devel_linux-64=14.2.0 + - libgcc-ng=14.2.0 - libgd=2.3.3 - - libgfortran-ng=12.2.0 - - libgfortran5=12.2.0 - - libglib=2.74.1 - - libgomp=12.2.0 - - libhwloc=2.8.0 + - libgfortran=14.2.0 + - libgfortran-ng=14.2.0 + - libgfortran5=14.2.0 + - libglib=2.82.2 + - libgomp=14.2.0 + - libhwloc=2.11.2 - libiconv=1.17 - libjemalloc=5.3.0 + - libjpeg-turbo=3.0.0 - liblapack=3.9.0 - liblapacke=3.9.0 - - libllvm13=13.0.1 - - libnghttp2=1.51.0 - - libnsl=2.0.0 - - libogg=1.3.4 - - libopenblas=0.3.21 - - libopus=1.3.1 - - libpng=1.6.39 - - libpq=14.5 - - libsanitizer=10.4.0 - - libsndfile=1.0.31 - - libsqlite=3.41.2 - - libssh2=1.10.0 - - libstdcxx-devel_linux-64=10.4.0 - - libstdcxx-ng=12.2.0 - - libtiff=4.4.0 - - libtool=2.4.7 - - libudev1=253 + - 
liblzma=5.6.3 + - liblzma-devel=5.6.3 + - libnghttp2=1.64.0 + - libnsl=2.0.1 + - libopenblas=0.3.28 + - libopenssl-static=3.4.0 + - libpng=1.6.45 + - libsanitizer=14.2.0 + - libsqlite=3.47.2 + - libssh2=1.11.1 + - libstdcxx=14.2.0 + - libstdcxx-devel_linux-64=14.2.0 + - libstdcxx-ng=14.2.0 + - libtiff=4.7.0 - libuuid=2.38.1 - - libvorbis=1.3.7 - - libwebp=1.2.4 - - libwebp-base=1.2.4 - - libxcb=1.13 - - libxkbcommon=1.0.3 - - libxml2=2.9.14 - - libzlib=1.2.13 - - lzo=2.10 - - lzstring=1.0.4 - - make=4.3 - - markdown=3.4.3 - - markdown-it-py=2.2.0 - - markupsafe=2.1.2 - - matplotlib=3.7.1 - - matplotlib-base=3.7.1 - - matplotlib-inline=0.1.6 - - mdurl=0.1.0 - - multiqc=1.14 + - libwebp-base=1.5.0 + - libxcb=1.17.0 + - libxcrypt=4.4.36 + - libxml2=2.13.5 + - libzlib=1.3.1 + - logmuse=0.2.8 + - logomaker=0.8 + - macs2=2.2.9.1 + - make=4.4.1 + - markdown=3.6 + - markdown-it-py=3.0.0 + - markupsafe=3.0.2 + - mathjax=2.7.7 + - matplotlib-base=3.10.0 + - matplotlib-inline=0.1.7 + - mdurl=0.1.2 + - multiqc=1.26 - munkres=1.1.4 - - mysql-common=8.0.32 - mysql-connector-c=6.1.11 - - mysql-libs=8.0.32 - natsort=8.4.0 - - nbformat=5.8.0 - - ncbi-vdb=3.0.2 - - ncurses=6.3 - - networkx=3.1 - - nspr=4.35 - - nss=3.89 - - numpy=1.23.5 - - openjdk=11.0.1 - - openjpeg=2.5.0 - - openssl=1.1.1t + - nbformat=5.10.4 + - ncbi-vdb=3.1.1 + - ncurses=6.5 + - networkx=3.4.2 + - nspr=4.36 + - nss=3.107 + - numpy=2.2.1 + - numpydoc=1.8.0 + - openjdk=23.0.1 + - openjpeg=2.5.3 + - openpyxl=3.1.5 + - openssl=3.4.0 - ossuuid=1.6.2 - - packaging=23.1 - - pandas=2.0.1 - - pandoc=3.1.2 - - pango=1.50.7 - - parso=0.8.3 - - patsy=0.5.3 + - packaging=24.2 + - pandas=2.2.3 + - pandoc=3.6.1 + - pango=1.54.0 + - parso=0.8.4 + - patsy=1.0.1 - pbzip2=1.1.13 - - pcre2=10.37 + - pcre2=10.44 + - pephubclient=0.4.4 + - peppy=0.40.7 - perl=5.32.1 - - perl-alien-build=2.48 + - perl-alien-build=2.84 - perl-alien-libxml2=0.17 - perl-business-isbn=3.007 - perl-business-isbn-data=20210112.006 - perl-capture-tiny=0.48 - perl-carp=1.50 - perl-constant=1.33 - - perl-data-dumper=2.183 - - perl-encode=3.19 - perl-exporter=5.74 - perl-extutils-makemaker=7.70 - perl-ffi-checklib=0.28 @@ -233,152 +226,173 @@ dependencies: - perl-file-path=2.18 - perl-file-temp=0.2304 - perl-file-which=1.24 - - perl-gd=2.76 + - perl-gd=2.56 - perl-gdgraph=1.54 - perl-gdtextutil=0.86 - perl-importer=0.026 - - perl-mime-base64=3.16 - - perl-parent=0.241 + - perl-parent=0.243 - perl-path-tiny=0.124 - perl-pathtools=3.75 - perl-scope-guard=0.21 - - perl-storable=3.15 - perl-sub-info=0.002 - - perl-term-table=0.016 + - perl-term-table=0.024 - perl-test-fatal=0.016 - perl-test-warnings=0.031 - - perl-test2-suite=0.000145 + - perl-test2-suite=0.000163 - perl-try-tiny=0.31 - perl-uri=5.17 - - perl-xml-libxml=2.0207 + - perl-xml-libxml=2.0210 - perl-xml-namespacesupport=1.12 - perl-xml-sax=1.02 - perl-xml-sax-base=1.09 - - pexpect=4.8.0 + - pexpect=4.9.0 - picard=2.27.5 - pickleshare=0.7.5 - - pigz=2.6 - - pillow=9.2.0 - - pip=23.1.2 - - pixman=0.40.0 + - pigz=2.8 + - pillow=11.1.0 + - pip=24.3.1 + - pixman=0.44.2 - pkgutil-resolve-name=1.3.10 - - plac=1.3.5 - - platformdirs=3.5.1 - - plotly=5.14.1 - - pluggy=1.0.0 - - pooch=1.7.0 - - preseq=3.2.0 - - prompt-toolkit=3.0.38 - - prompt_toolkit=3.0.38 - - psutil=5.9.5 + - plac=1.4.3 + - platformdirs=4.3.6 + - plotly=5.24.1 + - pluggy=1.5.0 + - preseq=2.0.2 + - prompt-toolkit=3.0.48 + - psutil=6.1.1 - pthread-stubs=0.4 - ptyprocess=0.7.0 - - pulp=2.7.0 - - pulseaudio=14.0 - - pure_eval=0.2.2 + - pulp=2.8.0 + - pure_eval=0.2.3 - 
py2bit=0.3.0 - - pybedtools=0.9.0 - - pybigwig=0.3.18 - - pycparser=2.21 - - pyfaidx=0.7.2.1 - - pygments=2.15.1 - - pyopenssl=23.1.1 - - pyparsing=3.0.9 - - pyqt=5.15.4 - - pyqt5-sip=12.9.0 - - pyrsistent=0.19.3 - - pysam=0.20.0 + - pyaml-env=1.2.1 + - pybedtools=0.11.0 + - pybigwig=0.3.23 + - pycparser=2.22 + - pydantic=2.10.4 + - pydantic-core=2.27.2 + - pyfaidx=0.8.1.3 + - pygments=2.19.1 + - pyparsing=3.2.1 + - pysam=0.22.1 - pysocks=1.7.1 - - pytest=7.3.1 - - pytest-xdist=3.2.1 - - python=3.10.8 - - python-dateutil=2.8.2 - - python-fastjsonschema=2.16.3 - - python-isal=1.1.0 - - python-lzo=1.14 - - python-tzdata=2023.3 - - python_abi=3.10 - - pytz=2023.3 + - pytest=8.3.4 + - pytest-xdist=3.6.1 + - python=3.11.11 + - python-dateutil=2.9.0.post0 + - python-fastjsonschema=2.21.1 + - python-isal=1.7.1 + - python-kaleido=0.2.1 + - python-tzdata=2024.2 + - python-zlib-ng=0.5.1 + - python_abi=3.11 + - pytz=2024.1 - pyvcf3=1.0.3 - - pyyaml=6.0 - - qt-main=5.15.2 - - r-base=4.1.3 + - pyyaml=6.0.2 + - qhull=2020.2 + - r-base=4.2.3 - readline=8.2 - - requests=2.29.0 + - referencing=0.35.1 + - requests=2.32.3 - reretry=0.11.8 - - rich=13.3.5 - - rich-click=1.6.1 - - rseqc=5.0.1 - - salmon=1.10.1 - - samtools=1.16.1 - - scipy=1.10.1 - - seaborn=0.12.2 - - seaborn-base=0.12.2 + - rich=13.9.4 + - rich-click=1.8.5 + - rpds-py=0.22.3 + - rseqc=5.0.4 + - salmon=1.10.3 + - samtools=1.21 + - scipy=1.15.0 + - seaborn=0.13.2 + - seaborn-base=0.13.2 - sed=4.8 - - setuptools=67.7.2 - - simplejson=3.19.1 - - sip=6.5.1 - - six=1.16.0 - - smart_open=6.3.0 - - smmap=3.0.5 - - snakemake-minimal=7.25.3 + - setuptools=75.6.0 + - shellingham=1.5.4 + - simplejson=3.19.3 + - six=1.17.0 + - slack-sdk=3.34.0 + - slack_sdk=3.34.0 + - smart_open=7.1.0 + - smmap=5.0.0 + - snakemake=8.27.0 + - snakemake-interface-common=1.17.4 + - snakemake-interface-executor-plugins=9.3.3 + - snakemake-interface-report-plugins=1.1.0 + - snakemake-interface-storage-plugins=3.3.0 + - snakemake-minimal=8.27.0 + - snowballstemmer=2.2.0 + - soupsieve=2.5 - spectra=0.0.11 - - sqlite=3.41.2 - - sra-tools=3.0.3 - - stack_data=0.6.2 - - star=2.7.10b - - statsmodels=0.14.0 - - stopit=1.1.2 - - subread=2.0.3 - - sysroot_linux-64=2.12 + - sphinx=8.1.3 + - sphinxcontrib-applehelp=2.0.0 + - sphinxcontrib-devhelp=2.0.0 + - sphinxcontrib-htmlhelp=2.1.0 + - sphinxcontrib-jsmath=1.0.1 + - sphinxcontrib-qthelp=2.0.0 + - sphinxcontrib-serializinghtml=1.1.10 + - sqlite=3.47.2 + - sra-tools=3.1.1 + - stack_data=0.6.3 + - star=2.7.11b + - statsmodels=0.14.4 + - subread=2.0.8 + - sysroot_linux-64=2.17 - tabulate=0.9.0 - - tbb=2021.7.0 - - tenacity=8.2.2 - - throttler=1.2.1 - - tk=8.6.12 + - tbb=2022.0.0 + - tenacity=9.0.0 + - throttler=1.2.2 + - tk=8.6.13 - tktable=2.10 - - toml=0.10.2 - - tomli=2.0.1 - - toposort=1.10 - - tornado=6.3.2 - - trackhub=0.2.4 - - traitlets=5.9.0 - - typing-extensions=4.5.0 - - typing_extensions=4.5.0 - - tzdata=2023c - - ucsc-bedgraphtobigwig=377 - - ucsc-bedsort=377 - - ucsc-bedtobigbed=377 - - ucsc-bigwigmerge=377 - - ucsc-fetchchromsizes=377 - - ucsc-genepredtobed=377 - - ucsc-gtftogenepred=377 - - ucsc-liftover=377 - - ucsc-oligomatch=377 - - ucsc-twobittofa=377 - - ucsc-wigtobigwig=377 - - unicodedata2=15.0.0 - - urllib3=1.26.15 - - wcwidth=0.2.6 - - wheel=0.40.0 - - wrapt=1.15.0 - - xopen=1.7.0 - - xorg-kbproto=1.0.7 - - xorg-libice=1.0.10 - - xorg-libsm=1.2.3 - - xorg-libx11=1.8.4 - - xorg-libxau=1.0.9 - - xorg-libxdmcp=1.1.3 - - xorg-libxext=1.3.4 - - xorg-libxrender=0.9.10 - - xorg-libxt=1.2.1 - - xorg-renderproto=0.11.1 - 
- xorg-xextproto=7.3.0 - - xorg-xproto=7.0.31 - - xz=5.2.6 + - tomli=2.2.1 + - tqdm=4.67.1 + - trackhub=1.0 + - traitlets=5.14.3 + - typeguard=4.4.1 + - typer=0.15.1 + - typer-slim=0.15.1 + - typer-slim-standard=0.15.1 + - typing-extensions=4.12.2 + - typing_extensions=4.12.2 + - tzdata=2024b + - ubiquerg=0.8.0 + - ucsc-bedgraphtobigwig=472 + - ucsc-bedsort=469 + - ucsc-bedtobigbed=473 + - ucsc-bigwigmerge=469 + - ucsc-fetchchromsizes=469 + - ucsc-genepredtobed=469 + - ucsc-gtftogenepred=469 + - ucsc-liftover=469 + - ucsc-oligomatch=469 + - ucsc-stringify=472 + - ucsc-twobittofa=472 + - ucsc-wigtobigwig=472 + - unicodedata2=15.1.0 + - urllib3=2.3.0 + - veracitools=0.1.3 + - wcwidth=0.2.13 + - webencodings=0.5.1 + - wheel=0.45.1 + - wrapt=1.17.0 + - xopen=2.0.2 + - xorg-libice=1.1.2 + - xorg-libsm=1.2.5 + - xorg-libx11=1.8.10 + - xorg-libxau=1.0.12 + - xorg-libxdmcp=1.1.5 + - xorg-libxext=1.3.6 + - xorg-libxfixes=6.0.1 + - xorg-libxi=1.8.2 + - xorg-libxrandr=1.5.4 + - xorg-libxrender=0.9.12 + - xorg-libxt=1.3.1 + - xorg-libxtst=1.2.5 + - xz=5.6.3 + - xz-gpl-tools=5.6.3 + - xz-tools=5.6.3 - yaml=0.2.5 - - yte=1.5.1 - - zipp=3.15.0 - - zlib=1.2.13 - - zstandard=0.19.0 - - zstd=1.5.2 + - yte=1.5.5 + - zipp=3.21.0 + - zlib=1.3.1 + - zlib-ng=2.2.3 + - zstandard=0.23.0 + - zstd=1.5.6 diff --git a/include/WRAPPER_SLURM b/include/WRAPPER_SLURM index b2a2ffd4f..9f7f1344e 100755 --- a/include/WRAPPER_SLURM +++ b/include/WRAPPER_SLURM @@ -19,25 +19,25 @@ if [ -z "$LCDBWF_SNAKEMAKE_PROFILE" ]; then PROFILE_CMD="--profile $SNAKEMAKE_PROFILE" fi else -# LCDBWF_SNAKEMAKE_PROFILE found, this takes priority if both profile variables are set +# LCDBWF_SNAKEMAKE_PROFILE takes priority if both profile variables are set PROFILE_CMD="--profile $LCDBWF_SNAKEMAKE_PROFILE" fi -# Run snakemake +# Timestamped log file +LOGFILE="Snakefile_$(date +"%Y-%m-%d_%H%M").log" + ( time snakemake \ - -p \ + --printshellcmds \ --directory $PWD \ - -k \ - --restart-times 3 \ + --keep-going \ --rerun-incomplete \ --jobname "s.{rulename}.{jobid}.sh" \ - -j 999 \ --use-conda \ --configfile config/config.yaml \ $PROFILE_CMD \ "$@" - ) > "Snakefile.log" 2>&1 + ) > "$LOGFILE" 2>&1 SNAKE_PID=$! 
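Reviewer note on the include/WRAPPER_SLURM hunk above: the wrapper now prefers $LCDBWF_SNAKEMAKE_PROFILE over $SNAKEMAKE_PROFILE when both are set, spells out long-form flags, and writes to a timestamped log file instead of a fixed Snakefile.log. A minimal usage sketch, assuming the wrapper is submitted from a deployed workflow directory; the profile path, the relative path to the wrapper, and the cluster submission command are assumptions for illustration, not part of this change:

    # Point the wrapper at a site-specific Snakemake profile.
    # This variable takes priority over $SNAKEMAKE_PROFILE if both are set.
    export LCDBWF_SNAKEMAKE_PROFILE=/path/to/slurm-profile

    # Submit from the workflow directory; any extra arguments are forwarded
    # to snakemake via "$@" (the wrapper already supplies --configfile,
    # --use-conda, and the profile).
    cd workflows/rnaseq
    sbatch ../../include/WRAPPER_SLURM

    # Output is captured in a timestamped log in the working directory,
    # e.g. Snakefile_2025-01-10_1432.log, per the LOGFILE pattern above.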
diff --git a/include/reference_configs/Plodia_interpunctella.yaml b/include/reference_configs/Plodia_interpunctella.yaml new file mode 100644 index 000000000..214e907fc --- /dev/null +++ b/include/reference_configs/Plodia_interpunctella.yaml @@ -0,0 +1,41 @@ +references: + plodia: + ilPloInte3.2: + genome: + url: 'https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/GCF_027563975.2/download?include_annotation_type=GENOME_FASTA' + postprocess: + function: 'lib.postprocess.utils.extract_from_zip' + kwargs: + path_in_zip: 'ncbi_dataset/data/GCF_027563975.2/GCF_027563975.2_ilPloInte3.2_genomic.fna' + indexes: + - 'hisat2' + - 'bowtie2' + - 'star' + + annotation: + url: "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/GCF_027563975.2/download?include_annotation_type=GENOME_GTF" + postprocess: + function: 'lib.postprocess.utils.extract_from_zip' + kwargs: + path_in_zip: "ncbi_dataset/data/GCF_027563975.2/genomic.gtf" + conversions: + - 'refflat' + - 'bed12' + + transcriptome: + indexes: + - 'salmon' + - 'kallisto' + + rRNA: + genome: + url: + - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_LSURef_tax_silva_trunc.fasta.gz' + - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_SSURef_Nr99_tax_silva_trunc.fasta.gz' + indexes: + - 'hisat2' + - 'bowtie2' + - 'star' + postprocess: + function: 'lib.common.filter_fastas' + args: 'Plodia interpunctella' diff --git a/include/requirements.txt b/include/requirements.txt index 6001f6d55..a2b21ee33 100644 --- a/include/requirements.txt +++ b/include/requirements.txt @@ -4,16 +4,17 @@ bowtie bowtie2 cutadapt>=3.0 deeptools +epic2 fastq-screen fastqc font-ttf-dejavu-sans-mono -gat gffread gffutils hisat2 intervalstats ipython kallisto +macs2 multiqc pandas pandoc @@ -27,7 +28,7 @@ pyfaidx pysam pytest pytest-xdist -python>=3.10 +python rseqc # earlier versions of salmon can segfault on Slurm @@ -35,7 +36,7 @@ salmon>=1.10.1 samtools seaborn -snakemake-minimal +snakemake>8 sra-tools star subread diff --git a/lib/aligners.py b/lib/aligners.py deleted file mode 100644 index 62fe58a57..000000000 --- a/lib/aligners.py +++ /dev/null @@ -1,85 +0,0 @@ -""" -Helper functions for working with aligners within Snakefiles -""" - - -def hisat2_index_from_prefix(prefix): - """ - Given a prefix, return a list of the corresponding hisat2 index files. - """ - return ['{prefix}.{n}.ht2'.format(prefix=prefix, n=n) for n in range(1, 9)] - - -def prefix_from_hisat2_index(index_files): - """ - Given a list of index files for hisat2, return the corresponding prefix. - """ - if isinstance(index_files, str): - return '.'.join(index_files.split('.')[:-2]) - else: - prefixes = list( - set( - map( - lambda x: '.'.join(x.split('.')[:-2]), index_files) - ) - ) - if len(prefixes) != 1: - raise ValueError( - "More than one prefix detected from '{0}'".format(prefixes) - ) - return prefixes[0] - - -def bowtie2_index_from_prefix(prefix): - """ - Given a prefix, return a list of the corresponding bowtie2 index files. - """ - return ( - [ - '{prefix}.{n}.bt2'.format(prefix=prefix, n=n) - for n in range(1, 5) - ] + [ - '{prefix}.rev.{n}.bt2'.format(prefix=prefix, n=n) - for n in range(1, 3) - ] - ) - - -def prefix_from_bowtie2_index(index_files): - """ - Given a list of index files for bowtie2, return the corresponding prefix. 
- """ - if isinstance(index_files, str): - return '.'.join(index_files.replace('.rev', '').split('.')[:-2]) - else: - prefixes = list( - set( - map( - lambda x: '.'.join(x.replace('.rev', '').split('.')[:-2]), - index_files) - ) - ) - if len(prefixes) != 1: - raise ValueError( - "More than one prefix detected from '{0}'".format(prefixes) - ) - return prefixes[0] - -def fastq_arg_from_input(fastqs): - """ - Prepares the correct input FASTQ arguments for bowtie2 and HISAT2 based on - whether or not the sample is paired-end. - - Parameters - ---------- - fastqs : list-like - List or snakemake.input object containing fastq filenames. - """ - - if isinstance(fastqs, str) or len(fastqs) == 1: - fastqs = '-U {0} '.format(fastqs) - else: - assert len(fastqs) == 2 - fastqs = '-1 {0} -2 {1} '.format(*fastqs) - return fastqs - diff --git a/lib/chipseq.py b/lib/chipseq.py index 887bb9f94..62608ed8c 100644 --- a/lib/chipseq.py +++ b/lib/chipseq.py @@ -1,9 +1,11 @@ +from snakemake.io import expand + """ Helpers for ChIP-seq. """ # Example config for reference -# __example_config__ = { +# { # 'peak_calling': { # [ # { @@ -24,7 +26,32 @@ # ] # } # } +# +# This needs to be expanded out to the following patterns: +# +# [ +# 'data/chipseq_peaks/macs2/rep1/peaks.bigbed', +# 'data/chipseq_peaks/macs2/rep2/peaks.bigbed', +# ] +# +# Which in turn needs these bams: +# +# [ +# expand(patterns['merged_techreps'], label=['input_1', 'ip_1']), +# expand(patterns['merged_techreps'], label=['input_2', 'ip_2']), +# +# +def add_bams_to_peak_calling(config): + d = peak_calling_dict(config) + for key, block in d.items(): + peak_calling_run, algorithm = key + block['ip_bams'] = expand('data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam', label=block['ip']) + block['control_bams'] = expand('data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam', label=block['control']) + block['bed'] = f"data/chipseq_peaks/{algorithm}/{peak_calling_run}/peaks.bed" + block['bigbed'] = f"data/chipseq_peaks/{algorithm}/{peak_calling_run}/peaks.bigbed" + d[key] = block + return d def peak_calling_dict(config, algorithm=None): """ @@ -60,11 +87,6 @@ def peak_calling_dict(config, algorithm=None): if key in d: raise ValueError("peak calling run '{0}' already defined".format(key)) - # If metadata key has been provided, then use that to populate the - # block as default values. - metadata = config['references'][config['organism']][config['aligner']['tag']].get('metadata', {}) - block.update(metadata) - d[key] = block return d @@ -139,7 +161,7 @@ def merged_input_for_ip(sampletable, merged_ip): ... input1 input s2cell-1 s2cell-input-1 ... input3 input s2cell-2 s2cell-input-3 ... input9 input s2cell-1 s2cell-input-1'''), - ... sep='\s+') + ... 
sep='\\s+') >>> merged_input_for_ip(df, 's2cell-gaf-1') diff --git a/lib/common.py b/lib/common.py deleted file mode 100644 index 829cc1298..000000000 --- a/lib/common.py +++ /dev/null @@ -1,914 +0,0 @@ -import glob -import subprocess -import time -import os -import warnings -import urllib.request as request -import contextlib -import yaml -import pandas -from Bio import SeqIO -import gzip -import binascii -from lib.imports import resolve_name -from lib import aligners -from lib import utils -from snakemake.shell import shell -from snakemake.io import expand - -# List of possible keys in config that are to be interpreted as paths -PATH_KEYS = [ - 'references_dir', - 'sampletable', - 'sample_dir', - 'aggregation_dir', - 'merged_dir', - 'peaks_dir', - 'hub_config', -] - - -def _is_gzipped(fn): - """ - Filename-independent method of checking if a file is gzipped or not. Uses - the magic number. - - xref https://stackoverflow.com/a/47080739 - """ - with open(fn, 'rb') as f: - return binascii.hexlify(f.read(2)) == b'1f8b' - - -def openfile(tmp, mode): - """ - Returns an open file handle; auto-detects gzipped files. - """ - if _is_gzipped(tmp): - return gzip.open(tmp, mode) - else: - return open(tmp, mode) - - -def resolve_config(config, workdir=None): - """ - Finds the config file. - - Parameters - ---------- - config : str, dict - If str, assume it's a YAML file and parse it; otherwise pass through - - workdir : str - Optional location to specify relative location of all paths in `config` - """ - if isinstance(config, str): - config = yaml.load(open(config), Loader=yaml.FullLoader) - - def rel(pth): - if workdir is None or os.path.isabs(pth): - return pth - return os.path.join(workdir, pth) - for key in PATH_KEYS: - if key in config: - config[key] = rel(config[key]) - return config - - -def gzipped(tmpfiles, outfile): - """ - Cat-and-gzip a list of uncompressed files into a compressed output file. - """ - with gzip.open(outfile, 'wt') as fout: - for f in tmpfiles: - with open(f) as infile: - for line in infile: - fout.write(line) - - -def cat(tmpfiles, outfile): - """ - Simple concatenation of files. - - Note that gzipped files can be concatenated as-is without un- and re- - compressing. - """ - shell('cat {tmpfiles} > {outfile}') - - -def filter_fastas(tmpfiles, outfile, pattern): - """ - Extract records from fasta file(s) given a search pattern. - - Given input gzipped FASTAs, create a new gzipped fasta containing only - records whose description matches `pattern`. - - Parameters - ---------- - tmpfiles : list - gzipped fasta files to look through - - outfile : str - gzipped output fastq file - - pattern : str - Look for this string in each record's description - - """ - def gen(): - for tmp in tmpfiles: - handle = gzip.open(tmp, 'rt') - parser = SeqIO.parse(handle, 'fasta') - for rec in parser: - if pattern not in rec.description: - continue - rec.seq = rec.seq.back_transcribe() - rec.description = rec.name - yield rec - - with gzip.open(outfile, 'wt') as fout: - SeqIO.write(gen(), fout, 'fasta') - - -def twobit_to_fasta(tmpfiles, outfile): - """ - Converts .2bit files to fasta. 
- - Parameters - ---------- - tmpfiles : list - 2bit files to convert - - outfile : str - gzipped output fastq file - """ - # Note that twoBitToFa doesn't support multiple input files, but we want to - # support them with this function - lookup = {i: i + '.fa' for i in tmpfiles} - for i in tmpfiles: - fn = lookup[i] - shell('twoBitToFa {i} {fn}') - - # Make sure we retain the order of the originally-provided files from the - # config when concatenating. - fastas = [lookup[i] for i in tmpfiles] - shell('cat {fastas} | gzip -c > {outfile}') - shell('rm {fastas}') - - -def download_and_postprocess(outfile, config, organism, tag, type_): - """ - Given an output file, figure out what to do based on the config. - - See notes below for details. - - Parameters - ---------- - outfile : str - - config : dict - - organism : str - Which organism to use. Must be a key in the "references" section of the - config. - - tag : str - Which tag for the organism to use. Must be a tag for the organism in - the config - - type_ : str - A supported references type (gtf, fasta) to use. - - Notes - ----- - - This function: - - - uses `organism`, `tag`, `type_` as a key into the config dict to - figure out: - - - what postprocessing function (if any) was specified along with - its optional args - - the URL[s] to download - - - resolves the name of the postprocessing function (if provided) and - imports it - - downloads the URL[s] to tempfile[s] - - calls the imported postprocessing function using the tempfile[s] and - outfile plus any additional specified arguments. - - - The postprocessing function must have one of the following signatures, - where `infiles` contains the list of temporary files downloaded from the - URL or URLs specified, and `outfile` is a gzipped file expected to be - created by the function:: - - def func(infiles, outfile): - pass - - or:: - - def func(infiles, outfile, *args): - pass - - or:: - - def func(infiles, outfile, *args, **kwargs): - pass - - - The function is specified as a string that resolves to an importable - function, e.g., `postprocess: lib.postprocess.dm6.fix` will call a function - called `fix` in the file `lib/postprocess/dm6.py`. - - If the contents of `postprocess:` is a dict, it must have at least the key - `function`, and optionally `args` and/or `kwargs` keys. The `function` key - indicates the importable path to the function. `args` can be a string - or list of arguments that will be provided as additional args to a function - with the second kind of signature above. If `kwargs` is provided, it is - a dict that is passed to the function with the third kind of signature - above. For example:: - - postprocess: - function: lib.postprocess.dm6.fix - args: - - True - - 3 - - or:: - - postprocess: - function: lib.postprocess.dm6.fix - args: - - True - - 3 - kwargs: - skip: exon - - """ - - def default_postprocess(origfn, newfn): - """ - If no other postprocess function is defined, then simply move the - original to the new. 
- """ - shell("mv {origfn} {newfn}") - - block = config['references'][organism][tag][type_] - - # postprocess can be missing, in which case we use the default above - post_process = block.get('postprocess', None) - - if not isinstance(post_process, list): - post_process = [post_process] - - funcs = [] - func_tmpfiles = [] - for i, post_process_block in enumerate(post_process): - if post_process_block is None: - func = default_postprocess - args = () - kwargs = {} - name = None - - # postprocess can have a single string value (indicating the function) or - # it can be a dict with keys "function" and optionally "args". The value of - # "args" can be a string or a list. - else: - if isinstance(post_process_block, dict): - name = post_process_block.get('function', post_process) - args = post_process_block.get('args', ()) - kwargs = post_process_block.get('kwargs', {}) - if isinstance(args, str): - args = (args,) - elif isinstance(post_process_block, str): - name = post_process_block - args = () - kwargs = {} - - # In the special case where there is kwarg beginning and ending - # with "__", this can be a dotted function name so it will be - # resolved here as well and passed along to the postprocessing - # function. - # - # This makes it possible to do things like add ERCC annotations on - # the end of other annotations that themselves need to be - # post-processed. - for kw in kwargs: - if kw.startswith('__') and kw.endswith('__'): - kwargs[kw] = resolve_name(kwargs[kw]) - - # import the function - func = resolve_name(name) - - tmp_outfile = f'{outfile}.{i}.{name}.tmp' - func_tmpfiles.append(tmp_outfile) - funcs.append([func, args, kwargs, tmp_outfile]) - - # The last func's outfile should be the final outfile - funcs[-1][-1] = outfile - - # as described in the docstring above, functions are to assume a list of - # urls - urls = block['url'] - if isinstance(urls, str): - urls = [urls] - - # Download tempfiles into reasonably-named filenames - tmpfiles = ['{0}.{1}.tmp'.format(outfile, i) for i in range(len(urls))] - tmpinputfiles = tmpfiles - try: - for url, tmpfile in zip(urls, tmpfiles): - if url.startswith('file:'): - url = url.replace('file://', '') - shell('cp {url} {tmpfile} 2> {outfile}.log') - else: - shell("wget {url} -O- > {tmpfile} 2> {outfile}.log") - - for func, args, kwargs, outfile in funcs: - func(tmpinputfiles, outfile, *args, **kwargs) - tmpinputfiles = [outfile] - - except Exception as e: - raise e - finally: - for i in tmpfiles + func_tmpfiles: - if os.path.exists(i): - shell('rm {i}') - - -def references_dict(config): - """ - Transforms the references section of the config file. - - The references section of the config file is designed to be human-editable, - and to only need the URL(s). User-specified indexes, conversions, and - post-processing functions can also be added. - - For example, the config might say:: - - human: - gencode: - fasta: - indexes: - - hisat2 - - In this function, we need to convert that "indexes: [hisat2]" into the full - path of the hisat2 index that can be used as input for a Snakemake rule. In - this example, in the dictionary returned below we can then get that path - with `d['human']['gencode']['hisat2']`, or more generally, - `d[organism][tag][type]`. - - Parameters - ---------- - config : dict - - Notes - ----- - - The config file is designed to be easy to edit and use from the user's - standpoint. But it's not so great for practical usage. Here we convert the - config file which has the format:: - - ... references_dir: "/data" - ... 
references: - ... dm6: - ... r6-11: - ... metadata: - ... reference_genome_build: 'dm6' - ... reference_effective_genome_count: 1.2e7 - ... reference_effective_genome_proportion: 0.97 - ... genome: - ... url: "" - ... indexes: - ... - bowtie2 - ... - hisat2 - ... annotation: - ... url: "" - ... conversions: - ... - refflat - ... transcriptome: - ... indexes: - ... - salmon - - To this format:: - - ... 'dm6': { - ... 'r6-11': { - ... 'annotation': '/data/dm6/r6-11/annotation/dm6_r6-11.gtf', - ... 'bowtie2': '/data/dm6/r6-11/genome/bowtie2/dm6_r6-11.1.bt2', - ... 'bowtie2_fasta': '/data/dm6/r6-11/genome/bowtie2/dm6_r6-11.fasta', - ... 'chromsizes': '/data/dm6/r6-11/genome/dm6_r6-11.chromsizes', - ... 'genome': '/data/dm6/r6-11/genome/dm6_r6-11.fasta', - ... 'hisat2': '/data/dm6/r6-11/genome/hisat2/dm6_r6-11.1.ht2', - ... 'hisat2_fasta': '/data/dm6/r6-11/genome/hisat2/dm6_r6-11.fasta', - ... 'refflat': '/data/dm6/r6-11/annotation/dm6_r6-11.refflat', - ... 'salmon': '/data/dm6/r6-11/transcriptome/salmon/dm6_r6-11/versionInfo.json', - ... 'salmon_fasta': '/data/dm6/r6-11/transcriptome/salmon/dm6_r6-11.fasta', - ... 'transcriptome': '/data/dm6/r6-11/transcriptome/dm6_r6-11.fasta', - ... }, - ... } - - """ - if isinstance(config, str): - config = yaml.load(open(config), Loader=yaml.FullLoader) - - references_dir = get_references_dir(config) - - # Map "indexes" value to a pattern specific to each index. - index_extensions = { - 'bowtie2': aligners.bowtie2_index_from_prefix('')[0], - 'hisat2': aligners.hisat2_index_from_prefix('')[0], - 'star': '/Genome', - - # Notes on salmon indexing: - # - pre-1.0 versions had hash.bin - # - post-1.0 versions do not have hash.bin but do have several other - # different .bin files - # - both appear to have versionInfo.json - # - # In order to support both, we use a filename found in common between - # the version. - 'salmon': '/versionInfo.json', - 'kallisto': '/transcripts.idx', - } - - conversion_extensions = { - - 'intergenic': '.intergenic.gtf', - 'refflat': '.refflat', - 'gffutils': '.gtf.db', - 'bed12': '.bed12', - 'genelist': '.genelist', - 'annotation_hub': '.{keytype}.csv', - 'mappings': '.mapping.tsv.gz', - } - - d = {} - conversion_kwargs = {} - - merged_references = config['references'] - - type_extensions = { - 'genome': 'fasta', - 'annotation': 'gtf', - 'transcriptome': 'fasta' - } - - for organism in merged_references.keys(): - d[organism] = {} - for tag in merged_references[organism].keys(): - e = {} - for type_, block in merged_references[organism][tag].items(): - if type_ == 'metadata': - continue - try: - type_extension = type_extensions[type_] - - except KeyError: - raise ValueError( - - "KeyError: " + type_ + "\n" - "\nConfig file format has changed:\n" - " - 'fasta:' -> 'genome:'\n" - " - 'gtf:' -> 'annotation:'\n" - " - new 'transcriptome:' section\n" - "\nSee docs for details\n\n" - - ) - e[type_] = ( - '{references_dir}/' - '{organism}/' - '{tag}/' - '{type_}/' - '{organism}_{tag}.{type_extension}'.format(**locals()) - ) - - # Add conversions if specified. - if type_ == 'annotation': - conversions = block.get('conversions', []) - for conversion in conversions: - kwargs = {} - if isinstance(conversion, dict): - # if conversion is specified as dict, we assume - # that there is only one key, and that key is the - # actual name of the conversion; the corresponding - # value will be kwargs. This is used e.g. for - # gffutils conversion which often need some - # tweaking of args depending on the gtf format. 
- assert len(list(conversion.keys())) == 1 - kwargs = list(conversion.values())[0] - conversion = list(conversion.keys())[0] - - # While the full set of columns for annotation hub are - # not known in advance, we can assume at least the - # keytype provided will be an output file. Fill that in - # here. - if conversion == 'annotation_hub': - keytype = kwargs['keytype'] - ext = conversion_extensions[conversion].format(keytype=keytype) - else: - ext = conversion_extensions[conversion] - output = ( - '{references_dir}/' - '{organism}/' - '{tag}/' - '{type_}/' - '{organism}_{tag}{ext}'.format(**locals()) - ) - e[conversion] = output - - conversion_kwargs[output] = kwargs - - if type_ in ['genome', 'transcriptome']: - # Add indexes if specified - indexes = block.get('indexes', []) - for index in indexes: - ext = index_extensions[index] - - e[index] = ( - '{references_dir}/{organism}/{tag}/{type_}/{index}/{organism}_{tag}{ext}' - .format(**locals()) - ) - - # Each index will get the original fasta symlinked over - # to its directory - e[index + '_fasta'] = ( - '{references_dir}/{organism}/{tag}/{type_}/{index}/{organism}_{tag}.fasta' - .format(**locals()) - ) - - # Only makes sense to have chromsizes for genome fasta, not transcriptome. - if type_ == 'genome': - e['chromsizes'] = ( - '{references_dir}/' - '{organism}/' - '{tag}/' - '{type_}/' - '{organism}_{tag}.chromsizes'.format(**locals()) - ) - d[organism][tag] = e - return d, conversion_kwargs - - -def get_references_dir(config): - """ - Identify the references directory based on config and env vars. - - Returns the references dir, preferring the value of an existing environment - variable `REFERENCES_DIR` over the config entry "references_dir". Raise an - error if either can't be found. - - Parameters - ---------- - config : dict - """ - config = resolve_config(config) - references_dir = os.environ.get( - 'REFERENCES_DIR', config.get('references_dir', None)) - if references_dir is None: - raise ValueError('No references dir specified') - return references_dir - - -def get_sampletable(config): - """ - Return samples and pandas.DataFrame of parsed sampletable. - - Returns the sample IDs and the parsed sampletable from the file specified - in the config. - - The sample IDs are assumed to be the first column of the sampletable. - - Parameters - ---------- - config : dict - """ - config = resolve_config(config) - sampletable = pandas.read_csv(config['sampletable'], comment="#", sep='\t') - samples = sampletable.iloc[:, 0] - return samples, sampletable - - -def get_techreps(sampletable, label): - """ - Return all sample IDs for which the "label" column is `label`. - """ - # since we're not requiring a name but we want to use `loc` - first_col = sampletable.columns[0] - result = list(sampletable.loc[sampletable['label'] == label, first_col]) - - # If we're using a ChIP-seq-like sampletable we can provide a more - # informative error message. - - is_chipseq = 'antibody' in sampletable.columns - if is_chipseq: - err = (""" - No technical replicates found for label '{}'. Check the ChIP-seq config - file to ensure the peak-calling section only specifies values from the - sampletable's "label" column.""".format(label) - ) - else: - err = "No technical replicates found for label '{}'.".format(label) - - if len(result) == 0: - raise ValueError(err) - - return result - - -def load_config(config, missing_references_ok=False): - """ - Loads the config. - - Resolves any included references directories/files and runs the deprecation - handler. 
- """ - if isinstance(config, str): - config = yaml.load(open(config), Loader=yaml.FullLoader) - - # Here we populate a list of reference sections. Items later on the list - # will have higher priority - includes = config.get('include_references', []) - for i in includes: - if not os.path.exists(i): - raise ValueError("include_references: '{}' does not exist".format(i)) - reference_sections = [] - - # First the directories. Directories that come earlier lose to those that - # come later. - for dirname in filter(os.path.isdir, includes): - # Note we're looking recursively for .yaml and .yml, so very large - # reference directories are possible - for fn in glob.glob(os.path.join(dirname, '**/*.y?ml'), - recursive=True): - refs = yaml.load(open(fn), Loader=yaml.FullLoader).get('references', None) - if refs is None: - if not missing_references_ok: - raise ValueError("No 'references:' section in {0}".format(fn)) - else: - reference_sections.append(refs) - - # Now the files - for fn in filter(os.path.isfile, includes): - refs = yaml.load(open(fn), Loader=yaml.FullLoader).get('references', None) - if refs is None: - if not missing_references_ok: - raise ValueError("No 'references:' section in {0}".format(fn)) - else: - reference_sections.append(refs) - - # The last thing we include is the references section as written in the - # config, which wins over all. - reference_sections.append(config.get('references', {})) - - merged_references = {} - for ref in reference_sections: - for organism in ref.keys(): - org_dict = merged_references.get(organism, {}) - for tag in ref[organism].keys(): - org_dict[tag] = ref[organism][tag] - merged_references[organism] = org_dict - config['references'] = merged_references - - # Run the deprecation handler on the final config - config = deprecation_handler(config) - - return config - - -def deprecation_handler(config): - """ - Checks the config to see if anything has been deprecated. - - Also makes any fixes that can be done automatically. - """ - if 'assembly' in config: - config['organism'] = config['assembly'] - warnings.warn( - "'assembly' should be replaced with 'organism' in config files. " - "As a temporary measure, a new 'organism' key has been added with " - "the value of 'assembly'", - DeprecationWarning) - - for org, block1 in config.get('references', {}).items(): - for tag, block2 in block1.items(): - gtf_conversions = block2.get('gtf', {}).get('conversions', []) - for c in gtf_conversions: - if isinstance(c, dict) and 'annotation_hub' in c: - warnings.warn( - "You may want to try the 'mappings' conversion rather " - "than 'annotation_hub' since it works directly off " - "the GTF file rather than assuming concordance between " - "GTF and AnnoationHub instances", - DeprecationWarning) - - return config - - -def is_paired_end(sampletable, sample): - """ - Inspects the sampletable to see if the sample is paired-end or not - - Parameters - ---------- - sampletable : pandas.DataFrame - Contains a "layout" or "LibraryLayout" column (but not both). If the - lowercase value is "pe" or "paired", consider the sample paired-end. - Otherwise consider single-end. - - sample : str - Assumed to be found in the first column of `sampletable` - """ - # We can't fall back to detecting PE based on two fastq files provided for - # each sample when it's an SRA sampletable (which only has SRR accessions). 
- # - # So detect first detect if SRA sampletable based on presence of "Run" - # column and all values of that column starting with "SRR", and then raise - # an error if the Layout column does not exist. - - if "Run" in sampletable.columns: - if all(sampletable["Run"].str.startswith("SRR")): - if "Layout" not in sampletable.columns and "layout" not in sampletable.columns: - raise ValueError( - "Sampletable appears to be SRA, but no 'Layout' column " - "found. This is required to specify single- or paired-end " - "libraries.") - - row = sampletable.set_index(sampletable.columns[0]).loc[sample] - if 'orig_filename_R2' in row: - return True - if 'layout' in row and 'LibraryLayout' in row: - raise ValueError("Expecting column 'layout' or 'LibraryLayout', " - "not both") - try: - return row['layout'].lower() in ['pe', 'paired'] - except KeyError: - pass - try: - return row['LibraryLayout'].lower() in ['pe', 'paired'] - except KeyError: - pass - return False - - -def fill_r1_r2(sampletable, pattern, r1_only=False): - """ - Returns a function intended to be used as a rule's input function. - - The returned function, when provided with wildcards, will return one or two - rendered versions of a pattern depending on SE or PE respectively. - Specifically, given a pattern (which is expected to contain a placeholder - for "{sample}" and "{n}"), look up in the sampletable whether or not it is - paired-end. - - Parameters - ---------- - - sampletable : pandas.DataFrame - Contains a "layout" column with either "SE" or "PE", or "LibraryLayout" - column with "SINGLE" or "PAIRED". If column does not exist, assume SE. - - pattern : str - Must contain at least a "{sample}" placeholder. - - r1_only : bool - If True, then only return the file for R1 even if PE is configured. - """ - def func(wc): - try: - wc.sample - except AttributeError: - raise ValueError( - 'Need "{{sample}}" in pattern ' - '"{pattern}"'.format(pattern=pattern)) - n = [1] - if is_paired_end(sampletable, wc.sample) and not r1_only: - n = [1, 2] - res = expand(pattern, sample=wc.sample, n=n) - return res - return func - - -def pluck(obj, kv): - """ - For a given dict or list that somewhere contains keys `kv`, return the - values of those keys. - - Named after the dplyr::pluck, and implemented based on - https://stackoverflow.com/a/1987195 - """ - if isinstance(obj, list): - for i in obj: - for x in pluck(i, kv): - yield x - elif isinstance(obj, dict): - if kv in obj: - yield obj[kv] - for j in obj.values(): - for x in pluck(j, kv): - yield x - - -def check_url(url, verbose=False): - """ - Try to open -- and then immediately close -- a URL. - - Any exceptions can be handled upstream. - - """ - - # Some notes here: - # - # - A pure python implementation isn't great because urlopen seems to - # cache or hold sessions open or something. EBI servers reject responses - # because too many clients are connected. This doesn't happen using curl. - # - # - Using the requests module doesn't help, because urls can be ftp:// and - # requests doesn't support that. - # - # - Similarly, using asyncio and aiohttp works great for https, but not - # ftp (I couldn't get aioftp to work properly). - # - # - Not all servers support --head. An example of this is - # https://www-s.nist.gov/srmors/certificates/documents/SRM2374_Sequence_v1.FASTA. - # - # - Piping curl to head using the -c arg to use bytes seems to work. - # However, we need to set pipefail (otherwise because head exits 0 the - # whole thing exits 0). 
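# For illustration, pluck() defined above yields every value stored under
# a given key, however deeply nested; this config dict is a placeholder.
cfg = {"references": {"dmel": {"test": {"fasta": {"url": "https://example.org/dm6.fa.gz"}}}}}
assert list(pluck(cfg, "url")) == ["https://example.org/dm6.fa.gz"]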
And in that case, we expect curl to exit every - # time with exit code 23, which is "failed to write output", because of - # the broken pipe. This is handled below. - # - if verbose: - print(f'Checking {url}') - - # Notes on curl args: - # - # --max-time to allow the server some seconds to respond - # --retry to allow multiple tries if transient errors (4xx for FTP, 5xx for HTTP) are found - # --silent to not print anything - # --fail to return non-zero exit codes for 404 (default is exit 0 on hitting 404) - # - # Need to run through bash explicitly to get the pipefail option, which in - # turn means running with shell=True - proc = subprocess.run(f'/bin/bash -o pipefail -c "curl --retry 3 --max-time 10 --silent --fail {url} | head -c 10 > /dev/null"', shell=True) - return proc - - -def check_urls(config, verbose=False): - """ - Given a config filename or existing object, extract the URLs and check - them. - - Parameters - ---------- - - config : str or dict - Config object to inspect - - verbose : bool - Print which URL is being checked - - wait : int - Number of seconds to wait in between checking URLs, to avoid - too-many-connection issues - """ - config = load_config(config, missing_references_ok=True) - failures = [] - urls = list(set(utils.flatten(pluck(config, 'url')))) - for url in urls: - if url.startswith('file://'): - continue - - res = check_url(url, verbose=verbose) - - # we expect exit code 23 because we're triggering SIGPIPE with the - # "|head -c" above. - if res.returncode and res.returncode != 23: - failures.append(f'FAIL with exit code {res.returncode}. Command was: {res.args}') - if failures: - output = '\n '.join(failures) - raise ValueError(f'Found problematic URLs. See https://ec.haxx.se/usingcurl/usingcurl-returns for explanation of exit codes.\n {output}') - - -def check_all_urls_found(verbose=True): - """ - Recursively loads all references that can be included and checks them. - Reports out if there are any failures. - """ - check_urls({'include_references': [ - 'include/reference_configs', - 'test/test_configs', - 'workflows/rnaseq/config', - 'workflows/chipseq/config', - 'workflows/references/config', - ]}, verbose=verbose) - - -def gff2gtf(gff, gtf): - """ - Converts a gff file to a gtf format using the gffread function from Cufflinks - """ - if _is_gzipped(gff[0]): - shell('gzip -d -S .gz.0.tmp {gff} -c | gffread - -T -o- | gzip -c > {gtf}') - else: - shell('gffread {gff} -T -o- | gzip -c > {gtf}') diff --git a/lib/helpers.py b/lib/helpers.py deleted file mode 100644 index 053bca2b1..000000000 --- a/lib/helpers.py +++ /dev/null @@ -1,205 +0,0 @@ -import collections -import re -from itertools import product -import pandas as pd -from snakemake.shell import shell -from snakemake.io import expand, regex -from lib import common - - -class ConfigurationError(Exception): - pass - - -def detect_layout(sampletable): - """ - Identifies whether a sampletable represents single-end or paired-end reads. - - Raises NotImplementedError if there's a mixture. - """ - is_pe = [common.is_paired_end(sampletable, s) for s in sampletable.iloc[:, 0]] - if all(is_pe): - return "PE" - elif not any(is_pe): - return "SE" - else: - p = sampletable.iloc[is_pe, 0].to_list() - s = sampletable.iloc[[not i for i in is_pe], 0].to_list() - if len(p) > len(s): - report = f"SE samples: {s}" - else: - report = f"PE samples: {p}" - raise ValueError(f"Only a single layout (SE or PE) is supported. 
{report}") - - -def fill_patterns(patterns, fill, combination=product): - """ - Fills in a dictionary of patterns with the dictionary or DataFrame `fill`. - - >>> patterns = dict(a='{sample}_R{N}.fastq') - >>> fill = dict(sample=['one', 'two'], N=[1, 2]) - >>> sorted(fill_patterns(patterns, fill)['a']) - ['one_R1.fastq', 'one_R2.fastq', 'two_R1.fastq', 'two_R2.fastq'] - - >>> patterns = dict(a='{sample}_R{N}.fastq') - >>> fill = dict(sample=['one', 'two'], N=[1, 2]) - >>> sorted(fill_patterns(patterns, fill, zip)['a']) - ['one_R1.fastq', 'two_R2.fastq'] - - >>> patterns = dict(a='{sample}_R{N}.fastq') - >>> fill = pd.DataFrame({'sample': ['one', 'two'], 'N': [1, 2]}) - >>> sorted(fill_patterns(patterns, fill)['a']) - ['one_R1.fastq', 'two_R2.fastq'] - - """ - # In recent Snakemake versions (e.g., this happens in 5.4.5) file patterns - # with no wildcards in them are removed from expand when `zip` is used as - # the combination function. - # - # For example, in 5.4.5: - # - # expand('x', zip, d=[1,2,3]) == [] - # - # But in 4.4.0: - # - # expand('x', zip, d=[1,2,3]) == ['x', 'x', 'x'] - - def update(d, u, c): - for k, v in u.items(): - if isinstance(v, collections.abc.Mapping): - r = update(d.get(k, {}), v, c) - d[k] = r - else: - if isinstance(fill, pd.DataFrame): - d[k] = list(set(expand(u[k], zip, **fill.to_dict("list")))) - else: - d[k] = list(set(expand(u[k], c, **fill))) - if not d[k]: - d[k] = [u[k]] - return d - - d = {} - return update(d, patterns, combination) - - -def extract_wildcards(pattern, target): - """ - Return a dictionary of wildcards and values identified from `target`. - - Returns None if the regex match failed. - - Parameters - ---------- - pattern : str - Snakemake-style filename pattern, e.g. ``{output}/{sample}.bam``. - - target : str - Filename from which to extract wildcards, e.g., ``data/a.bam``. - - Examples - -------- - >>> pattern = '{output}/{sample}.bam' - >>> target = 'data/a.bam' - >>> expected = {'output': 'data', 'sample': 'a'} - >>> assert extract_wildcards(pattern, target) == expected - >>> assert extract_wildcards(pattern, 'asdf') is None - """ - m = re.compile(regex(pattern)).match(target) - if m: - return m.groupdict() - - -def rscript(string, scriptname, log=None): - """ - Saves the string as `scriptname` and then runs it - - Parameters - ---------- - string : str - Filled-in template to be written as R script - - scriptname : str - File to save script to - - log : str - File to redirect stdout and stderr to. If None, no redirection occurs. 
- """ - with open(scriptname, "w") as fout: - fout.write(string) - if log: - _log = "> {0} 2>&1".format(log) - else: - _log = "" - shell("Rscript {scriptname} {_log}") - - -def check_unique_fn(df): - """ - Raises an error if the fastq filenames are not unique - """ - fns = df["orig_filename"] - if "orig_filename_R2" in df.columns: - fns = pd.concat([fns, df["orig_filename_R2"]]) - if len(fns.unique()) < len(fns): - raise ValueError("Fastq filenames non unique, check the sampletable\n") - - -def check_unique_samplename(df): - """ - Raises an error if the samplenames are not unique - """ - ns = df.index - if len(ns.unique()) < len(ns): - raise ConfigurationError("Samplenames non unique, check the sampletable\n") - - -def preflight(config): - """ - Performs verifications on config and sampletable files - - Parameters - ---------- - config: yaml config object - """ - sampletable = pd.read_table(config["sampletable"], index_col=0, comment="#") - check_unique_samplename(sampletable) - if "orig_filename" in sampletable.columns: - check_unique_fn(sampletable) - - -def rnaseq_preflight(c): - if "kallisto" not in c.config: - raise ConfigurationError( - """ - Starting in v1.8, an additional 'kallisto' argument is expected - in the config file. Note that in the future this may be - automatically included, but for now please add the following to the - config, where 'tagname' is the tag for the reference of interest: - - kallisto: - tag: "tagname" - """ - ) - - -def chipseq_preflight(c): - pass - - -def strand_arg_lookup(config, lookup): - """ - Given a config object and lookup dictionary, confirm that the config has - correctly specified strandedness and then return the value for that key. - """ - if not config.stranded: - raise ConfigurationError( - "Starting in v1.8, 'stranded' is required in the config file. " - "Values can be 'unstranded', 'fr-firststrand' (R1 aligns antisense to original transcript), " - "or 'fr-secondstrand' (R1 aligns sense to original transcript). If you are not sure, " - "run the workflow with only the 'strand_check' rule, like " - "'snakemake -j 5 strand_check'." - ) - if config.stranded not in lookup: - keys = list(lookup.keys()) - raise KeyError(f"'{config.stranded}' not one of {keys}") - return lookup[config.stranded] diff --git a/lib/imports.py b/lib/imports.py deleted file mode 100644 index f790ef6fb..000000000 --- a/lib/imports.py +++ /dev/null @@ -1,22 +0,0 @@ -def resolve_name(name): - """ - Imports a specific object from a dotted path and returns just that object. 
- - From nose.utils.resolve_name (with the logging parts taken out) which in - turn is from unittest.TestLoader.loadTestByName - """ - parts = name.split('.') - parts_copy = parts[:] - while parts_copy: - try: - module = __import__('.'.join(parts_copy)) - break - except ImportError: - del parts_copy[-1] - if not parts_copy: - raise - parts = parts[1:] - obj = module - for part in parts: - obj = getattr(obj, part) - return obj diff --git a/lib/lcdbwf/R/plotting.R b/lib/lcdbwf/R/plotting.R index 9e7bc8e5c..f4aa9c41b 100644 --- a/lib/lcdbwf/R/plotting.R +++ b/lib/lcdbwf/R/plotting.R @@ -268,7 +268,7 @@ vargenes_heatmap <- function(rld, cols_for_grouping, n=50){ mat <- mat - rowMeans(mat) df <- as.data.frame(colData(rld)[, cols_for_grouping]) rownames(df) <- colnames(rld) - colnames(df) <- cols.for.grouping + colnames(df) <- cols_for_grouping pheatmap(mat, annotation_col=df, cluster_cols=TRUE) } diff --git a/lib/patterns_targets.py b/lib/patterns_targets.py deleted file mode 100644 index 542d41161..000000000 --- a/lib/patterns_targets.py +++ /dev/null @@ -1,252 +0,0 @@ -""" -This module handles the reading and filling-in of patterns. It can be used from -within Snakefiles or in downstream (figure-making) scripts. -""" - -import os -import collections -import yaml -from . import common -from . import chipseq -from . import helpers - -HERE = os.path.abspath(os.path.dirname(__file__)) - -# Note: when adding support for new peak callers, add them here. -PEAK_CALLERS = ['macs2', 'spp', 'sicer', 'epic2'] - - -def update_recursive(d, u): - """ - Update dictionary `d` with items in dictionary `u`, recursively - """ - for k, v in u.items(): - if isinstance(v, collections.abc.Mapping): - d[k] = update_recursive(d.get(k, {}), v) - else: - d[k] = v - return d - - -class SeqConfig(object): - def __init__(self, config, patterns, workdir=None): - """ - This class takes care of common tasks related to config and patterns - files (reading the sampletable, etc) but is intended to be subclassed. - - Parameters - ---------- - config : str or dict - - patterns : str - Path to patterns YAML file - - workdir : str - Config, patterns, and all paths in `config` should be interpreted - as relative to `workdir` - """ - self.path = None - self.workdir = '.' - if workdir is not None: - config = os.path.join(workdir, config) - patterns = os.path.join(workdir, patterns) - self.workdir = workdir - - if isinstance(config, str): - self.path = config - - self.config = common.load_config( - common.resolve_config(config, workdir)) - - stranded = self.config.get('stranded', None) - self.stranded = None - if stranded: - if stranded in ('unstranded'): - self.stranded = 'unstranded' - elif stranded in ('fr-firststrand', 'ISR', 'SR', 'reverse'): - self.stranded = 'fr-firststrand' - elif stranded in ('fr-secondstrand', 'ISF', 'SF', 'forward'): - self.stranded = 'fr-secondstrand' - - # Read the config file and extract all sort of useful bits. This mostly - # uses the `common` module to handle the details. 
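# Small example of update_recursive() above: nested dicts are merged rather
# than overwritten.
d = {"peaks": {"macs2": ["run1"]}}
update_recursive(d, {"peaks": {"spp": ["run2"]}})
assert d == {"peaks": {"macs2": ["run1"], "spp": ["run2"]}}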
- self.config['references_dir'] = common.get_references_dir(self.config) - self.samples, self.sampletable = common.get_sampletable(self.config) - self.refdict, self.conversion_kwargs = common.references_dict(self.config) - self.organism = self.config['organism'] - self.patterns = yaml.load(open(patterns), Loader=yaml.FullLoader) - self.is_paired = helpers.detect_layout(self.sampletable) == 'PE' - if self.is_paired: - self.n = [1, 2] - else: - self.n = [1] - - helpers.preflight(self.config) - -class RNASeqConfig(SeqConfig): - def __init__(self, config, patterns, workdir=None): - """ - Config object specific to RNA-seq workflows. - - Fills in patterns to create targets by handling the by-sample and - by-aggregate sections separately. - - Parameters - ---------- - - config : dict - - patterns : str - Path to patterns YAML file - - workdir : str - Config, patterns, and all paths in `config` should be interpreted - as relative to `workdir` - """ - SeqConfig.__init__(self, config, patterns, workdir) - - self.fill = dict(sample=self.samples, n=self.n) - self.patterns_by_aggregation = self.patterns.pop('patterns_by_aggregate', None) - self.targets = helpers.fill_patterns(self.patterns, self.fill, zip) - - # Then the aggregation - if self.patterns_by_aggregation is not None and 'merged_bigwigs' in self.config: - self.fill_by_aggregation = dict( - merged_bigwig_label=self.config['merged_bigwigs'].keys(), - ) - self.targets_by_aggregation = helpers.fill_patterns( - self.patterns_by_aggregation, - self.fill_by_aggregation - ) - self.targets.update(self.targets_by_aggregation) - self.patterns.update(self.patterns_by_aggregation) - - helpers.rnaseq_preflight(self) - - -class ChIPSeqConfig(SeqConfig): - def __init__(self, config, patterns, workdir=None): - """ - Config object specific to ChIP-seq workflows. - - Fills in patterns to create targets by handling the by-sample, by-peak, - and by-aggregate sections separately. - - Parameters - ---------- - - config : dict - - patterns : str - Path to patterns YAML file - - workdir : str - Config, patterns, and all paths in `config` should be interpreted - as relative to `workdir` - """ - SeqConfig.__init__(self, config, patterns, workdir) - - self.targets = {} - - # For ChIP-seq, the structure of the patterns is quite different for - # samples than it is for peaks. For example, the peaks do not have any - # sample info in the filenames but aggregate possibly many different samples - # - # So construct them separately, and then later update self.patterns and - # self.targets. - # - # The averaged bigwigs are also aggregated, but in a different way. - # They will be handled separately. - # - # First, the samples... - self.patterns_by_sample = self.patterns['patterns_by_sample'] - self.fill_by_sample = dict( - n=self.n, - sample=self.samples.values, - label=self.sampletable.label.values, - ip_label=self.sampletable.label[ - self.sampletable.antibody != 'input'].values - ) - self.targets_by_sample = helpers.fill_patterns( - self.patterns_by_sample, self.fill_by_sample) - - self.targets.update(self.targets_by_sample) - self.patterns.update(self.patterns_by_sample) - - # Then the aggregation... 
- self.patterns_by_aggregation = self.patterns.pop('patterns_by_aggregate', None) - if self.patterns_by_aggregation is not None and 'merged_bigwigs' in self.config: - self.fill_by_aggregation = dict( - merged_bigwig_label=self.config['merged_bigwigs'].keys(), - ) - self.targets_by_aggregation = helpers.fill_patterns( - self.patterns_by_aggregation, - self.fill_by_aggregation - ) - self.targets.update(self.targets_by_aggregation) - self.patterns.update(self.patterns_by_aggregation) - - # Then the peaks... - # - - self.patterns_by_peaks = self.patterns['patterns_by_peaks'] - self.targets_for_peaks = {} - - # We need to fill in just those peak-calling runs that are specified - # for each peak-caller. For reference, here's an example - # `patterns_by_peaks` from the YAML: - # - # peaks: - # macs2: '{peak_calling}/macs2/{macs2_run}/peaks.bed' - # spp: '{peak_calling}/spp/{spp_run}/peaks.bed' - # bigbed: - # macs2: '{peak_calling}/macs2/{macs2_run}/peaks.bigbed' - # spp: '{peak_calling}/spp/{spp_run}/peaks.bigbed' - - - # Also note that the snakefile's all rule uses - # utils.flatten(c.targets['peaks']), but in the case where no - # peak-calling runs are specified these should be initialized, - # otherwise we'll get a KeyError. - self.targets['peaks'] = [] - self.targets['bigbed'] = [] - - for pc in PEAK_CALLERS: - # Extract out just the subset of `patterns_by_peaks` for this - # peak-caller e.g., from the example above, if pc='macs2' this - # would only be: - # - # peaks: - # macs2: '{peak_calling}/macs2/{macs2_run}/peaks.bed' - # bigbed: - # macs2: '{peak_calling}/macs2/{macs2_run}/peaks.bigbed' - # - _peak_patterns = { - k: {pc: v[pc]} for k, v in self.patterns_by_peaks.items() - } - - - # Fix for issue #166, which was caused by commit 8a211122: - # - # If no runs for the peak-caller are configured, this will be - # empty and we should continue on. - peaks_to_fill = list(chipseq.peak_calling_dict(self.config, algorithm=pc).keys()) - - if not peaks_to_fill: - continue - - _fill = {pc + '_run': peaks_to_fill} - - # The trick here is the recursive updating of targets_for_peaks. - # We're adding the filled-in runs of each peak caller to the - # targets as they're built. 
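# Self-contained illustration of expanding a single caller's peak pattern with
# fill_patterns(); the run name and peak_calling prefix are placeholders and
# are supplied here only so the pattern expands fully.
_peak_patterns = {"peaks": {"macs2": "{peak_calling}/macs2/{macs2_run}/peaks.bed"}}
_fill = {"macs2_run": ["gaf-wingdisc-1"], "peak_calling": ["data/chipseq_peaks"]}
filled = helpers.fill_patterns(_peak_patterns, _fill)
# filled == {"peaks": {"macs2": ["data/chipseq_peaks/macs2/gaf-wingdisc-1/peaks.bed"]}}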
- update_recursive( - self.targets_for_peaks, - helpers.fill_patterns(_peak_patterns, _fill) - ) - - - self.targets.update(self.targets_for_peaks) - self.patterns.update(self.patterns_by_peaks) - - helpers.chipseq_preflight(self) diff --git a/lib/postprocess/adapters.py b/lib/postprocess/adapters.py deleted file mode 100644 index 1d8ab7ab9..000000000 --- a/lib/postprocess/adapters.py +++ /dev/null @@ -1,6 +0,0 @@ -from snakemake.shell import shell - -def fasta_postprocess(origfn, newfn): - shell( - "gzip -c {origfn} > {newfn} " - "&& rm {origfn}") diff --git a/lib/postprocess/dicty.py b/lib/postprocess/dicty.py deleted file mode 100644 index 237cbbdda..000000000 --- a/lib/postprocess/dicty.py +++ /dev/null @@ -1,18 +0,0 @@ -from Bio import SeqIO -import gzip -from snakemake.shell import shell - -def rrna_postprocess(tmpfiles, outfile): - def gen(): - for tmp in tmpfiles: - handle = gzip.open(tmp, 'rt') - parser = SeqIO.parse(handle, 'fasta') - for rec in parser: - if 'Dictyostelium discoideum' not in rec.description: - continue - rec.seq = rec.seq.back_transcribe() - rec.description = rec.name - yield rec - - with gzip.open(outfile, 'wt') as fout: - SeqIO.write(gen(), fout, 'fasta') diff --git a/lib/postprocess/hg19.py b/lib/postprocess/hg19.py deleted file mode 100644 index 8d0424323..000000000 --- a/lib/postprocess/hg19.py +++ /dev/null @@ -1,3 +0,0 @@ -from snakemake.shell import shell -def plus_lncrna_fasta_postprocess(tmpfiles, outfile): - shell('cat {tmpfiles} > {outfile}') diff --git a/lib/postprocess/hg38.py b/lib/postprocess/hg38.py deleted file mode 100644 index d21f54ada..000000000 --- a/lib/postprocess/hg38.py +++ /dev/null @@ -1,14 +0,0 @@ -import pybedtools -import gzip -from snakemake.shell import shell -import os - - -def strip_ensembl_version(infiles, outfile): - def transform(f): - f.attrs['gene_id'] = f.attrs['gene_id'].split('.')[0] - return f - with gzip.open(outfile, 'wt') as fout: - for infile in infiles: - for feature in pybedtools.BedTool(infile): - fout.write(str(transform(feature))) diff --git a/lib/postprocess/merge.py b/lib/postprocess/merge.py deleted file mode 100644 index c3d1686e0..000000000 --- a/lib/postprocess/merge.py +++ /dev/null @@ -1,32 +0,0 @@ -import os -from snakemake.shell import shell -from ..imports import resolve_name - -def file_merge(origfns, newfn, *args): - tmpfiles = ['{0}.{1}.sub.tmp'.format(newfn, i) for i in range(len(origfns))] - try: - for origfn, tmpfile, ppfunc in zip(origfns, tmpfiles, args): - print(ppfunc) - func = resolve_name(ppfunc) - func(origfn, tmpfile) - - if os.path.exists(newfn): - shell('rm {newfn}') - - if newfn.endswith('.gz'): - fn = newfn.replace('.gz', '') - for tmpfile in tmpfiles: - shell("gunzip -c {tmpfile} >> {fn}") - shell("gzip {fn}") - else: - for tmpfile in tmpfiles: - shell("cat {tmpfile} >> {newfn}") - - except Exception as e: - raise e - - finally: - for i in tmpfiles: - if os.path.exists(i): - shell('rm {i}') - diff --git a/lib/postprocess/utils.py b/lib/postprocess/utils.py index abb872880..f8fc64a62 100644 --- a/lib/postprocess/utils.py +++ b/lib/postprocess/utils.py @@ -1,11 +1,48 @@ import sys import os -import pandas as pd -import gzip import re +import gzip +import zipfile +import shutil +import tempfile +import pandas as pd + here = os.path.dirname(os.path.abspath(__file__)) -sys.path.insert(0, os.path.join(here, '../../lib')) -from common import openfile +sys.path.insert(0, os.path.join(here, "../../lib")) +from utils import openfile + + + +def extract_from_zip(tmpfiles, outfile, 
path_in_zip): + """ + Parameters + ---------- + + tmpfiles : list + One-item list containing zip file + + outfile : str + gzipped output file to create + + path_in_zip : str + Path within zipfile to extract. You can identify the path using unzip + -l x.zip from bash. + """ + assert len(tmpfiles) == 1, f"expected single zip file, got {tmpfiles}" + + extraction_dir = tempfile.mkdtemp() + + with zipfile.ZipFile(tmpfiles[0], "r") as z: + z.extract(path_in_zip, path=extraction_dir) + + full_path_to_extracted = os.path.join(extraction_dir, path_in_zip) + + with open(full_path_to_extracted, "rb") as fin: + with gzip.open(outfile, "wb") as fout: + shutil.copyfileobj(fin, fout) + + shutil.rmtree(extraction_dir) + def match_gtf_9th(tmpfiles, outfile, strmatch, optstrand = "None"): """ diff --git a/lib/test_suite.py b/lib/test_suite.py index 21b9c0527..eb018c3ff 100644 --- a/lib/test_suite.py +++ b/lib/test_suite.py @@ -1,88 +1 @@ -import os -import pprint -from textwrap import dedent -from . import common - - -def test_config_loading(tmpdir): - f0 = tmpdir.mkdir('subdir').join('file0.yaml') - dir_to_include = tmpdir.join('subdir') - f0.write(dedent(''' - references: - species_to_keep: - tag_from_directory: - fasta: - url: "https://from_directory" - - # Will get overwritten by a specific file - tag_from_file: - fasta: - url: "https://from_directory" - - # Will get overwritten by specific file, and then that will get - # overwritten by the config - tag_from_config: - fasta: - url: "https://from_directory" - ''')) - f1 = tmpdir.join('subdir', 'file1.yaml') - f1.write(dedent(''' - references: - species2: - tag_only_in_directory: - fasta: - url: "" - indexes: - - bowtie2 - ''')) - - f2 = tmpdir.join('file1.yaml') - f2.write(dedent(''' - references: - species_to_keep: - tag_from_file: - fasta: - url: "https://from_file" - tag_from_config: - fasta: - url: "https://from_file" - - ''')) - - f3 = tmpdir.join('file3.yaml') - f3.write(dedent(''' - references_dir: "/data" - references: - species_to_keep: - tag_from_config: - fasta: - url: "https://from_config" - - include_references: - - {dir_to_include} - - {f2} - '''.format(dir_to_include=dir_to_include, f2=f2))) - - config = common.load_config(str(f3)) - - assert config == { - 'references_dir': '/data', - 'include_references': [ - '{0}/subdir'.format(str(tmpdir)), - '{0}/file1.yaml'.format(str(tmpdir)), - ], - 'references': { - 'species_to_keep': { - 'tag_from_config': { - 'fasta': {'url': 'https://from_config'}}, - 'tag_from_directory': { - 'fasta': {'url': 'https://from_directory'}}, - 'tag_from_file': { - 'fasta': {'url': 'https://from_file'}} - }, - 'species2': { - 'tag_only_in_directory': { - 'fasta': {'indexes': ['bowtie2'], 'url': ''}}}, - }, - } - +from . 
import utils diff --git a/lib/utils.py b/lib/utils.py index 3c2808905..0e5cc9e2c 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -1,9 +1,53 @@ -import os -import contextlib +import binascii import collections +import contextlib +import gzip +import os +import re +import subprocess +import warnings from collections.abc import Iterable +from itertools import product + +import pandas +import pandas as pd +import yaml +from Bio import SeqIO +from snakemake.io import expand, regex_from_filepattern from snakemake.shell import shell +# Small helper functions + +def render_r1_r2(pattern): + return expand(pattern, sample='{sample}', n=c.n) + +def render_r1_only(pattern): + return expand(pattern, sample='{sample}', n=1) + + +def resolve_name(name): + """ + Imports a specific object from a dotted path and returns just that object. + + From nose.utils.resolve_name (with the logging parts taken out) which in + turn is from unittest.TestLoader.loadTestByName + """ + parts = name.split(".") + parts_copy = parts[:] + while parts_copy: + try: + module_ = __import__(".".join(parts_copy)) + break + except ImportError: + del parts_copy[-1] + if not parts_copy: + raise + parts = parts[1:] + obj = module_ + for part in parts: + obj = getattr(obj, part) + return obj + @contextlib.contextmanager def temp_env(env): @@ -52,22 +96,19 @@ def gen(): def test_flatten(): - assert ( - sorted( - flatten( - { - "a": { - "b": { - "c": ["a", "b", "c"], - }, + assert sorted( + flatten( + { + "a": { + "b": { + "c": ["a", "b", "c"], }, - "x": ["e", "f", "g"], - "y": {"z": "d"}, - } - ) + }, + "x": ["e", "f", "g"], + "y": {"z": "d"}, + } ) - == ["a", "b", "c", "d", "e", "f", "g"] - ) + ) == ["a", "b", "c", "d", "e", "f", "g"] assert flatten("a", True) == "a" assert flatten(["a"], True) == "a" @@ -171,7 +212,7 @@ def boolean_labels(names, idx, mapping={True: "AND", False: "NOT"}, strip="AND_" a_AND_b_AND_c_NOT_d_AND_e """ s = [] - for i, (n, x) in enumerate(zip(names, idx)): + for n, x in zip(names, idx): s.append(mapping[x] + "_" + n) s = "_".join(s) if s.startswith(strip): @@ -191,7 +232,188 @@ def make_relative_symlink(target, linkname): linkbase = os.path.basename(linkname) if not os.path.exists(linkdir): shell("mkdir -p {linkdir}") - shell("cd {linkdir}; ln -sf {relative_target} {linkbase}") + shell(f"cd {linkdir}; ln -sf {relative_target} {linkbase}") + + +def extract_wildcards(pattern, target): + """ + Return a dictionary of wildcards and values identified from `target`. + + Returns None if the regex match failed. + + Parameters + ---------- + pattern : str + Snakemake-style filename pattern, e.g. ``{output}/{sample}.bam``. + + target : str + Filename from which to extract wildcards, e.g., ``data/a.bam``. + + Examples + -------- + >>> pattern = '{output}/{sample}.bam' + >>> target = 'data/a.bam' + >>> expected = {'output': 'data', 'sample': 'a'} + >>> assert extract_wildcards(pattern, target) == expected + >>> assert extract_wildcards(pattern, 'asdf') is None + """ + m = re.compile(regex_from_filepattern(pattern)).match(target) + if m: + return m.groupdict() + + +def _is_gzipped(fn): + """ + Filename-independent method of checking if a file is gzipped or not. Uses + the magic number. + + xref https://stackoverflow.com/a/47080739 + """ + with open(fn, "rb") as f: + return binascii.hexlify(f.read(2)) == b"1f8b" + + +def openfile(tmp, mode): + """ + Returns an open file handle; auto-detects gzipped files. 
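# Example of openfile()/_is_gzipped(): the same call works for plain and
# gzipped files; the filename is a placeholder.
with openfile("annotation.gtf.gz", "rt") as fh:
    header = fh.readline()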
+ """ + if _is_gzipped(tmp): + return gzip.open(tmp, mode) + else: + return open(tmp, mode) + + +def gzipped(tmpfiles, outfile): + """ + Cat-and-gzip a list of uncompressed files into a compressed output file. + """ + with gzip.open(outfile, "wt") as fout: + for f in tmpfiles: + with open(f) as infile: + for line in infile: + fout.write(line) + + +def cat(tmpfiles, outfile): + """ + Simple concatenation of files. + + Note that gzipped files can be concatenated as-is without un- and re- + compressing. + """ + shell(f"cat {tmpfiles} > {outfile}") + + +def is_paired_end(sampletable, sample): + """ + Inspects the sampletable to see if the sample is paired-end or not + + Parameters + ---------- + sampletable : pandas.DataFrame + Contains a "layout" or "LibraryLayout" column (but not both). If the + lowercase value is "pe" or "paired", consider the sample paired-end. + Otherwise consider single-end. + + sample : str + Assumed to be found in the first column of `sampletable` + """ + # We can't fall back to detecting PE based on two fastq files provided for + # each sample when it's an SRA sampletable (which only has SRR accessions). + # + # So detect first detect if SRA sampletable based on presence of "Run" + # column and all values of that column starting with "SRR", and then raise + # an error if the Layout column does not exist. + + if "Run" in sampletable.columns: + if all(sampletable["Run"].str.startswith("SRR")): + if ( + "Layout" not in sampletable.columns + and "layout" not in sampletable.columns + ): + raise ValueError( + "Sampletable appears to be SRA, but no 'Layout' column " + "found. This is required to specify single- or paired-end " + "libraries." + ) + + row = sampletable.set_index(sampletable.columns[0]).loc[sample] + if "orig_filename_R2" in row: + return True + if "layout" in row and "LibraryLayout" in row: + raise ValueError("Expecting column 'layout' or 'LibraryLayout', " "not both") + try: + return row["layout"].lower() in ["pe", "paired"] + except KeyError: + pass + try: + return row["LibraryLayout"].lower() in ["pe", "paired"] + except KeyError: + pass + return False + + +def fill_r1_r2(sampletable, pattern, r1_only=False): + """ + Returns a function intended to be used as a rule's input function. + + The returned function, when provided with wildcards, will return one or two + rendered versions of a pattern depending on SE or PE respectively. + Specifically, given a pattern (which is expected to contain a placeholder + for "{sample}" and "{n}"), look up in the sampletable whether or not it is + paired-end. + + Parameters + ---------- + + sampletable : pandas.DataFrame + Contains a "layout" column with either "SE" or "PE", or "LibraryLayout" + column with "SINGLE" or "PAIRED". If column does not exist, assume SE. + + pattern : str + Must contain at least a "{sample}" placeholder. + + r1_only : bool + If True, then only return the file for R1 even if PE is configured. + """ + + def func(wc): + try: + wc.sample + except AttributeError: + raise ValueError( + 'Need "{{sample}}" in pattern ' '"{pattern}"'.format(pattern=pattern) + ) + n = [1] + if is_paired_end(sampletable, wc.sample) and not r1_only: + n = [1, 2] + res = expand(pattern, sample=wc.sample, n=n) + return res + + return func + + +def pluck(obj, kv): + """ + For a given dict or list that somewhere contains keys `kv`, return the + values of those keys. 
+ + Named after the dplyr::pluck, and implemented based on + https://stackoverflow.com/a/1987195 + """ + if isinstance(obj, list): + for i in obj: + for x in pluck(i, kv): + yield x + elif isinstance(obj, dict): + if kv in obj: + yield obj[kv] + for j in obj.values(): + for x in pluck(j, kv): + yield x + + +# Functions for conveniently working with resources def autobump(*args, **kwargs): @@ -308,7 +530,7 @@ def autobump(*args, **kwargs): raise ValueError(f"Unhandled args and kwargs: {args}, {kwargs}") def f(wildcards, attempt): - return baseline_converted + (attempt - 1) * increment_converted + return baseline_converted + (attempt - 1) * increment_converted return f @@ -319,3 +541,655 @@ def gb(size_in_gb): def hours(time_in_hours): return time_in_hours * 60 + + +# Config parsing and handling + + +class ConfigurationError(Exception): + pass + + +def detect_layout(sampletable): + """ + Identifies whether a sampletable represents single-end or paired-end reads. + + Raises NotImplementedError if there's a mixture. + """ + is_pe = [is_paired_end(sampletable, s) for s in sampletable.iloc[:, 0]] + if all(is_pe): + return "PE" + elif not any(is_pe): + return "SE" + else: + p = sampletable.iloc[is_pe, 0].to_list() + s = sampletable.iloc[[not i for i in is_pe], 0].to_list() + if len(p) > len(s): + report_ = f"SE samples: {s}" + else: + report_ = f"PE samples: {p}" + raise ValueError(f"Only a single layout (SE or PE) is supported. {report_}") + + +def fill_patterns(patterns, fill, combination=product): + """ + Fills in a dictionary of patterns with the dictionary `fill`. + + >>> patterns = dict(a='{sample}_R{N}.fastq') + >>> fill = dict(sample=['one', 'two', 'three'], N=[1, 2]) + >>> sorted(fill_patterns(patterns, fill)['a']) + ['one_R1.fastq', 'one_R2.fastq', 'three_R1.fastq', 'three_R2.fastq', 'two_R1.fastq', 'two_R2.fastq'] + + If using `zip` as a combination, checks to ensure all values in `fill` are + the same length to avoid truncated output. + + This fails: + + >>> patterns = dict(a='{sample}_R{N}.fastq') + >>> fill = dict(sample=['one', 'two', 'three'], N=[1, 2]) + >>> sorted(fill_patterns(patterns, fill, zip)['a']) # doctest: +IGNORE_EXCEPTION_DETAIL + Traceback (most recent call last): + ... + ValueError: {'sample': ['one', 'two', 'three'], 'N': [1, 2]} does not have the same number of entries for each key + + But this works: + + >>> patterns = dict(a='{sample}_R{N}.fastq') + >>> fill = dict(sample=['one', 'one', 'two', 'two', 'three', 'three'], N=[1, 2, 1, 2, 1, 2]) + >>> sorted(fill_patterns(patterns, fill, zip)['a']) + ['one_R1.fastq', 'one_R2.fastq', 'three_R1.fastq', 'three_R2.fastq', 'two_R1.fastq', 'two_R2.fastq'] + + """ + # In recent Snakemake versions (e.g., this happens in 5.4.5) file patterns + # with no wildcards in them are removed from expand when `zip` is used as + # the combination function. 
+ # + # For example, in 5.4.5: + # + # expand('x', zip, d=[1,2,3]) == [] + # + # But in 4.4.0: + # + # expand('x', zip, d=[1,2,3]) == ['x', 'x', 'x'] + + if combination == zip: + lengths = set([len(v) for v in fill.values()]) + if len(lengths) != 1: + raise ValueError( + f"{fill} does not have the same number of entries for each key" + ) + + def update(d, u, c): + for k, v in u.items(): + if isinstance(v, collections.abc.Mapping): + r = update(d.get(k, {}), v, c) + d[k] = r + else: # not a dictionary, so we're at a leaf + if isinstance(fill, pd.DataFrame): + d[k] = list(set(expand(u[k], zip, **fill.to_dict("list")))) + else: + d[k] = list(set(expand(u[k], c, **fill))) + if not d[k]: + d[k] = [u[k]] + return d + + d = {} + return update(d, patterns, combination) + + +def rscript(string, scriptname, log=None): + """ + Saves the string as `scriptname` and then runs it + + Parameters + ---------- + string : str + Filled-in template to be written as R script + + scriptname : str + File to save script to + + log : str + File to redirect stdout and stderr to. If None, no redirection occurs. + """ + with open(scriptname, "w") as fout: + fout.write(string) + if log: + _log = "> {0} 2>&1".format(log) + else: + _log = "" + shell("Rscript {scriptname} {_log}") + + +def check_unique_fn(df): + """ + Raises an error if the fastq filenames are not unique + """ + fns = df["orig_filename"] + if "orig_filename_R2" in df.columns: + fns = pd.concat([fns, df["orig_filename_R2"]]) + if len(fns.unique()) < len(fns): + raise ValueError("Fastq filenames non unique, check the sampletable\n") + + +def check_unique_samplename(df): + """ + Raises an error if the samplenames are not unique + """ + ns = df.index + if len(ns.unique()) < len(ns): + raise ConfigurationError("Samplenames non unique, check the sampletable\n") + + +def preflight(config): + """ + Performs verifications on config and sampletable files + + Parameters + ---------- + config: yaml config object + """ + sampletable = pd.read_table(config["sampletable"], index_col=0, comment="#") + check_unique_samplename(sampletable) + if "orig_filename" in sampletable.columns: + check_unique_fn(sampletable) + + +def rnaseq_preflight(c): + pass + + +def chipseq_preflight(c): + pass + + +def strand_arg_lookup(config, lookup): + """ + Given a config object and lookup dictionary, confirm that the config has + correctly specified strandedness and then return the value for that key. + """ + if not config.stranded: + raise ConfigurationError( + "Starting in v1.8, 'stranded' is required in the config file. " + "Values can be 'unstranded', 'fr-firststrand' (R1 aligns antisense to original transcript), " + "or 'fr-secondstrand' (R1 aligns sense to original transcript). If you are not sure, " + "run the workflow with only the 'strand_check' rule, like " + "'snakemake -j 5 strand_check'." + ) + if config.stranded not in lookup: + keys = list(lookup.keys()) + raise KeyError(f"'{config.stranded}' not one of {keys}") + return lookup[config.stranded] + + +def filter_fastas(tmpfiles, outfile, pattern): + """ + Extract records from fasta file(s) given a search pattern. + + Given input gzipped FASTAs, create a new gzipped fasta containing only + records whose description matches `pattern`. 
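# Example call mirroring the rrna: postprocess entry in the test config; the
# input and output filenames are placeholders.
filter_fastas(
    ["SILVA_128_LSURef_tax_silva_trunc.fasta.gz"],
    "dmel_rRNA.fa.gz",
    "Drosophila melanogaster",
)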
+ + Parameters + ---------- + tmpfiles : list + gzipped fasta files to look through + + outfile : str + gzipped output fastq file + + pattern : str + Look for this string in each record's description + + """ + + def gen(): + for tmp in tmpfiles: + handle = gzip.open(tmp, "rt") + parser = SeqIO.parse(handle, "fasta") + for rec in parser: + if pattern not in rec.description: + continue + rec.seq = rec.seq.back_transcribe() + rec.description = rec.name + yield rec + + with gzip.open(outfile, "wt") as fout: + SeqIO.write(gen(), fout, "fasta") + + +def twobit_to_fasta(tmpfiles, outfile): + """ + Converts .2bit files to fasta. + + Parameters + ---------- + tmpfiles : list + 2bit files to convert + + outfile : str + gzipped output fastq file + """ + # Note that twoBitToFa doesn't support multiple input files, but we want to + # support them with this function + lookup = {i: i + ".fa" for i in tmpfiles} + for i in tmpfiles: + fn = lookup[i] + shell("twoBitToFa {i} {fn}") + + # Make sure we retain the order of the originally-provided files from the + # config when concatenating. + fastas = [lookup[i] for i in tmpfiles] + shell("cat {fastas} | gzip -c > {outfile}") + shell("rm {fastas}") + + +def download_and_postprocess(urls, postprocess, outfile, log): + """ + Many reference files cannot be used as-is and need to be modified. + + This function supports providing one or more URLs, and any postprocess + functions to get the reference files usable. + + Parameters + ---------- + urls : str or list + URL(s) to download. Can be a list, in which case they will be concatenated. + + postprocess : str | dict | list | None + Postprocessing config. See below for details. + + outfile : str + Output filename to save final output. Expected to be gzipped. + + log : str + Log filename that will accumulate all logs + + Notes + ----- + + This function: + + - downloads the URL[s] to tempfile[s] + - resolves the name of the postprocessing function(s) if provided and + imports it + - calls the imported postprocessing function using the tempfile[s] and + outfile plus any additional specified arguments. + + The postprocessing function must have one of the following signatures, + where `infiles` contains the list of temporary files downloaded from the + URL or URLs specified, and `outfile` is a gzipped file expected to be + created by the function:: + + def func(infiles, outfile): + pass + + or:: + + def func(infiles, outfile, *args): + pass + + or:: + + def func(infiles, outfile, *args, **kwargs): + pass + + + The function is specified as a string that resolves to an importable + function, e.g., `postprocess: lib.postprocess.dm6.fix` will call a function + called `fix` in the file `lib/postprocess/dm6.py`. + + If the contents of `postprocess:` is a dict, it must have at least the key + `function`, and optionally `args` and/or `kwargs` keys. The `function` key + indicates the importable path to the function. `args` can be a string + or list of arguments that will be provided as additional args to a function + with the second kind of signature above. If `kwargs` is provided, it is + a dict that is passed to the function with the third kind of signature + above. 
For example:: + + postprocess: + function: lib.postprocess.dm6.fix + args: + - True + - 3 + + or:: + + postprocess: + function: lib.postprocess.dm6.fix + args: + - True + - 3 + kwargs: + skip: exon + + """ + + def default_postprocess(origfn, newfn): + shell("mv {origfn} {newfn}") + + if not isinstance(postprocess, list): + postprocess = [postprocess] + + # Will contain tuples of (func, args, kwargs, tmp_outfile) + funcs = [] + + # It is possible to chain multiple postprocessing functions together by + # providing them as a list. + # + # postprocess = [ + # + # "lib.func1", + # + # { + # "function": "lib.func2", + # "args": (True, True), + # }, + # + # { + # "function": "lib.func3", + # "args": (1, 2), + # "kwargs": {"gzipped": True), + # }, + # + # ] + # + for i, postprocess_i in enumerate(postprocess): + + if postprocess_i is None: + func = default_postprocess + args = () + kwargs = {} + name = None + + # postprocess can have a single string value indicating the function or + # it can be a dict with keys "function" and optionally "args". The value of + # "args" can be a string or a list. + else: + if isinstance(postprocess_i, dict): + name = postprocess_i.get("function", postprocess) + args = postprocess_i.get("args", ()) + kwargs = postprocess_i.get("kwargs", {}) + if isinstance(args, str): + args = (args,) + elif isinstance(postprocess_i, str): + name = postprocess_i + args = () + kwargs = {} + + else: + raise ValueError( + f"Unhandled type of postprocessing configuration: {postprocess_i}" + ) + + # In the special case where there is kwarg beginning and ending + # with "__", this can be a dotted function name so it will be + # resolved here as well and passed along to the postprocessing + # function. + # + # This makes it possible to do things like add ERCC annotations on + # the end of other annotations that themselves need to be + # post-processed. + for kw in kwargs: + if kw.startswith("__") and kw.endswith("__"): + kwargs[kw] = resolve_name(kwargs[kw]) + + # import the function + func = resolve_name(name) + + tmp_outfile = f"{outfile}.{i}.{name}.tmp" + funcs.append([func, args, kwargs, tmp_outfile]) + + # The last func's outfile should be the final outfile + funcs[-1][-1] = outfile + + # as described in the docstring above, functions are to assume a list of + # urls + if isinstance(urls, str): + urls = [urls] + + # Download into reasonably-named temp filenames + downloaded_tmpfiles = [f"{outfile}.{i}.tmp" for i in range(len(urls))] + + # For the first postprocess, its input will be all the downloaded files. + postprocess_input = downloaded_tmpfiles + try: + # Copy (if local URI) or download into the specified temp files + for url, tmpfile in zip(urls, downloaded_tmpfiles): + if url.startswith("file:"): + url = url.replace("file://", "") + shell("cp {url} {tmpfile} 2> {log}") + else: + shell("wget {url} -O- > {tmpfile} 2> {log}") + + for func, args, kwargs, tmp_outfile in funcs: + func( + # all downloaded files (if the first postprocess), or the + # output of the last postprocess + postprocess_input, + # the temp output for just this postprocess + tmp_outfile, + *args, + **kwargs, + ) + + # We want the next postprocess to use the output of what we just + # ran; as documented above the input files are expected to be in + # a list. + postprocess_input = [tmp_outfile] + + except Exception as e: + raise e + finally: + to_delete = downloaded_tmpfiles + + # all but the last postprocess func output (the last one is the final + # output that we want to keep!) 
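# Minimal direct call of download_and_postprocess(); the URL and postprocess
# function are taken from the test config, while the output and log filenames
# are placeholders.
download_and_postprocess(
    urls="https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/annotation/dm6.small.gtf",
    postprocess="lib.utils.gzipped",
    outfile="dm6.small.gtf.gz",
    log="dm6.small.gtf.log",
)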
+ to_delete += [i[-1] for i in funcs[:-1]] + + for i in to_delete: + if os.path.exists(i): + shell("rm {i}") + if not _is_gzipped(outfile): + raise ValueError(f"{outfile} does not appear to be gzipped.") + + +def get_sampletable(config): + """ + Return samples and pandas.DataFrame of parsed sampletable. + + Returns the sample IDs and the parsed sampletable from the file specified + in the config. + + The sample IDs are assumed to be the first column of the sampletable. + + Parameters + ---------- + config : dict + """ + sampletable = pandas.read_csv(config["sampletable"], comment="#", sep="\t") + samples = sampletable.iloc[:, 0] + return samples, sampletable + + +def get_techreps(sampletable, label): + """ + Return all sample IDs for which the "label" column is `label`. + """ + # since we're not requiring a name but we want to use `loc` + first_col = sampletable.columns[0] + result = list(sampletable.loc[sampletable["label"] == label, first_col]) + + # If we're using a ChIP-seq-like sampletable we can provide a more + # informative error message. + + is_chipseq = "antibody" in sampletable.columns + if is_chipseq: + err = """ + No technical replicates found for label '{}'. Check the ChIP-seq config + file to ensure the peak-calling section only specifies values from the + sampletable's "label" column.""".format( + label + ) + else: + err = "No technical replicates found for label '{}'.".format(label) + + if len(result) == 0: + raise ValueError(err) + + return result + + +def deprecation_handler(config): + """ + Checks the config to see if anything has been deprecated. + + Also makes any fixes that can be done automatically. + """ + if "assembly" in config: + config["organism"] = config["assembly"] + warnings.warn( + "'assembly' should be replaced with 'organism' in config files. " + "As a temporary measure, a new 'organism' key has been added with " + "the value of 'assembly'", + DeprecationWarning, + ) + + for org, block1 in config.get("references", {}).items(): + for tag, block2 in block1.items(): + gtf_conversions = block2.get("gtf", {}).get("conversions", []) + for c in gtf_conversions: + if isinstance(c, dict) and "annotation_hub" in c: + warnings.warn( + "You may want to try the 'mappings' conversion rather " + "than 'annotation_hub' since it works directly off " + "the GTF file rather than assuming concordance between " + "GTF and AnnoationHub instances", + DeprecationWarning, + ) + + return config + + +def check_url(url, verbose=False): + """ + Try to open -- and then immediately close -- a URL. + + Any exceptions can be handled upstream. + + """ + + # Some notes here: + # + # - A pure python implementation isn't great because urlopen seems to + # cache or hold sessions open or something. EBI servers reject responses + # because too many clients are connected. This doesn't happen using curl. + # + # - Using the requests module doesn't help, because urls can be ftp:// and + # requests doesn't support that. + # + # - Similarly, using asyncio and aiohttp works great for https, but not + # ftp (I couldn't get aioftp to work properly). + # + # - Not all servers support --head. An example of this is + # https://www-s.nist.gov/srmors/certificates/documents/SRM2374_Sequence_v1.FASTA. + # + # - Piping curl to head using the -c arg to use bytes seems to work. + # However, we need to set pipefail (otherwise because head exits 0 the + # whole thing exits 0). 
And in that case, we expect curl to exit every + # time with exit code 23, which is "failed to write output", because of + # the broken pipe. This is handled below. + # + if verbose: + print(f"Checking {url}") + + # Notes on curl args: + # + # --max-time to allow the server some seconds to respond + # --retry to allow multiple tries if transient errors (4xx for FTP, 5xx for HTTP) are found + # --silent to not print anything + # --fail to return non-zero exit codes for 404 (default is exit 0 on hitting 404) + # + # Need to run through bash explicitly to get the pipefail option, which in + # turn means running with shell=True + proc = subprocess.run( + f'/bin/bash -o pipefail -c "curl --retry 3 --max-time 10 --silent --fail {url} | head -c 10 > /dev/null"', + shell=True, + ) + return proc + + +def check_urls(config, verbose=False): + """ + Given a config filename or existing object, extract the URLs and check + them. + + Parameters + ---------- + + config : str or dict + Config object to inspect + + verbose : bool + Print which URL is being checked + + wait : int + Number of seconds to wait in between checking URLs, to avoid + too-many-connection issues + """ + failures = [] + urls = list(set(flatten(pluck(config, "url")))) + for url in urls: + if url.startswith("file://"): + continue + + res = check_url(url, verbose=verbose) + + # we expect exit code 23 because we're triggering SIGPIPE with the + # "|head -c" above. + if res.returncode and res.returncode != 23: + failures.append( + f"FAIL with exit code {res.returncode}. Command was: {res.args}" + ) + if failures: + output = "\n ".join(failures) + raise ValueError( + f"Found problematic URLs. See https://ec.haxx.se/usingcurl/usingcurl-returns for explanation of exit codes.\n {output}" + ) + + +def check_all_urls_found(verbose=True): + """ + Recursively loads all references that can be included and checks them. + Reports out if there are any failures. + """ + check_urls( + { + "include_references": [ + "include/reference_configs", + "test/test_configs", + "workflows/rnaseq/config", + "workflows/chipseq/config", + "workflows/references/config", + ] + }, + verbose=verbose, + ) + + +def gff2gtf(gff, gtf): + """ + Converts a gff file to a gtf format using the gffread function from Cufflinks + """ + if _is_gzipped(gff[0]): + shell("gzip -d -S .gz.0.tmp {gff} -c | gffread - -T -o- | gzip -c > {gtf}") + else: + shell("gffread {gff} -T -o- | gzip -c > {gtf}") + + +def wrapper_for(path): + return 'file:' + os.path.join('../..','wrappers', 'wrappers', path) + +def detect_sra(sampletable): + return 'Run' in sampletable.columns and any(sampletable['Run'].str.startswith('SRR')) + +# vim: ft=python diff --git a/scripts/bed_to_bigbed.py b/scripts/bed_to_bigbed.py new file mode 100644 index 000000000..13ab54442 --- /dev/null +++ b/scripts/bed_to_bigbed.py @@ -0,0 +1,56 @@ +import sys +import os +import numpy as np +import pandas as pd +from snakemake.shell import shell + +sys.path.insert(0, os.path.dirname(__file__) + "/..") +from lib import chipseq + +# Based on the filename, identify the algorithm; +# Based on the contents, identify the format. +algorithm = os.path.basename(os.path.dirname(snakemake.input.bed)) +kind = chipseq.detect_peak_format(snakemake.input.bed) + +# bedToBigBed doesn't handle zero-size files +if os.stat(snakemake.input.bed).st_size == 0: + shell("touch {output}") + +# Note that autoSql filenames are relative to the workdir of the snakefile +# calling this script. 
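# Quick examples of the two small helpers defined above (wrapper_for and
# detect_sra); the sampletable contents and wrapper path are placeholders.
import pandas as pd

sra_table = pd.DataFrame({"samplename": ["s1"], "Run": ["SRR0000001"]})
assert detect_sra(sra_table)
bowtie2_wrapper = wrapper_for("bowtie2/align")  # 'file:../../wrappers/wrappers/bowtie2/align'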
+elif kind == 'narrowPeak': + _as = '../../include/autosql/bigNarrowPeak.as' + _type = 'bed6+4' + names=[ + 'chrom', 'chromStart', 'chromEnd', 'name', 'score', + 'strand', 'signalValue', 'pValue', 'qValue', 'peak'] +elif kind == 'broadPeak': + _as = '../../include/autosql/bigBroadPeak.as' + _type = 'bed6+3' + names=[ + 'chrom', 'chromStart', 'chromEnd', 'name', 'score', + 'strand', 'signalValue', 'pValue', 'qValue'] +elif kind == 'epic2Input': + _as = f'../../include/autosql/{kind}Peak.as' + _type = 'bed6+4' + names=[ + 'chrom', 'chromStart', 'chromEnd', 'pValue', 'score', + 'strand', 'ChIPCount', 'InputCount', 'FDR', 'log2FoldChange'] +elif kind == 'epic2NoInput': + _as = f'../../include/autosql/{kind}Peak.as' + _type = 'bed6' + names=[ + 'chrom', 'chromStart', 'chromEnd', 'ChIPCount', 'score', + 'strand'] +else: + raise ValueError("Unhandled format for {0}".format(input.bed)) + +df = pd.read_table(snakemake.input.bed, index_col=False, names=names) +df['score'] = df['score'] - df['score'].min() +df['score'] = (df['score'] / df['score'].max()) * 1000 +df['score'] = df['score'].replace([np.inf, -np.inf], np.nan).fillna(0) +df['score'] = df['score'].astype(int) +df.to_csv(snakemake.output[0] + '.tmp', sep='\t', index=False, header=False) + +shell('bedToBigBed -as={_as} -type={_type} {snakemake.output}.tmp {snakemake.input.chromsizes} {snakemake.output} &> {snakemake.log}') +shell('rm {snakemake.output}.tmp') diff --git a/wrappers/wrappers/epic2/wrapper.py b/scripts/epic2.py similarity index 77% rename from wrappers/wrappers/epic2/wrapper.py rename to scripts/epic2.py index ee66e7669..6ac30bdbd 100644 --- a/wrappers/wrappers/epic2/wrapper.py +++ b/scripts/epic2.py @@ -2,8 +2,6 @@ import glob from snakemake import shell -log = snakemake.log_fmt_shell() -logfile = None extra = snakemake.params.get('extra', '') outdir, basebed = os.path.split(snakemake.output.bed) @@ -11,21 +9,17 @@ extra = snakemake.params.block.get('extra', '') # `-c` has to be skipped if no control is provided -# if os.path.isfile(snakemake.input.control): if len(snakemake.input.control) > 0: arguments = '-c {snakemake.input.control} ' else: arguments = '' -# Add `--guess-bampe` if input dataset is paired-end -if snakemake.params.is_paired: - arguments += '--guess-bampe ' shell( 'epic2 ' + arguments + extra + '-t {snakemake.input.ip} ' - '--chromsizes {snakemake.input.chromsizes} | ' - 'sort -k1,1 -k2,2n > {label}.tmp.bed ' + '--chromsizes {snakemake.input.chromsizes} 2> {snakemake.log} | ' + 'sort -k1,1 -k2,2n > {label}.tmp.bed' ) # Fix the output file so that it doesn't have negative numbers and so it fits diff --git a/wrappers/wrappers/macs2/callpeak/wrapper.py b/scripts/macs2_callpeak.py similarity index 100% rename from wrappers/wrappers/macs2/callpeak/wrapper.py rename to scripts/macs2_callpeak.py diff --git a/wrappers/wrappers/combos/merge_and_dedup/wrapper.py b/scripts/merge_and_dedup.py similarity index 100% rename from wrappers/wrappers/combos/merge_and_dedup/wrapper.py rename to scripts/merge_and_dedup.py diff --git a/scripts/rrna_libsizes_table.py b/scripts/rrna_libsizes_table.py new file mode 100644 index 000000000..ea2b68209 --- /dev/null +++ b/scripts/rrna_libsizes_table.py @@ -0,0 +1,68 @@ +""" +Prepares a TSV and JSON file for multiqc to pick up and display as a sortable +table +""" +import os +import re +import pandas as pd +import yaml +from snakemake.io import regex_from_filepattern + + +def rrna_sample(f): + m = re.compile( + regex_from_filepattern( + snakemake.params.rrna_pattern, + ) + ).match(f) + if 
m: + return m.groupdict()["sample"] + + +def sample(f): + m = re.compile( + regex_from_filepattern( + snakemake.params.fastq_pattern, + ) + ).match(f) + if m: + return m.groupdict()["sample"] + + +def million(f): + return float(open(f).read()) / 1e6 + + +rrna = sorted(snakemake.input.rrna, key=rrna_sample) +fastq = sorted(snakemake.input.fastq, key=sample) +samples = list(map(rrna_sample, rrna)) +rrna_m = list(map(million, rrna)) +fastq_m = list(map(million, fastq)) + +df = pd.DataFrame( + dict( + sample=samples, + million_reads_rRNA=rrna_m, + million_reads_fastq=fastq_m, + ) +) +df = df.set_index("sample") +df["rRNA_percentage"] = df.million_reads_rRNA / df.million_reads_fastq * 100 + +df[["million_reads_fastq", "million_reads_rRNA", "rRNA_percentage"]].to_csv( + snakemake.output.tsv, sep="\t" +) +y = { + "id": "rrna_percentages_table", + "section_name": "rRNA content", + "description": "Amount of reads mapping to rRNA sequence", + "plot_type": "table", + "pconfig": { + "id": "rrna_percentages_table_table", + "title": "rRNA content table", + "min": 0, + }, + "data": yaml.load(df.transpose().to_json(), Loader=yaml.FullLoader), +} +with open(snakemake.output.json, "w") as fout: + yaml.dump(y, fout, default_flow_style=False) diff --git a/test/lcdb-wf-test b/test/lcdb-wf-test index df59b24c5..21f6978c7 100755 --- a/test/lcdb-wf-test +++ b/test/lcdb-wf-test @@ -142,9 +142,12 @@ class Runner(object): %(prog)s rnaseq --run-workflow --strandedness-pe %(prog)s rnaseq --run-workflow --strandedness-se %(prog)s rnaseq --run-workflow --star-2pass - %(prog)s rnaseq --run-workflow --star-1pass + %(prog)s rnaseq --run-workflow --hisat2 %(prog)s rnaseq --run-workflow --pe + # Since there are a lot of parameters here, see + # "workflow_test_params.yaml" for how they are managed. + """, formatter_class=argparse.RawDescriptionHelpFormatter ) @@ -328,7 +331,7 @@ class Runner(object): if args.url_check: print_header("url check") sys.path.insert(0, str(TOPLEVEL)) - from lib.common import check_all_urls_found + from lib.utils import check_all_urls_found check_all_urls_found() diff --git a/test/test_configs/hisat2.tsv b/test/test_configs/hisat2.tsv new file mode 100644 index 000000000..df6746cea --- /dev/null +++ b/test/test_configs/hisat2.tsv @@ -0,0 +1,3 @@ +samplename group layout orig_filename +sample1-hisat2 control SE data/example_data/rnaseq_sample1PE_1.fq.gz +sample2-hisat2 control SE data/example_data/rnaseq_sample2.fq.gz diff --git a/test/test_configs/override.yaml b/test/test_configs/override.yaml deleted file mode 100644 index bd05a9257..000000000 --- a/test/test_configs/override.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Due to the way Snakemake recursively merges config items, we need to -# recursively reset this dictonary to override the default one in order to -# allow arbitrary other sample names. 
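# Worked example of the rRNA percentage computed above, using made-up library
# sizes for a single sample.
million_reads_rRNA = 2_500_000 / 1e6    # 2.5
million_reads_fastq = 40_000_000 / 1e6  # 40.0
rRNA_percentage = million_reads_rRNA / million_reads_fastq * 100  # 6.25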
-# -# Use it like this -# -# snakemake --configfile ../../test/override.yaml --config sampletable=/path/to/tsv -# -merged_bigwigs: - control_pos: - pos: [] - treatment_all: - pos: [] - neg: [] diff --git a/test/test_configs/star_1pass.tsv b/test/test_configs/star_1pass.tsv deleted file mode 100644 index 3c73275ea..000000000 --- a/test/test_configs/star_1pass.tsv +++ /dev/null @@ -1,3 +0,0 @@ -samplename group layout orig_filename -sample1-star-1pass control SE data/example_data/rnaseq_sample1PE_1.fq.gz -sample2-star-1pass control SE data/example_data/rnaseq_sample2.fq.gz diff --git a/test/test_configs/star_override_1pass.yaml b/test/test_configs/star_override_1pass.yaml deleted file mode 100644 index cba6ff764..000000000 --- a/test/test_configs/star_override_1pass.yaml +++ /dev/null @@ -1,10 +0,0 @@ -aligner: - index: star - tag: test - -merged_bigwigs: - control_pos: - pos: [] - treatment_all: - pos: [] - neg: [] diff --git a/test/test_configs/star_override_2pass.yaml b/test/test_configs/star_override_2pass.yaml deleted file mode 100644 index b091eba3d..000000000 --- a/test/test_configs/star_override_2pass.yaml +++ /dev/null @@ -1,10 +0,0 @@ -aligner: - index: 'star-twopass' - tag: test - -merged_bigwigs: - control_pos: - pos: [] - treatment_all: - pos: [] - neg: [] diff --git a/test/test_configs/test_rnaseq_config.yaml b/test/test_configs/test_rnaseq_config.yaml index 6c674345d..2cbd3d66c 100644 --- a/test/test_configs/test_rnaseq_config.yaml +++ b/test/test_configs/test_rnaseq_config.yaml @@ -1,43 +1,27 @@ -sampletable: 'config/sampletable.tsv' - -patterns: 'config/rnaseq_patterns.yaml' - -# Which key in the `references` dict below to use -organism: 'dmel' - -# If not specified here, use the environment variable REFERENCES_DIR. -references_dir: 'references_data' - -aligner: - index: 'hisat2' - tag: 'test' +fasta: + url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/seq/dm6.small.fa" + postprocess: 'lib.utils.gzipped' -stranded: 'fr-firststrand' +gtf: + url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/annotation/dm6.small.gtf" + postprocess: 'lib.utils.gzipped' rrna: - index: 'bowtie2' - tag: 'rRNA' - -gtf: - tag: "test" + url: + - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_LSURef_tax_silva_trunc.fasta.gz' + - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_SSURef_Nr99_tax_silva_trunc.fasta.gz' + postprocess: + function: 'lib.utils.filter_fastas' + args: 'Drosophila melanogaster' -salmon: - tag: "test" -kallisto: - tag: "test" +sampletable: 'config/sampletable.tsv' -fastq_screen: - - label: rRNA - organism: dmel - tag: test - - label: Fly - organism: dmel - tag: test +patterns: 'config/rnaseq_patterns.yaml' -# See the reference config files in the top level of the repo, -# include/reference_configs, for inspiration for more species. +# See https://rnabio.org/module-09-appendix/0009/12/01/StrandSettings for more info. +stranded: 'fr-firststrand' # for dUTP libraries +# 'fr-secondstrand' # for ligation libraries +# 'unstranded' # for libraries without strand specificity -include_references: - - '../../include/reference_configs/test.yaml' - - '../../include/reference_configs/Drosophila_melanogaster.yaml' +aligner: 'star' diff --git a/test/workflow_test_params.yaml b/test/workflow_test_params.yaml index 70e57da66..5d74fac98 100644 --- a/test/workflow_test_params.yaml +++ b/test/workflow_test_params.yaml @@ -45,19 +45,17 @@ rnaseq: desc: Tests running STAR in 2-pass mode. 
Only runs until the star_pass2 rule. args: | --until star_pass2 - --configfile - __ORIG__/test/test_configs/test_rnaseq_config.yaml - __ORIG__/test/test_configs/star_override_2pass.yaml + --configfile __ORIG__/test/test_configs/test_rnaseq_config.yaml --config sampletable=__ORIG__/test/test_configs/star_2pass.tsv + --config aligner="star-twopass" - star-1pass: - desc: Tests running STAR in 1-pass (default) mode. Only runs until the star rule. + hisat2: + desc: Tests running HISAT2 args: | - --until star - --configfile - __ORIG__/test/test_configs/test_rnaseq_config.yaml - __ORIG__/test/test_configs/star_override_1pass.yaml - --config sampletable=__ORIG__/test/test_configs/star_1pass.tsv + --until hisat2 + --configfile __ORIG__/test/test_configs/test_rnaseq_config.yaml + --config sampletable=__ORIG__/test/test_configs/hisat2.tsv + --config aligner=hisat2 pe: desc: Tests paired-end data diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index f278b8968..4dabbb52f 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -1,179 +1,140 @@ import sys -sys.path.insert(0, srcdir('../..')) import os -from textwrap import dedent import yaml -import tempfile import pandas as pd -import numpy as np -import pybedtools -from lib import common, utils, helpers, aligners, chipseq -from lib.patterns_targets import ChIPSeqConfig -from lib.utils import autobump, gb, hours -# ---------------------------------------------------------------------------- -# -# Search for the string "NOTE:" to look for points of configuration that might -# be helpful for your experiment. -# -# ---------------------------------------------------------------------------- +sys.path.insert(0, os.path.dirname(workflow.snakefile) + "/../..") +from lib import utils +from lib import chipseq -if not workflow.overwrite_configfiles: - configfile: 'config/config.yaml' -config = common.load_config(config) +configfile: "config/config.yaml" -include: '../references/Snakefile' -# Verify configuration of config and sampletable files -helpers.preflight(config) +include: "../references/Snakefile" -c = ChIPSeqConfig( - config, - config.get('patterns', 'config/chipseq_patterns.yaml') -) -SAMPLES = c.sampletable.iloc[:, 0].values +REFERENCES = config.get("reference_dir", "../../references") +sampletable = pd.read_table(config["sampletable"], sep="\t", comment="#") +sampletable = sampletable.set_index(sampletable.columns[0], drop=False) +is_paired = utils.detect_layout(sampletable) == "PE" +n = ["1", "2"] if is_paired else ["1"] +SAMPLES = sampletable.iloc[:, 0].values +LABELS = sampletable.label.values +peaks = chipseq.add_bams_to_peak_calling(config) -wildcard_constraints: - n = '[1,2]', - sample = '|'.join(SAMPLES) - - - -def wrapper_for(path): - return 'file:' + os.path.join('../..','wrappers', 'wrappers', path) - - -# ---------------------------------------------------------------------------- -# RULES -# ---------------------------------------------------------------------------- - -# See "patterns and targets" in the documentation for what's going on here. 
-final_targets = utils.flatten(( - c.targets['bam'], - utils.flatten(c.targets['fastqc']), - [c.targets['fastq_screen']], - [c.targets['multiqc']], - utils.flatten(c.targets['markduplicates']), - utils.flatten(c.targets['bigwig']), - utils.flatten(c.targets['peaks']), - utils.flatten(c.targets['merged_techreps']), - utils.flatten(c.targets['fingerprint']), - utils.flatten(c.targets['bigbed']), - utils.flatten(c.targets['multibigwigsummary']), - utils.flatten(c.targets['plotcorrelation']), -)) - -if config.get('merged_bigwigs', None): - final_targets.extend(utils.flatten(c.targets['merged_bigwig'])) +wildcard_constraints: + n="[1,2]", + sample="|".join(SAMPLES), -def render_r1_r2(pattern, r1_only=False): - return expand(pattern, sample='{sample}', n=c.n) +localrules: + symlinks, + symlink_targets, -def r1_only(pattern): - return expand(pattern, sample='{sample}', n=1) rule targets: - """ - Final targets to create - """ - input: final_targets - - -if 'orig_filename' in c.sampletable.columns: - - localrules: symlinks + input: + "data/chipseq_aggregation/multiqc.html", + expand("data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig", label=LABELS), + [v["bed"] for k, v in peaks.items()], - # Convert the sampletable to be indexed by the first column, for - # convenience in generating the input/output filenames. - _st = c.sampletable.set_index(c.sampletable.columns[0]) - def orig_for_sample(wc): - """ - Given a sample, returns either one or two original fastq files - depending on whether the library was single- or paired-end. - """ - if c.is_paired: - return _st.loc[wc.sample, ['orig_filename', 'orig_filename_R2']] - return _st.loc[wc.sample, ['orig_filename']] +if utils.detect_sra(sampletable): + sampletable['orig_filename'] = expand( + 'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=1) + if is_paired: + sampletable['orig_filename_R2'] = expand( + 'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=2) - rule symlinks: - """ - Symlinks files over from original filename - """ - input: - orig_for_sample + rule fastq_dump: output: - render_r1_r2(c.patterns['fastq']) - threads: 1 + fastq=expand('original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', n=n, allow_missing=True) + log: + 'original_data/sra_samples/{sample}/{sample}.fastq.gz.log' + params: + is_paired=is_paired, + # extra="-X 100000", # [enable for test] resources: - mem_mb=gb(1), - runtime=10, + mem="1g", + disk="1g", + runtime="2h", run: - assert len(output) == len(input), (input, output) - for src, linkname in zip(input, output): - utils.make_relative_symlink(src, linkname) - - - rule symlink_targets: - input: c.targets['fastq'] - + srr = sampletable.loc[wildcards.sample, "Run"] + extra = params.get("extra", "") + if is_paired: + shell("fastq-dump {srr} --gzip --split-files {extra} &> {log}") + shell("mv {srr}_1.fastq.gz {output[0]}") + shell("mv {srr}_2.fastq.gz {output[1]}") + else: + shell("fastq-dump {srr} -Z {extra} 2> {log} | gzip -c > {output[0]}.tmp") + shell("mv {output[0]}.tmp {output[0]}") + + +rule symlinks: + input: + lambda wc: ( + sampletable.loc[wc.sample, ["orig_filename", "orig_filename_R2"]] + if is_paired + else sampletable.loc[wc.sample, ["orig_filename"]] + ), + output: + expand("data/chipseq_samples/{sample}/{sample}_R{n}.fastq.gz", n=n, + allow_missing=True), + threads: 1 + resources: + mem="1g", + runtime="10m", + run: + assert len(output) == len(input), (input, output) + for src, linkname in zip(input, output): + 
utils.make_relative_symlink(src, linkname) -if 'Run' in c.sampletable.columns and sum(c.sampletable['Run'].str.startswith('SRR')) > 0: - # Convert the sampletable to be indexed by the first column, for - # convenience in generating the input/output filenames. - _st = c.sampletable.set_index(c.sampletable.columns[0]) +rule symlink_targets: + input: + expand( + "data/chipseq_samples/{sample}/{sample}_R{n}.fastq.gz", sample=SAMPLES, n=n + ), - rule fastq_dump: - output: - fastq=render_r1_r2(c.patterns['fastq']) - log: - r1_only(c.patterns['fastq'])[0] + '.log' - params: - is_paired=c.is_paired, - sampletable=_st, - # limit = 100000, # [TEST SETTINGS] - resources: - mem_mb=autobump(gb=8), - runtime=autobump(hours=2) - conda: - '../../wrappers/wrappers/fastq-dump/environment.yaml' - script: - wrapper_for('fastq-dump/wrapper.py') rule cutadapt: - """ - Run cutadapt - """ input: - fastq=render_r1_r2(c.patterns['fastq']) + fastq=expand( + "data/chipseq_samples/{sample}/{sample}_R{n}.fastq.gz", + n=n, allow_missing=True), output: - fastq=render_r1_r2(c.patterns['cutadapt']) - resources: - mem_mb=gb(2), - runtime=autobump(hours=2) + fastq=expand( + "data/chipseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz", + n=n, allow_missing=True), log: - render_r1_r2(c.patterns['cutadapt'])[0] + '.log' + "data/chipseq_samples/{sample}/{sample}_cutadapt.fastq.gz.log", threads: 6 + resources: + mem="2g", + runtime="2h", + params: + extra=( + ( + "--nextseq-trim 20 " + "--overlap 6 " + "--minimum-length 25 " + "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " + ) + + "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " + if is_paired + else "" + ), run: - - # NOTE: Change cutadapt params here - if c.is_paired: + if is_paired: shell( "cutadapt " "-o {output[0]} " "-p {output[1]} " - "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " - "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " - '--nextseq-trim 20 ' - "--overlap 6 " - '-j {threads} ' - '--minimum-length 25 ' + "-j {threads} " + "{params.extra} " "{input.fastq[0]} " "{input.fastq[1]} " "&> {log}" @@ -182,67 +143,86 @@ rule cutadapt: shell( "cutadapt " "-o {output[0]} " - "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " - '--nextseq-trim 20 ' - "--overlap 6 " - '-j {threads} ' - '--minimum-length 25 ' + "-j {threads} " + "{params.extra} " "{input.fastq[0]} " "&> {log}" ) rule fastqc: - """ - Run FastQC - """ input: - '{sample_dir}/{sample}/{sample}{suffix}' - threads: - 6 + "{sample_dir}/{sample}/{sample}{suffix}", + threads: 1 output: - html='{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.html', - zip='{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.zip', + html="{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.html", + zip="{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.zip", resources: - mem_mb=gb(2), - runtime=autobump(hours=2) - script: - wrapper_for('fastqc/wrapper.py') + mem="8g", + runtime="2h", + log: + "{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.log", + run: + outdir = os.path.dirname(output.html) or "." 
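+        # FastQC names its outputs after the input file (stripping .fastq/.fq/.gz/.bam),
+        # so run it into outdir and rename to the declared outputs below if the paths differ.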
+ shell( + "fastqc " + "--noextract " + "--quiet " + "--outdir {outdir} " + "{input} " + "&> {log} " + ) + outfile = os.path.basename(input[0]) + for s in [".fastq", ".fq", ".gz", ".bam"]: + outfile = outfile.replace(s, "") + out_zip = os.path.join(outdir, outfile + "_fastqc.zip") + if not os.path.abspath(out_zip) == os.path.abspath(output.zip): + shell("mv {out_zip} {output.zip}") + out_html = os.path.join(outdir, outfile + "_fastqc.html") + if not os.path.abspath(out_html) == os.path.abspath(output.html): + shell("mv {out_html} {output.html}") rule bowtie2: - """ - Map reads with Bowtie2 - """ input: - fastq=common.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), - index=[c.refdict[c.organism][config['aligner']['tag']]['bowtie2']] + fastq=expand( + "data/chipseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz", n=n, allow_missing=True), + index=multiext( + f"{REFERENCES}/bowtie2/genome", + ".1.bt2", + ".2.bt2", + ".3.bt2", + ".4.bt2", + ".rev.1.bt2", + ".rev.2.bt2", + ".fa", + ), output: - bam=c.patterns['bam'] + bam=temporary("data/chipseq_samples/{sample}/{sample}.cutadapt.bam"), log: - c.patterns['bam'] + '.log' + "data/chipseq_samples/{sample}/{sample}.cutadapt.bam.log", threads: 16 resources: - mem_mb=gb(32), - runtime=autobump(hours=2) + mem="32g", + runtime="2h", + params: + extra="", run: - prefix = aligners.prefix_from_bowtie2_index(input.index) - sam = output.bam.replace('.bam', '.sam') - - if c.is_paired: - assert len(input.fastq) == 2 - fastqs = '-1 {0} -2 {1} '.format(*input.fastq) - else: - assert len(input.fastq) == 1 - fastqs = '-U {0} '.format(input.fastq) - + prefix = os.path.commonprefix(input.index).rstrip(".") + sam = output.bam.replace(".bam", ".sam") + fastqs = ( + f"-1 {input.fastq[0]} -2 {input.fastq[1]}" + if is_paired + else f"-U {input.fastq}" + ) shell( "bowtie2 " "-x {prefix} " "{fastqs} " - '--no-unal ' # NOTE: suppress unaligned reads + "--no-unal " "--threads {threads} " "-S {sam} " + "{params.extra} " "> {log} 2>&1" ) @@ -254,271 +234,164 @@ rule bowtie2: rule unique: - """ - Remove multimappers - """ input: - c.patterns['bam'] + "data/chipseq_samples/{sample}/{sample}.cutadapt.bam", output: - c.patterns['unique'] + "data/chipseq_samples/{sample}/{sample}.cutadapt.unique.bam", threads: 1 resources: - mem_mb=gb(1), - runtime=autobump(hours=2) - shell: + mem="1g", + runtime="2h", + params: # NOTE: the quality score chosen here should reflect the scores output # by the aligner used. For example, STAR uses 255 as max mapping # quality. 
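+        # Here the BAMs come from bowtie2, whose MAPQ values max out at 42,
+        # so -q 20 drops low-confidence / multi-mapping alignments.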
- 'samtools view -b -q 20 {input} > {output}' + extra="-q 20", + shell: + "samtools view -b {params.extra} {input} > {output}" rule fastq_count: - """ - Count reads in a FASTQ file - """ input: - fastq='{sample_dir}/{sample}/{sample}{suffix}.fastq.gz' + fastq="{sample_dir}/{sample}/{sample}{suffix}.fastq.gz", output: - '{sample_dir}/{sample}/{sample}{suffix}.fastq.gz.libsize' + "{sample_dir}/{sample}/{sample}{suffix}.fastq.gz.libsize", threads: 1 resources: - mem_mb=gb(1), - runtime=autobump(hours=2) + mem="1g", + runtime="2h", shell: - 'zcat {input} | echo $((`wc -l`/4)) > {output}' + "zcat {input} | echo $((`wc -l`/4)) > {output}" rule bam_count: - """ - Count reads in a BAM file - """ input: - bam='{sample_dir}/{sample}/{suffix}.bam' + bam="{sample_dir}/{sample}/{suffix}.bam", output: - '{sample_dir}/{sample}/{suffix}.bam.libsize' + "{sample_dir}/{sample}/{suffix}.bam.libsize", threads: 1 resources: - mem_mb=gb(2), - runtime=autobump(hours=2) + mem="2g", + runtime="2h", shell: - 'samtools view -c {input} > {output}' + "samtools view -c {input} > {output}" rule bam_index: - """ - Index a BAM - """ input: - bam='{prefix}.bam' + bam="{prefix}.bam", output: - bai='{prefix}.bam.bai' + bai="{prefix}.bam.bai", threads: 1 resources: - mem_mb=gb(2), - runtime=autobump(hours=2) + mem="2g", + runtime="2h", shell: - 'samtools index {input} {output}' - - -def fastq_screen_references(): - """ - Returns the Bowtie2 indexes for the configured references from the - `fastq_screen:` section of the config - """ - refs = {} - for i in config['fastq_screen']: - refs[i['label']] = c.refdict[i['organism']][i['tag']]['bowtie2'] - return refs - - -rule fastq_screen: - """ - Run fastq_screen to look for contamination from other genomes - """ - input: - **fastq_screen_references(), - fastq=r1_only(rules.cutadapt.output.fastq), - output: - txt=c.patterns['fastq_screen'] - log: - c.patterns['fastq_screen'] + '.log' - threads: 6 - resources: - mem_mb=autobump(gb=4), - runtime=autobump(hours=2) - params: subset=100000 - script: - wrapper_for('fastq_screen/wrapper.py') - - -multiqc_inputs = [ - utils.flatten(c.targets['fastqc']) + - utils.flatten(c.targets['cutadapt']) + - utils.flatten(c.targets['bam']) + - utils.flatten(c.targets['markduplicates']) + - utils.flatten(c.targets['fingerprint']) + - utils.flatten(c.targets['peaks']) + - utils.flatten(c.targets['fastq_screen']) + - utils.flatten(c.targets['plotcorrelation']) -] - -if c.is_paired: - multiqc_inputs.extend(utils.flatten(c.targets['collectinsertsizemetrics']['metrics'])) - -rule multiqc: - """ - Aggregate various QC stats and logs into a single HTML report with MultiQC - """ - # NOTE: if you add more rules and want MultiQC to pick up the output, best - # to add outputs from those rules to the inputs here. 
- input: - files=multiqc_inputs, - config='config/multiqc_config.yaml' - output: - c.targets['multiqc'] - log: - c.targets['multiqc'][0] + '.log' - threads: 1 - resources: - mem_mb=gb(2), - runtime=autobump(hours=2) - run: - analysis_directory = set([os.path.dirname(i) for i in input]) - outdir = os.path.dirname(c.targets['multiqc'][0]) - basename = os.path.basename(c.targets['multiqc'][0]) - shell( - 'LC_ALL=en_US.utf8 LC_LANG=en_US.utf8 ' - 'multiqc ' - '--quiet ' - '--outdir {outdir} ' - '--force ' - '--filename {basename} ' - '--config {input.config} ' - '{analysis_directory} ' - '&> {log} ' - ) + "samtools index {input} {output}" rule markduplicates: - """ - Mark or remove PCR duplicates with Picard MarkDuplicates - """ input: - bam=c.patterns['unique'] + bam="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.bam", output: - bam=c.patterns['markduplicates']['bam'], - metrics=c.patterns['markduplicates']['metrics'] + bam="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam", + metrics="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.metrics" log: - c.patterns['markduplicates']['bam'] + '.log' + "data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.log" threads: 1 resources: - mem_mb=gb(32), - runtime=autobump(hours=2), - disk_mb=gb(100) + mem="32g", + disk="100g", + runtime="2h", params: - # NOTE: Be careful with the memory here; make sure you have enough - # and/or it matches the resources you're requesting in the cluster - # config. - java_args='-Xmx20g' - # java_args='-Xmx2g' # [TEST SETTINGS -1] + java_args="-Xmx20g", # [disable for test] + # java_args='-Xmx2g' # [enable for test] shell: - 'picard ' - '{params.java_args} ' - 'MarkDuplicates ' - 'INPUT={input.bam} ' - 'OUTPUT={output.bam} ' - 'REMOVE_DUPLICATES=true ' - 'METRICS_FILE={output.metrics} ' - 'VALIDATION_STRINGENCY=LENIENT ' - '&> {log}' + "picard " + "{params.java_args} " + "MarkDuplicates " + "INPUT={input.bam} " + "OUTPUT={output.bam} " + "REMOVE_DUPLICATES=true " + "METRICS_FILE={output.metrics} " + "VALIDATION_STRINGENCY=LENIENT " + "&> {log}" rule merge_techreps: - """ - Technical replicates are merged and then re-deduped. - - If there's only one technical replicate, its unique, nodups bam is simply - symlinked. - """ input: lambda wc: expand( - c.patterns['markduplicates']['bam'], - sample=common.get_techreps(c.sampletable, wc.label), - ) + "data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam", + sample=utils.get_techreps(sampletable, wc.label), + ), output: - bam=c.patterns['merged_techreps'], - metrics=c.patterns['merged_techreps'] + '.metrics' + bam="data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", + metrics="data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.metrics", log: - c.patterns['merged_techreps'] + '.log' + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.log" threads: 1 resources: - mem_mb=gb(32), - runtime=autobump(hours=2), - disk_mb=gb(100), + mem="32g", + disk="100g", + runtime="2h", params: - # NOTE: Be careful with the memory here; make sure you have enough - # and/or it matches the resources you're requesting in the cluster - # config. 
- java_args='-Xmx32g' - # java_args='-Xmx2g' # [TEST SETTINGS -1] + java_args="-Xmx32g", # [disable for test] + # java_args='-Xmx2g' # [enable for test] script: - wrapper_for('combos/merge_and_dedup/wrapper.py') + "../../scripts/merge_and_dedup.py" + + +if is_paired: -if c.is_paired: rule collectinsertsizemetrics: input: - bam=c.patterns['markduplicates']['bam'], + bam="data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", output: - pdf=c.patterns['collectinsertsizemetrics']['pdf'], - metrics=c.patterns['collectinsertsizemetrics']['metrics'] + pdf="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.pdf", + metrics="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.metrics", log: - c.patterns['collectinsertsizemetrics']['metrics'] + '.log' + "data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.metrics.log" threads: 1 resources: - mem_mb=gb(32), - runtime=autobump(hours=2) + mem="32g", + runtime="2h", params: - java_args='-Xmx20g' - # java_args='-Xmx2g' # [TEST SETTINGS -1] + java_args="-Xmx20g", # [disable for test] + # java_args='-Xmx2g' # [enable for test] shell: - 'picard ' - '{params.java_args} ' - 'CollectInsertSizeMetrics ' - 'I={input.bam} ' - 'O={output.metrics} ' - 'H={output.pdf} ' - '&> {log} ' + "picard " + "{params.java_args} " + "CollectInsertSizeMetrics " + "I={input.bam} " + "O={output.metrics} " + "H={output.pdf} " + "&> {log} " -rule bigwig: - """ - Create a bigwig. - See note below about normalizing! - """ +rule bigwig: input: - bam=c.patterns['merged_techreps'], - bai=c.patterns['merged_techreps'] + '.bai', + bam="data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", + bai="data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.bai", output: - c.patterns['bigwig'] + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig", log: - c.patterns['bigwig'] + '.log' + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig.log", threads: 1 resources: - mem_mb=gb(16), - runtime=autobump(hours=2) + mem="16g", + runtime="2h", shell: - 'bamCoverage ' - '--bam {input.bam} ' - '-o {output} ' - '-p {threads} ' - '--minMappingQuality 20 ' - '--ignoreDuplicates ' + "bamCoverage " + "--bam {input.bam} " + "-o {output} " + "-p {threads} " + "--minMappingQuality 20 " + "--ignoreDuplicates " # Can't use the CPM normalization for testing due to <1000 reads total - # in example data; keep uncommented when running in production - # [TEST SETTINGS +1] - '--normalizeUsing CPM ' - '--extendReads 300 ' - '&> {log}' + # in example data + "--normalizeUsing CPM " # [disable for test] + "--extendReads 300 " + "&> {log}" rule fingerprint: @@ -529,175 +402,110 @@ rule fingerprint: Note: uses the merged techreps. 
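+    If a matched input/control exists, it is passed to plotFingerprint's
+    --JSDsample option so the Jensen-Shannon distance is computed against it.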
""" input: - bams=lambda wc: expand(c.patterns['merged_techreps'], label=wc.ip_label), - control=lambda wc: expand(c.patterns['merged_techreps'], label=chipseq.merged_input_for_ip(c.sampletable, wc.ip_label)), - bais=lambda wc: expand(c.patterns['merged_techreps'] + '.bai', label=wc.ip_label), - control_bais=lambda wc: expand(c.patterns['merged_techreps'] + '.bai', label=chipseq.merged_input_for_ip(c.sampletable, wc.ip_label)), + bams=lambda wc: expand("data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", label=wc.ip_label), + control=lambda wc: expand( + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", + label=chipseq.merged_input_for_ip(sampletable, wc.ip_label), + ), + bais=lambda wc: expand( + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.bai", + label=wc.ip_label), + control_bais=lambda wc: expand( + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.bai", + label=chipseq.merged_input_for_ip(sampletable, wc.ip_label), + ), output: - plot=c.patterns['fingerprint']['plot'], - raw_counts=c.patterns['fingerprint']['raw_counts'], - metrics=c.patterns['fingerprint']['metrics'] + plot="data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.png", + raw_counts="data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.tab", + metrics="data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.metrics", threads: 8 - log: c.patterns['fingerprint']['metrics'] + '.log' + log: + "data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.metrics.log", threads: 1 resources: - mem_mb=gb(32), - runtime=autobump(hours=2) + mem="32g", + runtime="2h", run: if len(input.control) == 0: jsdsample_arg = "" else: - jsdsample_arg = '--JSDsample ' + str(input.control) + jsdsample_arg = "--JSDsample " + str(input.control) shell( - 'plotFingerprint ' '--bamfiles {input.bams} ' - '-p {threads} ' + "plotFingerprint " + "--bamfiles {input.bams} " + "-p {threads} " # The JSDsample argument is disabled for testing as it dramatically # increases the run time. 
- # [TEST SETTINGS +1] - '{jsdsample_arg} ' - '--smartLabels ' - '--extendReads=300 ' - '--skipZeros ' - '--outQualityMetrics {output.metrics} ' - '--outRawCounts {output.raw_counts} ' - '--plotFile {output.plot} ' + "{jsdsample_arg} " # [disable for test] + "--smartLabels " + "--extendReads=300 " + "--skipZeros " + "--outQualityMetrics {output.metrics} " + "--outRawCounts {output.raw_counts} " + "--plotFile {output.plot} " # Default is 500k; use fewer to speed up testing: - # '--numberOfSamples 50 ' # [TEST SETTINGS ] - '&> {log} ' + # '--numberOfSamples 50 ' # [enable for test] + "&> {log} " '&& sed -i "s/NA/0.0/g" {output.metrics} ' ) -rule sicer: - """ - Run the SICER peak caller - """ - input: - ip=lambda wc: - expand( - c.patterns['merged_techreps'], - label=chipseq.samples_for_run(config, wc.sicer_run, 'sicer', 'ip'), - ), - control=lambda wc: - expand( - c.patterns['merged_techreps'], - label=chipseq.samples_for_run(config, wc.sicer_run, 'sicer', 'control'), - ), - chromsizes=refdict[c.organism][config['aligner']['tag']]['chromsizes'], - output: - bed=c.patterns['peaks']['sicer'] - log: - c.patterns['peaks']['sicer'] + '.log' - resources: - mem_mb=gb(16), - runtime=autobump(hours=2) - params: - block=lambda wc: chipseq.block_for_run(config, wc.sicer_run, 'sicer') - wrapper: - wrapper_for('sicer') rule macs2: - """ - Run the macs2 peak caller - """ input: - ip=lambda wc: - expand( - c.patterns['merged_techreps'], - label=chipseq.samples_for_run(config, wc.macs2_run, 'macs2', 'ip'), - ), - control=lambda wc: - expand( - c.patterns['merged_techreps'], - label=chipseq.samples_for_run(config, wc.macs2_run, 'macs2', 'control'), - ), - chromsizes=refdict[c.organism][config['aligner']['tag']]['chromsizes'], + ip=lambda wc: expand( + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", + label=chipseq.samples_for_run(config, wc.macs2_run, "macs2", "ip"), + ), + control=lambda wc: expand( + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", + label=chipseq.samples_for_run(config, wc.macs2_run, "macs2", "control"), + ), + chromsizes=rules.chromsizes.output, output: - bed=c.patterns['peaks']['macs2'] + bed="data/chipseq_peaks/macs2/{macs2_run}/peaks.bed", resources: - mem_mb=gb(16), - runtime=autobump(hours=2) + mem="16g", + runtime="2h", log: - c.patterns['peaks']['macs2'] + '.log' + "data/chipseq_peaks/macs2/{macs2_run}/peaks.bed.log", params: - block=lambda wc: chipseq.block_for_run(config, wc.macs2_run, 'macs2') - wrapper: - wrapper_for('macs2/callpeak') + block=lambda wc: chipseq.block_for_run(config, wc.macs2_run, "macs2"), + script: + "../../scripts/macs2_callpeak.py" + -# Epic2 peak caller -# See https://github.com/biocore-ntnu/epic2 rule epic2: - """ - Run the epic2 peak caller - """ input: - ip=lambda wc: - expand( - c.patterns['merged_techreps'], - label=chipseq.samples_for_run(config, wc.epic2_run, 'epic2', 'ip'), - ), - control=lambda wc: - expand( - c.patterns['merged_techreps'], - label=chipseq.samples_for_run(config, wc.epic2_run, 'epic2', 'control'), - ), - bai=lambda wc: # epic2 requires both .bam and .bam.bai (bam index) files (.bam.bai is not explicitly) - expand( - c.patterns['merged_techreps'] + '.bai', - label=chipseq.samples_for_run(config, wc.epic2_run, 'epic2', 'ip'), - ), - chromsizes=refdict[c.organism][config['aligner']['tag']]['chromsizes'] + ip=lambda wc: expand( + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", + label=chipseq.samples_for_run(config, wc.epic2_run, "epic2", "ip"), + ), + 
control=lambda wc: expand( + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", + label=chipseq.samples_for_run(config, wc.epic2_run, "epic2", "control"), + ), + bai=lambda wc: expand( + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.bai", + label=chipseq.samples_for_run(config, wc.epic2_run, "epic2", "ip"), + ) + + expand( + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.bai", + label=chipseq.samples_for_run(config, wc.epic2_run, "epic2", "control"), + ), + chromsizes=rules.chromsizes.output, output: - bed=c.patterns['peaks']['epic2'] + bed="data/chipseq_peaks/epic2/{epic2_run}/peaks.bed", resources: - mem_mb=gb(16), - runtime=autobump(hours=2) + mem="16g", + runtime="2h", log: - c.patterns['peaks']['epic2'] + '.log' + "data/chipseq_peaks/epic2/{epic2_run}/peaks.bed.log" params: - block=lambda wc: chipseq.block_for_run(config, wc.epic2_run, 'epic2'), - is_paired=c.is_paired - wrapper: - wrapper_for('epic2') - - -rule spp: - """ - Run the SPP peak caller - """ - input: - ip=lambda wc: - expand( - c.patterns['merged_techreps'], - label=chipseq.samples_for_run(config, wc.spp_run, 'spp', 'ip'), - ), - control=lambda wc: - expand( - c.patterns['merged_techreps'], - label=chipseq.samples_for_run(config, wc.spp_run, 'spp', 'control'), - ), - chromsizes=refdict[c.organism][config['aligner']['tag']]['chromsizes'], - output: - bed=c.patterns['peaks']['spp'], - enrichment_estimates=c.patterns['peaks']['spp'] + '.est.wig', - smoothed_enrichment_mle=c.patterns['peaks']['spp'] + '.mle.wig', - rdata=c.patterns['peaks']['spp'] + '.RData' - log: - c.patterns['peaks']['spp'] + '.log' - resources: - mem_mb=gb(16), - runtime=autobump(hours=2) - params: - block=lambda wc: chipseq.block_for_run(config, wc.spp_run, 'spp'), - keep_tempfiles=False, - # NOTE: Be careful with the memory here; make sure you have enough - # and/or it matches the resources you're requesting in the cluster - # config. - java_args='-Xmx24g', - # java_args='-Xmx2g', # [TEST SETTINGS -1] - threads: 2 - wrapper: - wrapper_for('spp') + block=lambda wc: chipseq.block_for_run(config, wc.epic2_run, "epic2"), + is_paired=is_paired, + script: + "../../scripts/epic2.py" rule bed_to_bigbed: @@ -705,59 +513,17 @@ rule bed_to_bigbed: Convert BED to bigBed """ input: - bed='{prefix}.bed', - chromsizes=refdict[c.organism][config['aligner']['tag']]['chromsizes'] - output: '{prefix}.bigbed' + bed="{prefix}.bed", + chromsizes=rules.chromsizes.output, + output: + "{prefix}.bigbed", resources: - mem_mb=gb(2), - runtime=autobump(hours=2) - log: '{prefix}.bigbed.log' - run: - # Based on the filename, identify the algorithm. Based on the contents, - # identify the format. 
- algorithm = os.path.basename(os.path.dirname(input.bed)) - kind = chipseq.detect_peak_format(input.bed) - - # bedToBigBed doesn't handle zero-size files - # bigbed is not created from epic2-generated peaks - if os.stat(input.bed).st_size == 0: - shell("touch {output}") - elif kind == 'narrowPeak': - _as = '../../include/autosql/bigNarrowPeak.as' - _type = 'bed6+4' - names=[ - 'chrom', 'chromStart', 'chromEnd', 'name', 'score', - 'strand', 'signalValue', 'pValue', 'qValue', 'peak'] - elif kind == 'broadPeak': - _as = '../../include/autosql/bigBroadPeak.as' - _type = 'bed6+3' - names=[ - 'chrom', 'chromStart', 'chromEnd', 'name', 'score', - 'strand', 'signalValue', 'pValue', 'qValue'] - elif kind == 'epic2Input': - _as = f'../../include/autosql/{kind}Peak.as' - _type = 'bed6+4' - names=[ - 'chrom', 'chromStart', 'chromEnd', 'pValue', 'score', - 'strand', 'ChIPCount', 'InputCount', 'FDR', 'log2FoldChange'] - elif kind == 'epic2NoInput': - _as = f'../../include/autosql/{kind}Peak.as' - _type = 'bed6' - names=[ - 'chrom', 'chromStart', 'chromEnd', 'ChIPCount', 'score', - 'strand'] - else: - raise ValueError("Unhandled format for {0}".format(input.bed)) - - df = pd.read_table(input.bed, index_col=False, names=names) - df['score'] = df['score'] - df['score'].min() - df['score'] = (df['score'] / df['score'].max()) * 1000 - df['score'] = df['score'].replace([np.inf, -np.inf], np.nan).fillna(0) - df['score'] = df['score'].astype(int) - df.to_csv(output[0] + '.tmp', sep='\t', index=False, header=False) - - shell('bedToBigBed -as={_as} -type={_type} {output}.tmp {input.chromsizes} {output} &> {log}') - shell('rm {output}.tmp') + mem="2g", + runtime="2h", + log: + "{prefix}.bigbed.log", + script: + "../../scripts/bed_to_bigbed.py" rule multibigwigsummary: @@ -765,25 +531,25 @@ rule multibigwigsummary: Summarize the bigWigs across genomic bins """ input: - c.targets['bigwig'] + expand("data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig", label=sampletable.label), output: - npz=c.targets['multibigwigsummary']['npz'], - tab=c.targets['multibigwigsummary']['tab'] + npz="data/chipseq_aggregation/deeptools/multibigwigsummary_matrix.npz", + tab="data/chipseq_aggregation/deeptools/multibigwigsummary.tab", threads: 16 resources: - mem_mb=gb(16), - runtime=autobump(hours=2) + mem="16g", + runtime="2h", run: # from the input files, figure out the sample name. 
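+        # Assumes bigwigs live at data/chipseq_merged/<label>/<file>.bigwig,
+        # so the parent directory name is the label.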
- labels = ' '.join([i.split('/')[-2] for i in input]) + labels = " ".join([i.split("/")[-2] for i in input]) shell( - 'multiBigwigSummary ' - 'bins ' - '-b {input} ' - '--labels {labels} ' - '--numberOfProcessors {threads} ' - '-out {output.npz} ' - '--outRawCounts {output.tab}' + "multiBigwigSummary " + "bins " + "-b {input} " + "--labels {labels} " + "--numberOfProcessors {threads} " + "-out {output.npz} " + "--outRawCounts {output.tab}" ) @@ -792,22 +558,21 @@ rule plotcorrelation: Plot a heatmap of correlations across all samples """ input: - c.targets['multibigwigsummary']['npz'] + npz="data/chipseq_aggregation/deeptools/multibigwigsummary_matrix.npz", output: - heatmap=c.targets['plotcorrelation']['heatmap'], - tab=c.targets['plotcorrelation']['tab'] + tab="data/chipseq_aggregation/deeptools/plotcorrelation.tab", + heatmap="data/chipseq_aggregation/deeptools/correlation_heatmap.png", resources: - mem_mb=gb(2), - runtime=autobump(hours=2) + mem="2g", + runtime="2h", shell: - 'plotCorrelation ' - '--corData {input} ' - '--corMethod spearman ' - '--whatToPlot heatmap ' - '--plotFile {output.heatmap} ' - '--colorMap Reds ' - '--outFileCorMatrix {output.tab}' - + "plotCorrelation " + "--corData {input} " + "--corMethod spearman " + "--whatToPlot heatmap " + "--plotFile {output.heatmap} " + "--colorMap Reds " + "--outFileCorMatrix {output.tab}" # NOTE: if you're expecting negative correlation, try a divergent # colormap and setting the min/max to ensure that the colomap is # centered on zero: @@ -815,45 +580,95 @@ rule plotcorrelation: # '--zMin -1 ' # '--zMax 1 ' -if 'merged_bigwigs' in config: - rule merge_bigwigs: - """ - Merge together bigWigs as specified in the config ("merged_bigwigs" - section). - """ - input: - bigwigs=lambda wc: expand( - c.patterns['bigwig'], - label=config['merged_bigwigs'][wc.merged_bigwig_label], - ), - chromsizes=refdict[c.organism][config['aligner']['tag']]['chromsizes'], - output: - c.patterns['merged_bigwig'] - resources: - mem_mb=gb(16), - runtime=autobump(hours=2) - log: - c.patterns['merged_bigwig'] + '.log' - script: - wrapper_for('average-bigwigs/wrapper.py') rule idxstats: - """ - Run samtools idxstats on sample bams - """ input: - bam=c.patterns['markduplicates']['bam'], - bai=c.patterns['markduplicates']['bam'] + '.bai' + bam="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam", + bai="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.bai", + output: + txt="data/chipseq_samples/{sample}/samtools_idxstats_{sample}.txt", + resources: + mem="16g", + runtime="2h", + log: + "data/chipseq_samples/{sample}/samtools_idxstats_{sample}.txt.log" + shell: + "samtools idxstats {input.bam} 2> {log} 1> {output.txt}" + + +rule flagstat: + input: + bam="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam", + bai="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.bai", + output: + "data/chipseq_samples/{sample}/samtools_flagstat_{sample}.txt", + resources: + mem="8g", + runtime="2h", + log: + "data/chipseq_samples/{sample}/samtools_flagstat_{sample}.txt.log" + shell: + "samtools flagstat {input.bam} > {output}" + + +rule samtools_stats: + input: + bam="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam", + bai="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.bai", output: - txt=c.patterns['samtools']['idxstats'] + "data/chipseq_samples/{sample}/samtools_stats_{sample}.txt", resources: - mem_mb=gb(16), - runtime=autobump(hours=2) - log: - c.patterns['samtools']['idxstats'] 
+ '.log' + mem="8g", + runtime="2h", + log: + "data/chipseq_samples/{sample}/samtools_stats_{sample}.txt.log" + shell: + "samtools stats {input.bam} > {output}" + + +rule multiqc: + input: + expand("data/chipseq_samples/{sample}/{sample}.cutadapt.bam", sample=SAMPLES), + expand("data/chipseq_samples/{sample}/fastqc/{sample}_R1.fastq.gz_fastqc.zip", sample=SAMPLES), + expand("data/chipseq_samples/{sample}/fastqc/{sample}_R1.cutadapt.fastq.gz_fastqc.zip", sample=SAMPLES), + expand("data/chipseq_samples/{sample}/fastqc/{sample}.cutadapt.unique.nodups.bam_fastqc.zip", sample=SAMPLES), + expand("data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig", label=sampletable.label), + expand("data/chipseq_samples/{sample}/samtools_stats_{sample}.txt", sample=SAMPLES), + expand("data/chipseq_samples/{sample}/samtools_flagstat_{sample}.txt", sample=SAMPLES), + expand("data/chipseq_samples/{sample}/samtools_idxstats_{sample}.txt", sample=SAMPLES), + expand("data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", label=sampletable.label), + expand( + "data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.metrics", + ip_label=sampletable.loc[sampletable.antibody != "input", "label"], + ), + expand( + "data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.metrics", + sample=SAMPLES + ) if is_paired else [], + [v["bigbed"] for v in peaks.values()], + "data/chipseq_aggregation/deeptools/plotcorrelation.tab", + "data/chipseq_aggregation/deeptools/multibigwigsummary.tab", + config="config/multiqc_config.yaml", + output: + "data/chipseq_aggregation/multiqc.html", + log: + "data/chipseq_aggregation/multiqc.html.log", + threads: 1 + resources: + mem="2g", + runtime="2h", run: + analysis_directory = "data" + outdir = os.path.dirname(output[0]) + basename = os.path.basename(output[0]) shell( - 'samtools idxstats {input.bam} 2> {log} 1> {output.txt}' + "LC_ALL=en_US.utf8 LC_LANG=en_US.utf8 " + "multiqc " + "--quiet " + "--outdir {outdir} " + "--force " + "--filename {basename} " + "--config {input.config} " + "{analysis_directory} " + "&> {log} " ) - -# vim: ft=python diff --git a/workflows/chipseq/chipseq_trackhub.py b/workflows/chipseq/chipseq_trackhub.py index e2bf9ecb9..d069b0158 100644 --- a/workflows/chipseq/chipseq_trackhub.py +++ b/workflows/chipseq/chipseq_trackhub.py @@ -25,7 +25,6 @@ from trackhub.upload import upload_hub, stage_hub from lib import chipseq -from lib.patterns_targets import ChIPSeqConfig ap = argparse.ArgumentParser() ap.add_argument('config', help='Main config.yaml file') @@ -53,8 +52,6 @@ genome=hub_config['hub']['genome'] ) -c = ChIPSeqConfig(config, os.path.join(os.path.dirname(args.config), 'chipseq_patterns.yaml')) - # Set up subgroups based on unique values from columns specified in the config df = pandas.read_csv(config['sampletable'], comment='#', sep='\t') cols = hub_config['subgroups']['columns'] @@ -82,8 +79,7 @@ SubGroupDefinition( name='algorithm', label='algorithm', mapping={ 'macs2': 'macs2', - 'spp': 'spp', - 'sicer': 'sicer', + 'epic2': 'epic2', 'NA': 'NA', })) @@ -146,8 +142,7 @@ def decide_color(samplename): for label in df['label'].unique(): - # ASSUMPTION: bigwig filename pattern - bigwig = c.patterns['bigwig'].format(label=label) + bigwig = f"data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig" subgroup = df[df.loc[:, 'label'] == label].to_dict('records')[0] subgroup = { diff --git a/workflows/chipseq/config/chipseq_patterns.yaml 
b/workflows/chipseq/config/chipseq_patterns.yaml deleted file mode 100644 index 3e44107a3..000000000 --- a/workflows/chipseq/config/chipseq_patterns.yaml +++ /dev/null @@ -1,67 +0,0 @@ -patterns_by_sample: - - fastq: 'data/chipseq_samples/{sample}/{sample}_R{n}.fastq.gz' - cutadapt: 'data/chipseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz' - bam: 'data/chipseq_samples/{sample}/{sample}.cutadapt.bam' - - fastqc: - raw: 'data/chipseq_samples/{sample}/fastqc/{sample}_R1.fastq.gz_fastqc.zip' - cutadapt: 'data/chipseq_samples/{sample}/fastqc/{sample}_R1.cutadapt.fastq.gz_fastqc.zip' - bam: 'data/chipseq_samples/{sample}/fastqc/{sample}.cutadapt.unique.nodups.bam_fastqc.zip' - - libsizes: - fastq: 'data/chipseq_samples/{sample}/{sample}_R1.fastq.gz.libsize' - cutadapt: 'data/chipseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz.libsize' - bam: 'data/chipseq_samples/{sample}/{sample}.cutadapt.bam.libsize' - unique: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.bam.libsize' - nodups: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.libsize' - - fastq_screen: 'data/chipseq_samples/{sample}/{sample}.cutadapt.screen.txt' - libsizes_table: 'data/chipseq_aggregation/libsizes_table.tsv' - libsizes_yaml: 'data/chipseq_aggregation/libsizes_table_mqc.yaml' - multiqc: 'data/chipseq_aggregation/multiqc.html' - unique: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.bam' - - markduplicates: - bam: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam' - bai: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.bai' - metrics: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.metrics' - - merged_techreps: 'data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam' - - bigwig: 'data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig' - - fingerprint: - plot: 'data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.png' - raw_counts: 'data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.tab' - metrics: 'data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.metrics' - - multibigwigsummary: - npz: 'data/chipseq_aggregation/deeptools/multibigwigsummary_matrix.npz' - tab: 'data/chipseq_aggregation/deeptools/multibigwigsummary.tab' - - plotcorrelation: - tab: 'data/chipseq_aggregation/deeptools/plotcorrelation.tab' - heatmap: 'data/chipseq_aggregation/deeptools/correlation_heatmap.png' - - collectinsertsizemetrics: - pdf: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.pdf' - metrics: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.metrics' - - samtools: - idxstats: 'data/rnaseq_samples/{sample}/idxstat_{sample}.txt' - -patterns_by_peaks: - peaks: - macs2: 'data/chipseq_peaks/macs2/{macs2_run}/peaks.bed' - spp: 'data/chipseq_peaks/spp/{spp_run}/peaks.bed' - sicer: 'data/chipseq_peaks/sicer/{sicer_run}/peaks.bed' - epic2: 'data/chipseq_peaks/epic2/{epic2_run}/peaks.bed' - bigbed: - macs2: 'data/chipseq_peaks/macs2/{macs2_run}/peaks.bigbed' - spp: 'data/chipseq_peaks/spp/{spp_run}/peaks.bigbed' - sicer: 'data/chipseq_peaks/sicer/{sicer_run}/peaks.bigbed' - epic2: 'data/chipseq_peaks/epic2/{epic2_run}/peaks.bigbed' - -patterns_by_aggregate: - merged_bigwig: 'data/chipseq_aggregation/merged_bigwigs/{merged_bigwig_label}.bigwig' diff --git a/workflows/chipseq/config/config.yaml b/workflows/chipseq/config/config.yaml index 591fe13b2..a8d101420 100644 --- 
a/workflows/chipseq/config/config.yaml +++ b/workflows/chipseq/config/config.yaml @@ -39,33 +39,10 @@ chipseq: # merging step of the workflow merges and de-dupes appropriately so that the # peak callers only see BAMs with all duplicates removed. # - # The "extra" block is used to pass extra information to the peak-caller in - # a run-specific manner. Check the wrapper README for details on this. For - # example, the macs2 wrapper passes `extra` verbatim to the command line, but - # the spp wrapper handles things differently. - # # Each wrapper is built to accept either single or multiple BAMs and output # at least a BED file of peaks. # peak_calling: - - label: gaf-embryo-sicer - algorithm: sicer - ip: - - gaf-embryo-1 - control: - - input-embryo-1 - redundancy_threshold: 1 - window_size: 200 - fragment_size: 150 - # optional user-specified override mappable genome proportion if - # specified here, SICER will use this value instead of the value specific - # to the genome build if NOT specified here, SICER will use the - # mappability value for your genome build - effective_genome_fraction: 0.75 - genome_build: dm6 - gap_size: 600 - fdr: 0.01 - - label: gaf-embryo-1 algorithm: macs2 @@ -80,23 +57,6 @@ chipseq: effective_genome_count: 7e7 extra: '--nomodel --extsize 147' - - label: gaf-embryo-1 - algorithm: spp - ip: - - gaf-embryo-1 - control: - - input-embryo-1 - extra: - fdr: 0.3 - zthr: 4 - - - label: gaf-embryo-1-defaults - algorithm: spp - ip: - - gaf-embryo-1 - control: - - input-embryo-1 - - label: gaf-wingdisc-pooled algorithm: macs2 ip: @@ -107,17 +67,6 @@ chipseq: - input-wingdisc-2 extra: '--nomodel --extsize 147' - - label: gaf-wingdisc-pooled - algorithm: spp - ip: - - gaf-wingdisc-1 - - gaf-wingdisc-2 - control: - - input-wingdisc-1 - # - input-wingdisc-2 - extra: - fdr: 0.5 - zthr: 4 - label: gaf-wingdisc-pooled-1 algorithm: epic2 diff --git a/workflows/colocalization/Snakefile b/workflows/colocalization/Snakefile index ac0b0413f..cb5a7991e 100644 --- a/workflows/colocalization/Snakefile +++ b/workflows/colocalization/Snakefile @@ -64,29 +64,22 @@ if ADD_CHIPSEQ_PEAKS: config['beds'][key] = fn -# Number of shufflings for GAT -# N = 100 [TEST_SETTINGS +1] -N = 10000 - targets = expand( '{outdir}/{algorithm}/{domain}/{query}/{query}_vs_{reference}.txt', outdir=config['output'], domain=config['domains'].keys(), query=config['beds'].keys(), reference=config['beds'].keys(), - algorithm=['IntervalStats', 'GAT', 'jaccard', 'fisher'], + algorithm=['IntervalStats', 'jaccard', 'fisher'], ) # Currently-supported options {algorithm: (possible values)} # IntervalStats: (f_05, f_01, f_001) -# GAT: (l2fold, fractions) # jaccard: (jaccard) # fisher: (pval) pattern = '{outdir}/{algorithm}/{domain}/{value}_heatmap.pdf' targets += expand(pattern, outdir=config['output'], domain=config['domains'], algorithm='IntervalStats', value=['f_01']) -targets += expand(pattern, outdir=config['output'], domain=config['domains'], - algorithm='GAT', value=['l2fold']) targets += expand(pattern, outdir=config['output'], domain=config['domains'], algorithm='jaccard', value=['jaccard']) targets += expand(pattern, outdir=config['output'], domain=config['domains'], @@ -216,33 +209,6 @@ rule intervalstats: df.to_csv(str(output[0]), sep='\t', index=False) -rule gat: - input: - domain=lambda wc: config['domains'][getattr(wc, 'domain')], - query=lambda wc: config['beds'][getattr(wc, 'query')], - reference=lambda wc: config['beds'][getattr(wc, 'reference')], - output: 
'{outdir}/GAT/{domain}/{query}/{query}_vs_{reference}.txt' - run: - shell('cut -f1,2,3 {input.query} > {output}.query.tmp') - shell('cut -f1,2,3 {input.reference} > {output}.reference.tmp') - if os.stat(output[0] + '.query.tmp').st_size == 0: - shell('touch {output}') - else: - shell( - 'gat-run.py ' - '--ignore-segment-tracks ' - '--annotations {output}.reference.tmp ' - '--segments {output}.query.tmp ' - '--workspace {input.domain} ' - '--counter nucleotide-overlap ' - '--num-samples {N} ' - '--output-counts-pattern {output}.%s.counts ' - '--log {output}.log ' - '--stdout {output} ' - ) - shell('rm {output}.query.tmp {output}.reference.tmp') - - rule heatmap: input: expand( diff --git a/workflows/external/Snakefile b/workflows/external/Snakefile index 9f8308c9d..79c3d1e2a 100644 --- a/workflows/external/Snakefile +++ b/workflows/external/Snakefile @@ -16,38 +16,14 @@ rule targets: input: list(modencode.keys()), - -rule download_chainfile: - """ - Download the chainfile we need for liftover - """ - output: 'data/dm3ToDm6.over.chain.gz' - shell: - 'wget -O- ' - 'http://hgdownload.cse.ucsc.edu/goldenPath/dm3/liftOver/dm3ToDm6.over.chain.gz ' - '> {output}' - - rule beds: """ - Download URLs, get rid of "track" lines, and then prepare them for liftover + Download URLs, get rid of "track" lines. """ - output: temporary('data/{factor}_{celltype}.bed.dm3') + output: 'data/{factor}_{celltype}.bed' run: - key = str(output[0]).replace('.dm3', '') + key = str(output[0]) url = modencode[key] - shell( - 'wget -O - "{url}" | grep -v "track" > {output}') - -rule liftover: - """ - Perform the liftover - """ - input: - bed='{prefix}.dm3', - chainfile=rules.download_chainfile.output - output: '{prefix}' - shell: - 'liftOver {input.bed} {input.chainfile} {output} {output}.unmapped' + shell('wget -O - "{url}" | grep -v "track" > {output}') # vim: ft=python diff --git a/workflows/references/Snakefile b/workflows/references/Snakefile index d6bc9d0f6..682f1bfe5 100644 --- a/workflows/references/Snakefile +++ b/workflows/references/Snakefile @@ -1,321 +1,277 @@ import os import sys -sys.path.insert(0, srcdir('../..')) -import gzip -import yaml -import importlib -import tempfile import pandas -from snakemake.utils import makedirs -from lib.imports import resolve_name -from lib import utils -from lib.utils import autobump, gb, hours -from lib import aligners, helpers -from lib import common -# Note: when running this workflow on its own (say, to generate all references -# ahead of time) you wil need to provide a config file from the command line. -# -# Otherwise, this file is expected to be `include:`ed into other workflows, -# which will have their own config files. 
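+# When this file is include:-ed by the chipseq/rnaseq workflows, their config is used.
+# Reference files are written under REFERENCES ("reference_dir" in the config; defaults to ../../references).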
+sys.path.insert(0, os.path.dirname(workflow.snakefile) + "/../..") +from lib import utils -config = common.load_config(config) +REFERENCES = config.get("reference_dir", "../../references") -references_dir = common.get_references_dir(config) -refdict, conversion_kwargs = common.references_dict(config) -makedirs([references_dir, os.path.join(references_dir, 'logs')]) +def default_postprocess(origfn, newfn): + shell("mv {origfn} {newfn}") -localrules: symlink_fasta_to_index_dir -wildcard_constraints: - _type="genome|transcriptome|annotation", - _ext="fasta|gtf" +rule fasta: + output: + temporary(f"{REFERENCES}/genome.fa.gz"), + log: + f"{REFERENCES}/logs/genome.fa.gz.log", + resources: + mem_mb="4g", + runtime="2h", + run: + utils.download_and_postprocess( + urls=config["fasta"]["url"], + postprocess=config["fasta"].get("postprocess", None), + outfile=output[0], + log=log, + ) -rule all_references: - input: utils.flatten(refdict) +rule gtf: + output: + temporary(f"{REFERENCES}/annotation.gtf.gz"), + log: + f"{REFERENCES}/logs/annotation.gtf.gz.log", + resources: + mem="4g", + runtime="2h", + run: + utils.download_and_postprocess( + urls=config["gtf"]["url"], + postprocess=config["gtf"].get("postprocess", None), + outfile=output[0], + log=log, + ) -rule download_and_process: - """Downloads the configured URL, applies any configured post-processing, and - saves the resulting gzipped file to *.fasta.gz or *.gtf.gz. - """ +rule rrna: output: - temporary('{references_dir}/{organism}/{tag}/{_type}/{organism}_{tag}.{_ext}.gz') + temporary(f"{REFERENCES}/rrna.fa.gz"), + log: + f"{REFERENCES}/logs/rrna.fa.gz.log", + resources: + mem="4g", + runtime="2h", run: - common.download_and_postprocess(output[0], config, wildcards.organism, wildcards.tag, wildcards._type) + utils.download_and_postprocess( + urls=config["rrna"]["url"], + postprocess=config["rrna"].get("postprocess", None), + outfile=output[0], + log=log, + ) rule unzip: - """Generic rule to unzip files as needed, for example when building - indexes. 
- """ input: - rules.download_and_process.output + f"{REFERENCES}/{{prefix}}.gz", output: - protected('{references_dir}/{organism}/{tag}/{_type}/{organism}_{tag}.{_ext}') - wildcard_constraints: - _type="genome|annotation" - log: - '{references_dir}/logs/{organism}/{tag}/{_type}/{organism}_{tag}.{_ext}.log' - shell: 'gunzip -c {input} > {output}' + f"{REFERENCES}/{{prefix}}", + resources: + mem="4g", + runtime="2h", + shell: + "gunzip -c {input} > {output}" rule bowtie2_index: - """ - Build bowtie2 index - """ input: - '{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fasta' + f"{REFERENCES}/{{label}}.fa", output: - protected(aligners.bowtie2_index_from_prefix('{references_dir}/{organism}/{tag}/genome/bowtie2/{organism}_{tag}')) + multiext( + f"{REFERENCES}/bowtie2/{{label}}", + ".1.bt2", + ".2.bt2", + ".3.bt2", + ".4.bt2", + ".rev.1.bt2", + ".rev.2.bt2", + ".fa", + ), log: - '{references_dir}/logs/{organism}/{tag}/genome/bowtie2/{organism}_{tag}.log' + f"{REFERENCES}/logs/bowtie2_{{label}}.log", resources: - runtime=autobump(hours=8), - mem_mb=autobump(gb=32), - disk_mb=autobump(gb=50) + mem="32g", + disk="50g", + runtime="8h", + threads: 8 run: - prefix = aligners.prefix_from_bowtie2_index(output) - shell( - 'bowtie2-build ' - '{input} ' - '{prefix} ' - '&> {log}') + index = os.path.commonprefix(output).rstrip(".") + shell("bowtie2-build" " --threads {threads}" " {input}" " {index}" " &> {log}") + utils.make_relative_symlink(input[0], output[-1]) rule star_index: input: - fasta='{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fasta', - gtf='{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.gtf', + fasta=f"{REFERENCES}/genome.fa", + gtf=f"{REFERENCES}/annotation.gtf", output: - protected('{references_dir}/{organism}/{tag}/genome/star/{organism}_{tag}/Genome') + f"{REFERENCES}/star/Genome", log: - '{references_dir}/{organism}/{tag}/genome/star/{organism}_{tag}/Genome.log' - threads: - 8 + f"{REFERENCES}/logs/star.log", + threads: 8 resources: - runtime=autobump(hours=8), - mem_mb=gb(64) + mem="64g", + runtime="8h", run: genomedir = os.path.dirname(output[0]) - shell('rm -r {genomedir}') - shell('mkdir -p {genomedir}') + shell("rm -r {genomedir}") + shell("mkdir -p {genomedir}") shell( - 'STAR ' - '--runMode genomeGenerate ' - '--runThreadN {threads} ' - '--genomeDir {genomedir} ' - '--genomeFastaFiles {input.fasta} ' - + "STAR " + "--runMode genomeGenerate " + "--runThreadN {threads} " + "--genomeDir {genomedir} " + "--genomeFastaFiles {input.fasta} " # NOTE: GTF is optional - '--sjdbGTFfile {input.gtf} ' - + "--sjdbGTFfile {input.gtf} " # NOTE: STAR docs say that 100 should work well. - '--sjdbOverhang 100 ' - + "--sjdbOverhang 100 " # NOTE: for small genomes, may need to scale this down to # min(14, log2(GenomeLength) / 2 - 1) # --genomeSAindexNbases 14 - '&> {log}' + "&> {log}" ) # STAR writes a hard-coded Log.out file to the current working # directory. So put that on the end of the log file for the rule and # then clean up. 
- shell('cat {genomedir}/Log.out >> {log} && rm {genomedir}/Log.out') + shell("cat {genomedir}/Log.out >> {log} && rm {genomedir}/Log.out") + shell("ln -s {input.fasta} {genomedir}") rule hisat2_index: - """ - Build HISAT2 index - """ input: - '{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fasta' + f"{REFERENCES}/genome.fa", output: - protected(aligners.hisat2_index_from_prefix('{references_dir}/{organism}/{tag}/genome/hisat2/{organism}_{tag}')) + multiext( + f"{REFERENCES}/hisat2/genome", + ".1.ht2", + ".2.ht2", + ".3.ht2", + ".4.ht2", + ".5.ht2", + ".6.ht2", + ".7.ht2", + ".8.ht2", + ".fa", + ), log: - '{references_dir}/logs/{organism}/{tag}/genome/hisat2/{organism}_{tag}.log' - resources: - runtime=autobump(hours=8), - mem_mb=gb(32), - disk_mb=gb(50) - run: - prefix = aligners.prefix_from_hisat2_index(output) - shell( - 'hisat2-build ' - '{input} ' - '{prefix} ' - '&> {log}') - - -rule symlink_fasta_to_index_dir: - """Aligners often want the reference fasta in the same dir as the index, so - this makes the appropriate symlink - """ - input: - fasta='{references_dir}/{organism}/{tag}/{_type}/{organism}_{tag}.fasta' - output: - '{references_dir}/{organism}/{tag}/{_type}/{index}/{organism}_{tag}.fasta' + f"{REFERENCES}/logs/hisat2.log", resources: - runtime=hours(1) - log: - '{references_dir}/logs/{organism}/{tag}/{_type}/{index}/{organism}_{tag}.fasta.log' + mem="32g", + disk="50g", + runtime="8h", + threads: 8 run: - utils.make_relative_symlink(input[0], output[0]) + index = os.path.commonprefix(output).rstrip(".") + shell("hisat2-build" " --threads {threads}" " {input}" " {index}" " &> {log}") + shell("ln -s {input} {output[-1]}") rule transcriptome_fasta: input: - fasta='{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fasta', - gtf='{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.gtf' + fasta=f"{REFERENCES}/genome.fa", + gtf=f"{REFERENCES}/annotation.gtf", output: - protected('{references_dir}/{organism}/{tag}/transcriptome/{organism}_{tag}.fasta') + f"{REFERENCES}/transcriptome.fa", resources: - runtime=hours(1) + mem="4g", + runtime="2h", shell: - 'gffread {input.gtf} -w {output} -g {input.fasta}' + "gffread {input.gtf} -w {output} -g {input.fasta}" rule salmon_index: - "Build salmon index" - output: - protected('{references_dir}/{organism}/{tag}/transcriptome/salmon/{organism}_{tag}/versionInfo.json') input: - fasta='{references_dir}/{organism}/{tag}/transcriptome/{organism}_{tag}.fasta' + f"{REFERENCES}/transcriptome.fa", + output: + f"{REFERENCES}/salmon/versionInfo.json", log: - '{references_dir}/logs/{organism}/{tag}/transcriptome/salmon/{organism}_{tag}.log' + f"{REFERENCES}/logs/salmon.log", params: - outdir='{references_dir}/{organism}/{tag}/transcriptome/salmon/{organism}_{tag}' + outdir=f"{REFERENCES}/salmon", resources: - mem_mb=gb(32), - runtime=hours(2) - shell: - 'salmon index ' - '--transcripts {input.fasta} ' - '--index {params.outdir} ' - '&> {log}' + mem="32g", + runtime="2h", + run: + outdir = os.path.dirname(output[0]) + shell("salmon index " "--transcripts {input} " "--index {outdir} " "&> {log}") rule kallisto_index: - "Build kallisto index" output: - index=protected('{references_dir}/{organism}/{tag}/transcriptome/kallisto/{organism}_{tag}/transcripts.idx') + f"{REFERENCES}/kallisto/transcripts.idx", input: - fasta='{references_dir}/{organism}/{tag}/transcriptome/{organism}_{tag}.fasta' + f"{REFERENCES}/genome.fa", log: - '{references_dir}/logs/{organism}/{tag}/transcriptome/kallisto/{organism}_{tag}.log' + 
f"{REFERENCES}/logs/kallisto.log", resources: - runtime=hours(2), - mem_mb=gb(32), + mem="32g", + runtime="2h", shell: - 'kallisto index ' - '--index {output.index} ' - '{input.fasta} ' - '&> {log}' + "kallisto index " + "--index {output} " + "{input} " + "&> {log}" rule conversion_refflat: - """Converts a GTF into refFlat format - """ input: - '{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.gtf' + f"{REFERENCES}/annotation.gtf", output: - protected('{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.refflat') + f"{REFERENCES}/annotation.refflat", log: - '{references_dir}/logs/{organism}/{tag}/annotation/{organism}_{tag}.refflat.log' + f"{REFERENCES}/logs/annotation.refflat.log", resources: - runtime=hours(2), - mem_mb=gb(2) + mem="2g", + runtime="2h", shell: - 'gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp ' - '''&& awk '{{print $1"\t"$0}}' {output}.tmp > {output} ''' - '&& rm {output}.tmp ' + "gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp " + """&& awk '{{print $1"\t"$0}}' {output}.tmp > {output} """ + "&& rm {output}.tmp " rule conversion_bed12: input: - '{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.gtf' + f"{REFERENCES}/annotation.gtf", output: - protected('{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.bed12') + f"{REFERENCES}/annotation.bed12", resources: - runtime=hours(2), - mem_mb=gb(2) + mem="2g", + runtime="2h", shell: - 'gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp ' - '&& genePredToBed {output}.tmp {output} ' - '&& rm {output}.tmp' - -rule conversion_gffutils: - """Converts a GTF into a gffutils sqlite3 database - """ - input: - gtf='{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.gtf' - output: - db=protected('{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.gtf.db') - log: - '{references_dir}/logs/{organism}/{tag}/annotation/{organism}_{tag}.gtf.db.log' - resources: - runtime=hours(2), - mem_mb=gb(4) - run: - import gffutils - kwargs = conversion_kwargs[output[0]] - fd, tmpdb = tempfile.mkstemp(suffix='.db', prefix='gffutils_') - db = gffutils.create_db(data=input.gtf, dbfn=tmpdb, **kwargs) - shell('mv {tmpdb} {output.db}') + "gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp " + "&& genePredToBed {output}.tmp {output} " + "&& rm {output}.tmp" rule chromsizes: - """Creates a chromsizes table from fasta - """ input: - '{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fasta' + f"{REFERENCES}/genome.fa", output: - protected('{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.chromsizes') + f"{REFERENCES}/genome.chromsizes", log: - '{references_dir}/logs/{organism}/{tag}/genome/{organism}_{tag}.fasta.log' + f"{REFERENCES}/logs/genome.chromsizes.log", params: # NOTE: Be careful with the memory here; make sure you have enough - # and/or it matches the resources you're requesting in the cluster - # config. 
- java_args='-Xmx20g' + # and/or it matches the resources you're requesting + java_args="-Xmx20g", # java_args='-Xmx2g' # [TEST SETTINGS -1] resources: - mem_mb=gb(24), - runtime=hours(2) + mem="24g", + runtime="2h", shell: - 'export LC_COLLATE=C; ' - 'rm -f {output}.tmp ' - '&& picard ' - '{params.java_args} ' - 'CreateSequenceDictionary R={input} O={output}.tmp &> {log} ' + "export LC_COLLATE=C; " + "rm -f {output}.tmp " + "&& picard " + "{params.java_args} " + "CreateSequenceDictionary R={input} O={output}.tmp &> {log} " '&& grep "^@SQ" {output}.tmp ' - '''| awk '{{print $2, $3}}' ''' + """| awk '{{print $2, $3}}' """ '| sed "s/SN://g;s/ LN:/\\t/g" ' - '| sort -k1,1 > {output} ' - '&& rm -f {output}.tmp ' - - -rule genelist: - """Creates a list of unique gene names in the GTF - """ - input: - gtf='{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.gtf' - output: - protected('{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.genelist') - resources: - runtime=hours(1), - mem_mb=gb(2) - run: - attribute = conversion_kwargs[output[0]]['gene_id'] - import gffutils - genes = set() - for feature in gffutils.DataIterator(input.gtf): - genes.update(feature.attributes[attribute]) - with open(output[0], 'w') as fout: - for feature in sorted(list(set(genes))): - fout.write(feature + '\n') + "| sort -k1,1 > {output} " + "&& rm -f {output}.tmp " rule mappings: @@ -323,14 +279,16 @@ rule mappings: Creates gzipped TSV mapping between attributes in the GTF. """ input: - gtf='{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.gtf' + gtf=f"{REFERENCES}/annotation.gtf", output: - protected('{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.mapping.tsv.gz') + f"{REFERENCES}/annotation.mapping.tsv.gz", params: - include_featuretypes=lambda wildcards, output: conversion_kwargs[output[0]].get('include_featuretypes', []) + include_featuretypes=lambda wildcards, output: conversion_kwargs[ + output[0] + ].get("include_featuretypes", []), resources: - runtime=hours(2), - mem_mb=gb(2) + mem="2g", + runtime="2h", run: import gffutils @@ -350,7 +308,7 @@ rule mappings: continue d = dict(f.attributes) - d['__featuretype__'] = ft + d["__featuretype__"] = ft res.append(d) df = pandas.DataFrame(res) @@ -359,9 +317,7 @@ rule mappings: # include_featuretypes settings, this may take a while. df = df.drop_duplicates() - df.to_csv(output[0], sep='\t', index=False, compression='gzip') + df.to_csv(output[0], sep="\t", index=False, compression="gzip") # Restore original setting gffutils.constants.always_return_list = orig_setting - -# vim: ft=python diff --git a/workflows/references/config/config.yaml b/workflows/references/config/config.yaml deleted file mode 100644 index 49618dcd0..000000000 --- a/workflows/references/config/config.yaml +++ /dev/null @@ -1,6 +0,0 @@ -references_dir: 'references_dir' - -# See the reference config files in the top level of the repo, -# include/reference_configs, for inspiration for more species. 
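Editor's note: the mappings rule above writes one row per feature, carrying that feature's GTF attributes plus a __featuretype__ column. An illustrative way to consume the table downstream; apart from __featuretype__, the column names depend on the attributes actually present in the GTF, so gene_id and gene_symbol here are assumptions.

    import pandas as pd

    # Path follows the flat REFERENCES layout used in this diff; adjust as needed.
    mapping = pd.read_csv("../../references/annotation.mapping.tsv.gz", sep="\t")

    # e.g. keep gene-level rows and build a gene_id -> gene_symbol lookup
    # ('gene_id'/'gene_symbol' are illustrative attribute names).
    genes = mapping[mapping["__featuretype__"] == "gene"]
    lookup = genes.set_index("gene_id")["gene_symbol"].to_dict()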
-include_references: - - '../../include/reference_configs/test.yaml' diff --git a/workflows/references/run_test.sh b/workflows/references/run_test.sh deleted file mode 100755 index 7aacb413c..000000000 --- a/workflows/references/run_test.sh +++ /dev/null @@ -1,3 +0,0 @@ -set -e -python -m doctest ../../ci/preprocessor.py -python ../../ci/preprocessor.py Snakefile > Snakefile.test && snakemake -s Snakefile.test "$@" diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index f9a11c4d8..7247bbc2b 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -1,244 +1,222 @@ import sys - -sys.path.insert(0, srcdir('../..')) import os -from textwrap import dedent import yaml -import tempfile import pandas as pd -from lib import common, utils, helpers, aligners -from lib.utils import autobump, gb, hours -from lib.patterns_targets import RNASeqConfig -# ---------------------------------------------------------------------------- -# -# Search for the string "NOTE:" to look for points of configuration that might -# be helpful for your experiment. -# -# ---------------------------------------------------------------------------- +sys.path.insert(0, os.path.dirname(workflow.snakefile) + "/../..") +from lib import utils -if not workflow.overwrite_configfiles: - configfile: 'config/config.yaml' -config = common.load_config(config) +configfile: "config/config.yaml" -include: '../references/Snakefile' -# Verify configuration of config and sampletable files -helpers.preflight(config) +REFERENCES = config.get("reference_dir", "../../references") +sampletable = pd.read_table(config["sampletable"], sep="\t", comment="#") +sampletable = sampletable.set_index(sampletable.columns[0], drop=False) +is_paired = utils.detect_layout(sampletable) == "PE" +n = ["1", "2"] if is_paired else ["1"] +SAMPLES = sampletable.index -c = RNASeqConfig(config, config.get('patterns', 'config/rnaseq_patterns.yaml')) -SAMPLES = c.sampletable.iloc[:, 0].values wildcard_constraints: - n = '[1,2]', - sample = '|'.join(SAMPLES) - - -def wrapper_for(path): - return 'file:' + os.path.join('../..','wrappers', 'wrappers', path) - - -# ---------------------------------------------------------------------------- -# RULES -# ---------------------------------------------------------------------------- - -# See "patterns and targets" in the documentation for what's going on here. 
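Editor's note: utils.detect_layout() and utils.detect_sra() replace checks that the old Snakefile did inline or via RNASeqConfig; neither helper is shown in this diff. detect_sra() presumably mirrors the removed inline test visible just below (a 'Run' column containing SRR accessions); how detect_layout() decides PE vs SE is not visible anywhere in the diff, so the version here is purely an assumption.

    # Sketch only -- the real implementations live in lib/utils.py, which is
    # not part of this diff.
    def detect_sra(sampletable):
        # Mirrors the removed inline check: a 'Run' column with SRR accessions.
        return (
            "Run" in sampletable.columns
            and sampletable["Run"].astype(str).str.startswith("SRR").any()
        )


    def detect_layout(sampletable):
        # Assumption: paired-end is inferred from a second-read filename column;
        # the actual heuristic may differ (e.g. a dedicated layout column).
        return "PE" if "orig_filename_R2" in sampletable.columns else "SE"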
-final_targets = utils.flatten(( - utils.flatten(c.targets['fastqc']), - [c.targets['fastq_screen']], - [c.targets['rrna_percentages_table']], - [c.targets['multiqc']], - utils.flatten(c.targets['featurecounts']), - utils.flatten(c.targets['markduplicates']), - utils.flatten(c.targets['salmon']), - utils.flatten(c.targets['kallisto']), - utils.flatten(c.targets['preseq']), - utils.flatten(c.targets['rseqc']), - utils.flatten(c.targets['collectrnaseqmetrics']), - utils.flatten(c.targets['bigwig']), - utils.flatten(c.targets['samtools']), -)) - -if config.get('merged_bigwigs', None): - final_targets.extend(utils.flatten(c.targets['merged_bigwig'])) + n="[1,2]", + sample="|".join(SAMPLES), -def render_r1_r2(pattern, r1_only=False): - return expand(pattern, sample='{sample}', n=c.n) +localrules: + symlinks, + symlink_targets, -def r1_only(pattern): - return expand(pattern, sample='{sample}', n=1) -rule targets: - """ - Final targets to create - """ - input: final_targets +rule all: + input: + "data/rnaseq_aggregation/multiqc.html", -if 'orig_filename' in c.sampletable.columns: - localrules: symlinks, symlink_targets +include: "../references/Snakefile" - # Convert the sampletable to be indexed by the first column, for - # convenience in generating the input/output filenames. - _st = c.sampletable.set_index(c.sampletable.columns[0]) - def orig_for_sample(wc): - """ - Given a sample, returns either one or two original fastq files - depending on whether the library was single- or paired-end. - """ - if c.is_paired: - return _st.loc[wc.sample, ['orig_filename', 'orig_filename_R2']] - return _st.loc[wc.sample, ['orig_filename']] +if utils.detect_sra(sampletable): + sampletable["orig_filename"] = expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", sample=SAMPLES, n=1 + ) + if is_paired: + sampletable["orig_filename_R2"] = expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", + sample=SAMPLES, + n=2, + ) - rule symlinks: - """ - Symlinks files over from original filename - """ - input: - orig_for_sample + rule fastq_dump: output: - render_r1_r2(c.patterns['fastq']) - threads: 1 + fastq=expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", + n=n, + allow_missing=True, + ), + log: + "original_data/sra_samples/{sample}/{sample}.fastq.gz.log", + params: + is_paired=is_paired, + # extra="-X 100000", # [enable for test] resources: - mem_mb=100, - runtime=10, + mem="1g", + disk="1g", + runtime="2h", run: - assert len(output) == len(input), (input, output) - for src, linkname in zip(input, output): - utils.make_relative_symlink(src, linkname) - + srr = sampletable.loc[wildcards.sample, "Run"] + extra = params.get("extra", "") + if is_paired: + shell("fastq-dump {srr} --gzip --split-files {extra} &> {log}") + shell("mv {srr}_1.fastq.gz {output[0]}") + shell("mv {srr}_2.fastq.gz {output[1]}") + else: + shell( + "fastq-dump {srr} -Z {extra} 2> {log} | gzip -c > {output[0]}.tmp" + ) + shell("mv {output[0]}.tmp {output[0]}") - rule symlink_targets: - input: c.targets['fastq'] -if 'Run' in c.sampletable.columns and sum(c.sampletable['Run'].str.startswith('SRR')) > 0: +rule symlinks: + input: + lambda wc: ( + sampletable.loc[wc.sample, ["orig_filename", "orig_filename_R2"]] + if is_paired + else sampletable.loc[wc.sample, ["orig_filename"]] + ), + output: + expand("data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz", n=n), + threads: 1 + resources: + mem="1g", + runtime="10m", + run: + assert len(output) == len(input), (input, output) + for src, linkname in 
zip(input, output): + utils.make_relative_symlink(src, linkname) - # Convert the sampletable to be indexed by the first column, for - # convenience in generating the input/output filenames. - _st = c.sampletable.set_index(c.sampletable.columns[0]) - rule fastq_dump: - output: - fastq=render_r1_r2(c.patterns['fastq']) - log: - r1_only(c.patterns['fastq'])[0] + '.log' - params: - is_paired=c.is_paired, - sampletable=_st, - # limit = 100000, # [TEST SETTINGS] - resources: - mem_mb=gb(1), - disk_mb=autobump(gb=1), - runtime=autobump(hours=2) - conda: - '../../wrappers/wrappers/fastq-dump/environment.yaml' - script: - wrapper_for('fastq-dump/wrapper.py') +rule symlink_targets: + input: + expand( + "data/rnaseq_samples/{sample}/{sample}_R{n}.fastq.gz", sample=SAMPLES, n=n + ), -# This can be set at the command line with --config strand_check_reads=1000 -config.setdefault('strand_check_reads', 1e5) +# Optionally run ``snakemake strand_check`` to do a preliminary run on +# automatically-subset data to evaluate strandedness. rule sample_strand_check: input: - fastq=common.fill_r1_r2(c.sampletable, c.patterns['fastq']), - index=[c.refdict[c.organism][config['aligner']['tag']]['bowtie2']], - bed12=c.refdict[c.organism][config['gtf']['tag']]['bed12'] + fastq=expand("data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz", n=n), + index=expand(rules.bowtie2_index.output, label="genome"), + bed12=rules.conversion_bed12.output, output: - strandedness=c.patterns['strand_check']['tsv'], - bam=temporary(c.patterns['strand_check']['bam']), - idx=temporary(c.patterns['strand_check']['bam'] + '.bai'), - fastqs=temporary(render_r1_r2(c.patterns['strand_check']['fastq'])), + strandedness="strand_check/{sample}/{sample}.strandedness", + bam=temporary("strand_check/{sample}/{sample}.strandedness.bam"), + bai=temporary("strand_check/{sample}/{sample}.strandedness.bam.bai"), + fastqs=temporary( + expand( + "strand_check/{sample}/{sample}_R{n}.strandedness.fastq", + n=n, + allow_missing=True, + ) + ), log: - c.patterns['strand_check']['tsv'] + '.log' + "strand_check/{sample}/{sample}.strandedness.log", threads: 6 resources: - mem_mb=gb(8), - runtime=autobump(hours=2) + mem="8g", + runtime="2h", run: - prefix = aligners.prefix_from_bowtie2_index(input.index) - nreads = int(config['strand_check_reads']) * 4 - if c.is_paired: - assert len(input.fastq) == 2 - assert len(output.fastqs) == 2 - shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}') - shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[1]}') - fastqs = f'-1 {output.fastqs[0]} -2 {output.fastqs[1]} ' + prefix = os.path.commonprefix(input.index).rstrip(".") + nreads = int(1e5 * 4) + if is_paired: + shell( + "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}" + ) + shell( + "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[1]}" + ) + fastqs = f"-1 {output.fastqs[0]} -2 {output.fastqs[1]} " else: - assert len(input.fastq) == 1 - assert len(output.fastqs) == 1 - shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}') - fastqs = f'-U {output.fastqs[0]} ' + shell( + "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}" + ) + fastqs = f"-U {output.fastqs[0]} " shell( "bowtie2 " "-x {prefix} " "{fastqs} " - '--no-unal ' + "--no-unal " "--threads {threads} 2> {log} " "| samtools view -Sb - " "| samtools sort - -o {output.bam} " ) shell("samtools index {output.bam}") shell( - 
'infer_experiment.py -r {input.bed12} -i {output.bam} > {output} 2> {log}' + "infer_experiment.py -r {input.bed12} -i {output.bam} > {output} 2> {log}" ) + rule strand_check: input: - expand(c.patterns['strand_check']['tsv'], sample=SAMPLES) + expand("strand_check/{sample}/{sample}.strandedness", sample=SAMPLES), output: - html='strand_check/strandedness.html', - filelist=temporary('strand_check/filelist') + html="strand_check/strandedness.html", + filelist=temporary("strand_check/filelist"), log: - 'strand_check/strandedness.log' + "strand_check/strandedness.log", resources: - mem_mb=gb(1), - runtime=autobump(hours=2) + mem="1g", + runtime="2h", run: - with open(output.filelist, 'w') as fout: - for i in input: - fout.write(i + '\n') + with open(output.filelist, "w") as fout: + for i in input: + fout.write(i + "\n") shell( - 'multiqc ' - '--force ' - '--module rseqc ' - '--file-list {output.filelist} ' - '--filename {output.html} &> {log}' + "multiqc " + "--force " + "--module rseqc " + "--file-list {output.filelist} " + "--filename {output.html} &> {log}" ) rule cutadapt: - """ - Run cutadapt - """ input: - fastq=render_r1_r2(c.patterns['fastq']) + fastq=expand("data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz", n=n), output: - fastq=render_r1_r2(c.patterns['cutadapt']) + fastq=expand( + "data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.cutadapt.fastq.gz", n=n + ), log: - render_r1_r2(c.patterns['cutadapt'])[0] + '.log' + "data/rnaseq_samples/{sample}/{sample}_cutadapt.fastq.gz.log", threads: 6 resources: - mem_mb=gb(2), - runtime=autobump(hours=2) + mem="2g", + runtime="2h", + params: + extra=( + ( + "--nextseq-trim 20 " + "--overlap 6 " + "--minimum-length 25 " + "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " + ) + + "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " + if is_paired + else "" + ), run: - - # NOTE: Change cutadapt params here - if c.is_paired: + if is_paired: shell( "cutadapt " "-o {output[0]} " "-p {output[1]} " - "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " - "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " - "--nextseq-trim 20 " - "--overlap 6 " - '-j {threads} ' - '--minimum-length 25 ' + "-j {threads} " + "{params.extra} " "{input.fastq[0]} " "{input.fastq[1]} " "&> {log}" @@ -247,68 +225,81 @@ rule cutadapt: shell( "cutadapt " "-o {output[0]} " - "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " - "--nextseq-trim 20 " - "--overlap 6 " - '-j {threads} ' - '--minimum-length 25 ' + "-j {threads} " + "{params.extra} " "{input.fastq[0]} " "&> {log}" ) rule fastqc: - """ - Run FastQC - """ input: - '{sample_dir}/{sample}/{sample}{suffix}' - threads: - 6 + "data/rnaseq_samples/{sample}/{sample}{suffix}", + threads: 1 output: - html='{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.html', - zip='{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.zip', + html="data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.html", + zip="data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.zip", resources: - mem_mb=gb(8), - runtime=autobump(hours=2) - script: - wrapper_for('fastqc/wrapper.py') + mem="8g", + runtime="2h", + log: + "data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.log", + run: + outdir = os.path.dirname(output.html) or "." 
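+        # FastQC names its report after the input file (minus .fastq/.fq/.gz/.bam
+        # extensions), which may not match the declared rule outputs; the mv
+        # commands after the shell() call below rename them when they differ.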
+ shell( + "fastqc " + "--noextract " + "--quiet " + "--outdir {outdir} " + "{input} " + "2> {log} " + ) + outfile = os.path.basename(input[0]) + for s in [".fastq", ".fq", ".gz", ".bam"]: + outfile = outfile.replace(s, "") + out_zip = os.path.join(outdir, outfile + "_fastqc.zip") + if not os.path.abspath(out_zip) == os.path.abspath(output.zip): + shell("mv {out_zip} {output.zip}") + out_html = os.path.join(outdir, outfile + "_fastqc.html") + if not os.path.abspath(out_html) == os.path.abspath(output.html): + shell("mv {out_html} {output.html}") + +if config["aligner"] == "hisat2": -if config['aligner']['index'] == 'hisat2': rule hisat2: - """ - Map reads with HISAT2 - """ input: - fastq=common.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), - index=[c.refdict[c.organism][config['aligner']['tag']]['hisat2']] + fastq=rules.cutadapt.output, + index=rules.hisat2_index.output, output: - bam=temporary(c.patterns['bam']) + bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam"), log: - c.patterns['bam'] + '.log' - threads: 6 + "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.log", + threads: 16 resources: - mem_mb=gb(32), - runtime=autobump(hours=8) + mem="32g", + runtime="8h", + params: + extra="", run: - prefix = aligners.prefix_from_bowtie2_index(input.index) - sam = output.bam.replace('.bam', '.sam') + prefix = os.path.commonprefix(input.index).rstrip(".") + sam = output.bam.replace(".bam", ".sam") - if c.is_paired: + if is_paired: assert len(input.fastq) == 2 - fastqs = '-1 {0} -2 {1} '.format(*input.fastq) + fastqs = "-1 {0} -2 {1} ".format(*input.fastq) else: assert len(input.fastq) == 1 - fastqs = '-U {0} '.format(input.fastq) + fastqs = "-U {0} ".format(input.fastq) shell( "hisat2 " "-x {prefix} " "{fastqs} " - '--no-unal ' + "--no-unal " "--threads {threads} " "-S {sam} " + "{params.extra} " "> {log} 2>&1" ) @@ -318,109 +309,127 @@ if config['aligner']['index'] == 'hisat2': "&& rm {sam}" ) -if config['aligner']['index'].startswith('star'): + +if config["aligner"].startswith("star"): + if os.getenv("TMPDIR"): + tmpdir_arg = "--outTmpDir $TMPDIR/star " + else: + tmpdir_arg = "" # STAR can be run in 1-pass or 2-pass modes. Since we may be running it # more than once in almost the same way, we pull out the shell command here # and use it below. STAR_CMD = ( - 'STAR ' - '--runThreadN {threads} ' - '--genomeDir {genomedir} ' - '--readFilesIn {input.fastq} ' - '--readFilesCommand zcat ' - '--outFileNamePrefix {prefix} ' - + "STAR " + "--runThreadN {threads} " + "--genomeDir {genomedir} " + "--readFilesIn {input.fastq} " + "--readFilesCommand zcat " + "--outFileNamePrefix {prefix} " + "{tmpdir_arg} " + "{params.extra} " + ) + STAR_PARAMS = ( # NOTE: The STAR docs indicate that the following parameters are # standard options for ENCODE long-RNA-seq pipeline. Comments are from # the STAR docs. 
- '--outFilterType BySJout ' # reduces number of spurious junctions - '--outFilterMultimapNmax 20 ' # if more than this many multimappers, consider unmapped - '--alignSJoverhangMin 8 ' # min overhang for unannotated junctions - '--alignSJDBoverhangMin 1 ' # min overhang for annotated junctions - '--outFilterMismatchNmax 999 ' # max mismatches per pair - '--outFilterMismatchNoverReadLmax 0.04 ' # max mismatches per pair relative to read length - '--alignIntronMin 20 ' # min intron length - '--alignIntronMax 1000000 ' # max intron length - '--alignMatesGapMax 1000000 ' # max distance between mates - '--outSAMunmapped None ' # do not report aligned reads in output + "--outFilterType BySJout " # reduces number of spurious junctions + "--outFilterMultimapNmax 20 " # if more than this many multimappers, consider unmapped + "--alignSJoverhangMin 8 " # min overhang for unannotated junctions + "--alignSJDBoverhangMin 1 " # min overhang for annotated junctions + "--outFilterMismatchNmax 999 " # max mismatches per pair + "--outFilterMismatchNoverReadLmax 0.04 " # max mismatches per pair relative to read length + "--alignIntronMin 20 " # min intron length + "--alignIntronMax 1000000 " # max intron length + "--alignMatesGapMax 1000000 " # max distance between mates + "--outSAMunmapped None " # do not report aligned reads in output ) - logfile_extensions = ['Log.progress.out', 'Log.out', 'Log.final.out', 'Log.std.out'] + logfile_extensions = ["Log.progress.out", "Log.out", "Log.final.out", "Log.std.out"] -if config['aligner']['index'] == 'star': +if config["aligner"] == "star": rule star: - """ - Align with STAR (1-pass mode) - """ + "Align with STAR (1-pass mode)" input: - fastq=common.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), - index=[c.refdict[c.organism][config['aligner']['tag']]['star']], - annotation=c.refdict[c.organism][config['gtf']['tag']]['annotation'], + fastq=rules.cutadapt.output, + index=rules.star_index.output, + annotation=f"{REFERENCES}/annotation.gtf", output: - bam=temporary(c.patterns['bam']), - sjout=temporary(c.patterns['bam'].replace('.bam', '.star.SJ.out.tab')), + bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam"), + sjout=temporary( + "data/rnaseq_samples/{sample}/{sample}.cutadapt.star.SJ.out.tab" + ), log: - c.patterns['bam'].replace('.bam', '.star.bam.log') + "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.log", threads: 16 resources: - mem_mb=gb(64), - runtime=autobump(hours=8) + mem="64g", + runtime="8h", + disk="80g", + params: + extra=STAR_PARAMS, run: genomedir = os.path.dirname(input.index[0]) outdir = os.path.dirname(output[0]) - prefix = output.bam.replace('.bam', '.star.') + prefix = output.bam.replace(".bam", ".star.") shell( - STAR_CMD + ( - '--outSAMtype BAM SortedByCoordinate ' - '--outStd BAM_SortedByCoordinate > {output.bam} ' - '2> {log} ' + STAR_CMD + + ( + "--outSAMtype BAM SortedByCoordinate " + "--outStd BAM_SortedByCoordinate > {output.bam} " + "2> {log} " ) ) # move various hard-coded log files to log directory - logfiles = expand(prefix + '{ext}', ext=logfile_extensions) - shell('mkdir -p {outdir}/star_logs ' - '&& mv {logfiles} {outdir}/star_logs') + logfiles = expand(prefix + "{ext}", ext=logfile_extensions) + shell( + "mkdir -p {outdir}/star_logs " "&& mv {logfiles} {outdir}/star_logs" + ) + -if config['aligner']['index'] == 'star-twopass': +if config["aligner"] == "star-twopass": rule star_pass1: - """ - First pass of alignment with STAR to get the junctions - """ + "First pass of alignment with STAR to get the 
junctions" input: - fastq=common.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), - index=[c.refdict[c.organism][config['aligner']['tag']]['star']], - annotation=c.refdict[c.organism][config['gtf']['tag']]['annotation'], + fastq=rules.cutadapt.output, + index=rules.star_index.output, + annotation=f"{REFERENCES}/annotation.gtf", output: - sjout=temporary(c.patterns['bam'].replace('.bam', '.star-pass1.SJ.out.tab')), + sjout=temporary( + "data/rnaseq_samples/{sample}/{sample}.cutadapt.star.SJ.out.tab" + ), log: - c.patterns['bam'].replace('.bam', '.star-pass1.bam.log') + "data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass1.log", threads: 16 resources: - mem_mb=gb(64), - runtime=autobump(hours=8) + mem="64g", + runtime="8h", + disk="80g", + params: + extra=STAR_PARAMS, run: genomedir = os.path.dirname(input.index[0]) outdir = os.path.dirname(output[0]) - prefix = output.sjout.replace('SJ.out.tab', '') + prefix = output.sjout.replace("SJ.out.tab", "") shell( - STAR_CMD + - ( + STAR_CMD + + ( # In this first pass, we don't actually care about the # alignment -- just the detected junctions. So we output # the SAM to /dev/null. - '--outStd SAM > /dev/null ' - '2> {log} ' + "--outStd SAM > /dev/null " + "2> {log} " ) ) # move various hard-coded log files to log directory - logfiles = expand(prefix + '{ext}', ext=logfile_extensions) - shell('mkdir -p {outdir}/star-pass1_logs ' - '&& mv {logfiles} {outdir}/star-pass1_logs') - + logfiles = expand(prefix + "{ext}", ext=logfile_extensions) + shell( + "mkdir -p {outdir}/star-pass1_logs " + "&& mv {logfiles} {outdir}/star-pass1_logs" + ) rule star_pass2: """ @@ -428,71 +437,88 @@ if config['aligner']['index'] == 'star-twopass': samples to get the final BAM """ input: - sjout=expand(c.patterns['bam'].replace('.bam', '.star-pass1.SJ.out.tab'), sample=SAMPLES), - fastq=common.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), - index=[c.refdict[c.organism][config['aligner']['tag']]['star']], - annotation=c.refdict[c.organism][config['gtf']['tag']]['annotation'], + fastq=rules.cutadapt.output, + index=rules.star_index.output, + annotation=f"{REFERENCES}/annotation.gtf", + sjout=expand(rules.star_pass1.output, sample=SAMPLES), output: - bam=temporary(c.patterns['bam']), - sjout=temporary(c.patterns['bam'].replace('.bam', '.star-pass2.SJ.out.tab')), + bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam"), + sjout=temporary( + "data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass2.SJ.out.tab" + ), log: - c.patterns['bam'].replace('.bam', '.star-pass2.bam.log') + "data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass2.log", threads: 16 resources: - mem_mb=gb(64), - runtime=autobump(hours=8) + mem="64g", + runtime="8h", + disk="80g", + params: + extra=STAR_PARAMS, run: genomedir = os.path.dirname(input.index[0]) outdir = os.path.dirname(output[0]) - prefix = output.bam.replace('.bam', '.star-pass2.') + prefix = output.bam.replace(".bam", ".star-pass2.") shell( - STAR_CMD + ( + STAR_CMD + + ( # In contrast to pass 1, we will be keeping these BAMs -- # so sort them - '--outSAMtype BAM SortedByCoordinate ' - + "--outSAMtype BAM SortedByCoordinate " # Splice junction databases from all samples in the first # pass. 
- '--sjdbFileChrStartEnd {input.sjout} ' - '--outStd BAM_SortedByCoordinate > {output.bam} ' - '2> {log} ' + "--sjdbFileChrStartEnd {input.sjout} " + "--outStd BAM_SortedByCoordinate > {output.bam} " + "2> {log} " ) ) # move various hard-coded log files to log directory - logfiles = expand(prefix + '{ext}', ext=logfile_extensions) - shell('mkdir -p {outdir}/star-pass2_logs ' - '&& mv {logfiles} {outdir}/star-pass2_logs') + logfiles = expand(prefix + "{ext}", ext=logfile_extensions) + shell( + "mkdir -p {outdir}/star-pass2_logs " + "&& mv {logfiles} {outdir}/star-pass2_logs" + ) - shell('rm -r {prefix}_STARgenome') + shell("rm -r {prefix}_STARgenome") rule rRNA: - """ - Map reads with bowtie2 to the rRNA reference - """ input: - fastq=r1_only(c.patterns['cutadapt']), - index=[c.refdict[c.organism][config['rrna']['tag']]['bowtie2']] + fastq="data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz", + index=multiext( + f"{REFERENCES}/bowtie2/rrna", + ".1.bt2", + ".2.bt2", + ".3.bt2", + ".4.bt2", + ".rev.1.bt2", + ".rev.2.bt2", + ".fa", + ), output: - bam=temporary(c.patterns['rrna']['bam']) + bam="data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam", log: - c.patterns['rrna']['bam'] + '.log' + "data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam.log", threads: 6 resources: - mem_mb=gb(2), - runtime=autobump(hours=2) + mem="2g", + runtime="2h", + params: + extra=( + "-k 1 " + "--no-unal " + ), run: - prefix = aligners.prefix_from_bowtie2_index(input.index) - sam = output.bam.replace('.bam', '.sam') + prefix = os.path.commonprefix(input.index).rstrip(".") + sam = output.bam.replace(".bam", ".sam") shell( "bowtie2 " "-x {prefix} " "-U {input.fastq} " - '-k 1 ' # NOTE: we only care if >=1 mapped - '--no-unal ' # NOTE: suppress unaligned reads "--threads {threads} " + "{params.extra} " "-S {sam} " "> {log} 2>&1" ) @@ -505,618 +531,472 @@ rule rRNA: rule fastq_count: - """ - Count reads in a FASTQ file - """ input: - fastq='{sample_dir}/{sample}/{sample}{suffix}.fastq.gz' + fastq="{sample_dir}/{sample}/{sample}{suffix}.fastq.gz", output: - '{sample_dir}/{sample}/{sample}{suffix}.fastq.gz.libsize' + "{sample_dir}/{sample}/{sample}{suffix}.fastq.gz.libsize", threads: 1 resources: - mem_mb=gb(1), - runtime=autobump(hours=2) + mem="1g", + runtime="2h", shell: - 'zcat {input} | echo $((`wc -l`/4)) > {output}' + "zcat {input} | echo $((`wc -l`/4)) > {output}" rule bam_count: - """ - Count reads in a BAM file - """ input: - bam='{sample_dir}/{sample}/{suffix}.bam' + bam="{sample_dir}/{sample}/{suffix}.bam", output: - '{sample_dir}/{sample}/{suffix}.bam.libsize' + "{sample_dir}/{sample}/{suffix}.bam.libsize", threads: 1 resources: - mem_mb=gb(2), - runtime=autobump(hours=2) + mem="2g", + runtime="2h", shell: - 'samtools view -c {input} > {output}' + "samtools view -c {input} > {output}" rule bam_index: - """ - Index a BAM - """ input: - bam='{prefix}.bam' + bam="{prefix}.bam", output: - bai='{prefix}.bam.bai' + bai="{prefix}.bam.bai", threads: 1 resources: - mem_mb=gb(2), - runtime=autobump(hours=2) + mem="2g", + runtime="2h", shell: - 'samtools index {input} {output}' - + "samtools index {input} {output}" -def fastq_screen_references(): - """ - Returns the Bowtie2 indexes for the configured references from the - `fastq_screen:` section of the config - """ - refs = {} - for i in config['fastq_screen']: - refs[i['label']] = c.refdict[i['organism']][i['tag']]['bowtie2'] - return refs - -rule fastq_screen: - """ - Run fastq_screen to look for contamination from other genomes - """ +rule 
markduplicates: input: - **fastq_screen_references(), - fastq=r1_only(rules.cutadapt.output.fastq), + bam="data/rnaseq_samples/{sample}/{sample}.cutadapt.bam", output: - txt=c.patterns['fastq_screen'] + bam="data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam", + metrics="data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.metrics", log: - c.patterns['fastq_screen'] + '.log' - threads: 6 + "data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.log", + threads: 1 resources: - mem_mb=gb(4), - runtime=autobump(hours=2) - params: subset=100000 - script: - wrapper_for('fastq_screen/wrapper.py') + mem="32g", + runtime="2h", + disk="100g", + params: + java_args="-Xmx20g", # [disable for test] + # java_args='-Xmx2g' # [enable for test] + shell: + "picard " + "{params.java_args} " + "MarkDuplicates " + "INPUT={input.bam} " + "OUTPUT={output.bam} " + "METRICS_FILE={output.metrics} " + "VALIDATION_STRINGENCY=LENIENT " + "&> {log}" rule featurecounts: - """ - Count reads in annotations with featureCounts from the subread package - """ input: - annotation=c.refdict[c.organism][config['gtf']['tag']]['annotation'], - bam=c.targets['markduplicates']['bam'] + annotation=rules.gtf.output, + bam=rules.markduplicates.output.bam, output: - counts='{sample_dir}/rnaseq_aggregation/featurecounts.txt' + "data/rnaseq_samples/{sample}/{sample}_featurecounts.txt", log: - '{sample_dir}/rnaseq_aggregation/featurecounts.txt.log' + "data/rnaseq_samples/{sample}/{sample}_featurecounts.txt.log", threads: 8 resources: - mem_mb=gb(16), - runtime=autobump(hours=2) + mem="16g", + runtime="2h", + params: + strand_arg={ + "unstranded": "-s0 ", + "fr-firststrand": "-s2 ", + "fr-secondstrand": "-s1 ", + }[config["stranded"]], + extra="", run: - # NOTE: By default, we use -p for paired-end - p_arg = '' - if c.is_paired: - p_arg = '-p --countReadPairs ' - - strand_arg = helpers.strand_arg_lookup( - c, { - 'unstranded': '-s0 ', - 'fr-firststrand': '-s2 ', - 'fr-secondstrand': '-s1 ', - } - ) - + p_arg = "" + if is_paired: + p_arg = "-p --countReadPairs " shell( - 'featureCounts ' - '{strand_arg} ' - '{p_arg} ' - '-T {threads} ' - '-a {input.annotation} ' - '-o {output.counts} ' - '{input.bam} ' - '&> {log}' + "featureCounts " + "{params.strand_arg} " + "{p_arg} " + "-T {threads} " + "-a {input.annotation} " + "-o {output} " + "{input.bam} " + "&> {log}" ) -rule rrna_libsizes_table: - """ - Aggregate rRNA counts into a table - """ +rule aggregate_featurecounts: input: - rrna=c.targets['rrna']['libsize'], - fastq=c.targets['libsizes']['cutadapt'] + expand( + "data/rnaseq_samples/{sample}/{sample}_featurecounts.txt", sample=SAMPLES + ), output: - json=c.patterns['rrna_percentages_yaml'], - tsv=c.patterns['rrna_percentages_table'] + "data/rnaseq_aggregation/featurecounts.txt", + log: + "data/rnaseq_aggregation/featurecounts.txt.log", threads: 1 resources: - mem_mb=gb(2), - runtime=autobump(hours=2) + mem="8g", + runtime="1h" run: - def rrna_sample(f): - return helpers.extract_wildcards(c.patterns['rrna']['libsize'], f)['sample'] - - def sample(f): - return helpers.extract_wildcards(c.patterns['libsizes']['cutadapt'], f)['sample'] - - def million(f): - return float(open(f).read()) / 1e6 - - rrna = sorted(input.rrna, key=rrna_sample) - fastq = sorted(input.fastq, key=sample) - samples = list(map(rrna_sample, rrna)) - rrna_m = list(map(million, rrna)) - fastq_m = list(map(million, fastq)) - - df = pd.DataFrame(dict( - sample=samples, - million_reads_rRNA=rrna_m, - million_reads_fastq=fastq_m, - )) - df = 
df.set_index('sample') - df['rRNA_percentage'] = df.million_reads_rRNA / df.million_reads_fastq * 100 - - df[['million_reads_fastq', 'million_reads_rRNA', 'rRNA_percentage']].to_csv(output.tsv, sep='\t') - y = { - 'id': 'rrna_percentages_table', - 'section_name': 'rRNA content', - 'description': 'Amount of reads mapping to rRNA sequence', - 'plot_type': 'table', - 'pconfig': { - 'id': 'rrna_percentages_table_table', - 'title': 'rRNA content table', - 'min': 0 - }, - 'data': yaml.load(df.transpose().to_json(), Loader=yaml.FullLoader), - } - with open(output.json, 'w') as fout: - yaml.dump(y, fout, default_flow_style=False) + for i, file in enumerate(input): + df = pd.read_csv(file, sep="\t", comment="#") + df = df.set_index("Geneid", drop=False) + if i == 0: + final = df + continue + final[df.columns[-1]] = df[df.columns[-1]] + final.to_csv(output[0], sep="\t", index=False) -rule multiqc: - """ - Aggregate various QC stats and logs into a single HTML report with MultiQC - """ - # NOTE: if you add more rules and want MultiQC to pick up the output, then - # add outputs from those rules to the inputs here. +rule rrna_libsizes_table: input: - files=( - utils.flatten(c.targets['fastqc']) + - utils.flatten(c.targets['rrna_percentages_yaml']) + - utils.flatten(c.targets['cutadapt']) + - utils.flatten(c.targets['featurecounts']) + - utils.flatten(c.targets['markduplicates']) + - utils.flatten(c.targets['salmon']) + - utils.flatten(c.targets['rseqc']) + - utils.flatten(c.targets['fastq_screen']) + - utils.flatten(c.targets['preseq']) + - utils.flatten(c.targets['collectrnaseqmetrics']) + - utils.flatten(c.targets['samtools']) + rrna=expand( + "data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam.libsize", + sample=SAMPLES, + ), + fastq=expand( + "data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz.libsize", + sample=SAMPLES, ), - config='config/multiqc_config.yaml' - output: c.targets['multiqc'] - log: c.targets['multiqc'][0] + '.log' - threads: 1 - resources: - mem_mb=gb(2), - runtime=autobump(hours=2) - run: - analysis_directory = set([os.path.dirname(i) for i in input]) - outdir = os.path.dirname(c.targets['multiqc'][0]) - basename = os.path.basename(c.targets['multiqc'][0]) - shell( - 'LC_ALL=en_US.utf8 LC_LANG=en_US.utf8 ' - 'multiqc ' - '--quiet ' - '--outdir {outdir} ' - '--force ' - '--filename {basename} ' - '--config {input.config} ' - '{analysis_directory} ' - '&> {log} ' - ) - - -rule markduplicates: - """ - Mark or remove PCR duplicates with Picard MarkDuplicates - """ - input: - bam=c.patterns['bam'] output: - bam=c.patterns['markduplicates']['bam'], - metrics=c.patterns['markduplicates']['metrics'] - log: - c.patterns['markduplicates']['bam'] + '.log' - params: - # NOTE: Be careful with the memory here; make sure you have enough - # and/or it matches the resources you're requesting in the cluster - # config. 
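Editor's note: the aggregate_featurecounts loop above keys each per-sample table on Geneid and copies over only the last column, which featureCounts names after the input BAM; annotation columns come from the first table. A toy illustration of that merge logic with hypothetical two-gene tables:

    import pandas as pd

    # Hypothetical per-sample featureCounts tables: the leading columns are
    # annotation fields, the last column holds counts and is named after the BAM.
    s1 = pd.DataFrame({"Geneid": ["g1", "g2"], "Length": [500, 800],
                       "sample1.cutadapt.markdups.bam": [10, 0]})
    s2 = pd.DataFrame({"Geneid": ["g1", "g2"], "Length": [500, 800],
                       "sample2.cutadapt.markdups.bam": [3, 7]})

    final = None
    for i, df in enumerate([s1, s2]):
        df = df.set_index("Geneid", drop=False)
        if i == 0:
            final = df
            continue
        # Copy only the count column; rows align on the Geneid index.
        final[df.columns[-1]] = df[df.columns[-1]]

    print(final[["Geneid", "sample1.cutadapt.markdups.bam",
                 "sample2.cutadapt.markdups.bam"]])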
- java_args='-Xmx20g' - # java_args='-Xmx2g' # [TEST SETTINGS -1] + tsv="data/rnaseq_aggregation/rrna_percentages_table.tsv", + json="data/rnaseq_aggregation/rrna_percentages_table_mqc.yaml", threads: 1 + params: + rrna_pattern=lambda wc: "data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam.libsize", + fastq_pattern=lambda wc: "data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz.libsize", resources: - mem_mb=gb(32), - runtime=autobump(hours=2), - disk_mb=autobump(gb=100), - shell: - 'picard ' - '{params.java_args} ' - 'MarkDuplicates ' - 'INPUT={input.bam} ' - 'OUTPUT={output.bam} ' - 'METRICS_FILE={output.metrics} ' - 'VALIDATION_STRINGENCY=LENIENT ' - '&> {log}' + mem="2g", + runtime="2h", + script: + "../../scripts/rrna_libsizes_table.py" rule collectrnaseqmetrics: - """ - Calculate various RNA-seq QC metrics with Picarc CollectRnaSeqMetrics - """ input: - bam=c.patterns['markduplicates']['bam'], - refflat=c.refdict[c.organism][config['gtf']['tag']]['refflat'] + bam=rules.markduplicates.output.bam, + refflat=rules.conversion_refflat.output, output: - metrics=c.patterns['collectrnaseqmetrics']['metrics'], - params: - # NOTE: Be careful with the memory here; make sure you have enough - # and/or it matches the resources you're requesting in the cluster - # config. - java_args='-Xmx20g' - # java_args='-Xmx2g' # [TEST SETTINGS -1] + metrics="data/rnaseq_samples/{sample}/{sample}.collectrnaseqmetrics.metrics", log: - c.patterns['collectrnaseqmetrics']['metrics'] + '.log' + "data/rnaseq_samples/{sample}/{sample}.collectrnaseqmetrics.metrics.log", threads: 1 resources: - mem_mb=gb(32), - runtime=autobump(hours=2) + mem="32g", + runtime="2h", + params: + java_args="-Xmx20g", # [disable for test] + # java_args='-Xmx2g', # [enable for test] + strand_arg={ + "unstranded": "STRAND=NONE ", + "fr-firststrand": "STRAND=SECOND_READ_TRANSCRIPTION_STRAND ", + "fr-secondstrand": "STRAND=FIRST_READ_TRANSCRIPTION_STRAND ", + }[config["stranded"]], run: - strand_arg = helpers.strand_arg_lookup( - c, { - 'unstranded': 'STRAND=NONE ', - 'fr-firststrand': 'STRAND=SECOND_READ_TRANSCRIPTION_STRAND ', - 'fr-secondstrand': 'STRAND=FIRST_READ_TRANSCRIPTION_STRAND ', - } - ) shell( - 'picard ' - '{params.java_args} ' - 'CollectRnaSeqMetrics ' - '{strand_arg} ' - 'VALIDATION_STRINGENCY=LENIENT ' - 'REF_FLAT={input.refflat} ' - 'INPUT={input.bam} ' - 'OUTPUT={output.metrics} ' - '&> {log}' + "picard " + "{params.java_args} " + "CollectRnaSeqMetrics " + "{params.strand_arg} " + "VALIDATION_STRINGENCY=LENIENT " + "REF_FLAT={input.refflat} " + "INPUT={input.bam} " + "OUTPUT={output.metrics} " + "&> {log}" ) rule preseq: - """ - Compute a library complexity curve with preseq - """ input: - bam=c.patterns['bam'] + bam="data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam", output: - c.patterns['preseq'] + "data/rnaseq_samples/{sample}/{sample}_preseq_c_curve.txt", + log: + "data/rnaseq_samples/{sample}/{sample}_preseq_c_curve.txt.log", threads: 1 resources: - mem_mb=gb(1), - runtime=autobump(hours=2) + mem="1g", + runtime="2h", shell: - 'preseq ' - 'c_curve ' - '-B {input} ' - '-o {output} ' + "preseq " + "c_curve " + "-B {input} " + "-o {output} " + "&> {log}" rule salmon: - """ - Quantify reads coming from transcripts with Salmon - """ input: - fastq=common.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), - index=c.refdict[c.organism][config['salmon']['tag']]['salmon'], + fastq=rules.cutadapt.output, + index=REFERENCES + "/salmon/versionInfo.json", output: - c.patterns['salmon'] - params: - 
index_dir=os.path.dirname(c.refdict[c.organism][config['salmon']['tag']]['salmon']), - outdir=os.path.dirname(c.patterns['salmon']) + "data/rnaseq_samples/{sample}/{sample}.salmon/quant.sf", log: - c.patterns['salmon'] + '.log' + "data/rnaseq_samples/{sample}/{sample}.salmon/quant.sf.log", threads: 6 resources: - mem_mb=gb(32), - runtime=autobump(hours=2) + mem="32g", + runtime="2h", + params: + extra=( + "--libType=A " + "--gcBias " + "--seqBias " + "--validateMappings " + ), run: - if c.is_paired: - fastq_arg = f'-1 {input.fastq[0]} -2 {input.fastq[1]} ' + outdir = os.path.dirname(output[0]) + index_dir = os.path.dirname(input.index) + if is_paired: + fastq_arg = f"-1 {input.fastq[0]} -2 {input.fastq[1]} " else: - fastq_arg = f'-r {input.fastq} ' + fastq_arg = f"-r {input.fastq} " shell( - 'salmon quant ' - '--index {params.index_dir} ' - '--output {params.outdir} ' - '--threads {threads} ' - - # NOTE: --libType=A auto-detects library type. Change if needed. - '--libType=A ' - - # NOTE: Docs suggest using --gcBias, --validateMappings, and - # --seqBias is a good idea - '--gcBias ' - '--seqBias ' - '--validateMappings ' - '{fastq_arg} ' - '&> {log}' + "salmon quant " + "--index {index_dir} " + "--output {outdir} " + "--threads {threads} " + "{params.extra} " + "{fastq_arg} " + "&> {log}" ) rule kallisto: - """ - Quantify reads coming from transcripts with Kallisto - """ input: - fastq=common.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), - index=c.refdict[c.organism][config['kallisto']['tag']]['kallisto'], + fastq=rules.cutadapt.output, + index=REFERENCES + "/kallisto/transcripts.idx", output: - c.patterns['kallisto'] - params: - index_dir=os.path.dirname(c.refdict[c.organism][config['kallisto']['tag']]['kallisto']), - outdir=os.path.dirname(c.patterns['kallisto']) + "data/rnaseq_samples/{sample}/{sample}.kallisto/abundance.h5", log: - c.patterns['kallisto'] + '.log' - threads: - 8 + "data/rnaseq_samples/{sample}/{sample}.kallisto/abundance.h5.log", + threads: 8 resources: - mem_mb=gb(32), - runtime=autobump(hours=2), + mem="32g", + runtime="2h", + params: + strand_arg={ + "unstranded": "", + "fr-firststrand": "--rf-stranded", + "fr-secondstrand": "--fr-stranded", + }[config["stranded"]], + extra=( + "--bootstrap-samples 100" + if is_paired + else "--single --fragment-length 300 --sd 20 --bootstrap-samples 100" + ), run: - if c.is_paired: - se_args = '' - assert len(input.fastq) == 2 - else: - # For single-end, add the experimentally-determined fragment length - # and standard deviation here - se_args = '--single --fragment-length 300 --sd 20 ' - assert len(input.fastq) == 1 - - strand_arg = helpers.strand_arg_lookup( - c, { - 'unstranded': '', - 'fr-firststrand': '--rf-stranded', - 'fr-secondstrand': '--fr-stranded', - } - ) - + outdir = os.path.dirname(output[0]) shell( - 'kallisto quant ' - '--index {input.index} ' - '--output-dir {params.outdir} ' - '--threads {threads} ' - '--bootstrap-samples 100 ' - '--bias ' - '--threads {threads} ' - '{se_args} ' - '{strand_arg} ' - '{input.fastq} ' - '&> {log}' + "kallisto quant " + "--index {input.index} " + "--output-dir {outdir} " + "--threads {threads} " + "--bootstrap-samples 100 " + "--threads {threads} " + "{params.strand_arg} " + "{params.extra} " + "{input.fastq} " + "&> {log}" ) + rule rseqc_infer_experiment: - """ - Infer strandedness of experiment - """ input: - bam=c.patterns['markduplicates']['bam'], - bed12=c.refdict[c.organism][config['gtf']['tag']]['bed12'] + bam=rules.markduplicates.output, + 
bed12=rules.conversion_bed12.output, output: - txt=c.patterns['rseqc']['infer_experiment'] + "data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt", log: - c.patterns['rseqc']['infer_experiment'] + '.log' + "data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt.log", resources: - mem_mb=gb(2), - runtime=autobump(hours=2) - + mem="2g", + runtime="2h", shell: - 'infer_experiment.py -r {input.bed12} -i {input.bam} > {output} &> {log}' + "infer_experiment.py -r {input.bed12} -i {input.bam} > {output} &> {log}" + rule rseqc_read_distribution: - """ - read distribution plots - """ input: - bam=c.patterns['markduplicates']['bam'], - bed12=c.refdict[c.organism][config['gtf']['tag']]['bed12'], + bam=rules.markduplicates.output, + bed12=rules.conversion_bed12.output, output: - txt=c.patterns['rseqc']['read_distribution'] + "data/rnaseq_samples/{sample}/rseqc/{sample}_read_distribution.txt", log: - c.patterns['rseqc']['read_distribution'] + '.log' + "data/rnaseq_samples/{sample}/rseqc/{sample}_read_distribution.txt.log", resources: - mem_mb=gb(2), - runtime=autobump(hours=2) + mem="2g", + runtime="2h", shell: - 'read_distribution.py -i {input.bam} -r {input.bed12} > {output} &> {log}' + "read_distribution.py -i {input.bam} -r {input.bed12} > {output} &> {log}" -rule idxstats: - """ - Run samtools idxstats on sample bams - """ +rule samtools_idxstats: input: - bam=c.patterns['markduplicates']['bam'], - bai=c.patterns['markduplicates']['bam'] + '.bai' + bam=rules.markduplicates.output.bam, + bai=rules.markduplicates.output.bam + ".bai", output: - txt=c.patterns['samtools']['idxstats'] - log: - c.patterns['samtools']['idxstats'] + '.log' + "data/rnaseq_samples/{sample}/idxstat_{sample}.txt", + log: + "data/rnaseq_samples/{sample}/idxstat_{sample}.txt.log", resources: - mem_mb=gb(16), - runtime=autobump(hours=2) - run: - shell( - 'samtools idxstats {input.bam} 2> {log} 1> {output.txt}' - ) + mem="16g", + runtime="2h", + shell: + "samtools idxstats {input.bam} 2> {log} 1> {output}" -# Common arguments used for bamCoverage rules below -BAMCOVERAGE_ARGS = ( - '--minMappingQuality 20 ' # excludes multimappers - '--smoothLength 10 ' # smooth signal with specified window - # '--normalizeUsing BPM ' # equivalent to TPM # [TEST SETTINGS] -) +rule samtools_flagstat: + input: + bam=rules.markduplicates.output.bam, + bai=rules.markduplicates.output.bam + ".bai", + output: + "data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.flagstat", + log: + "data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.flagstat.log", + resources: + mem="8g", + runtime="2h", + shell: + "samtools flagstat {input.bam} > {output}" + + +rule samtools_stats: + input: + bam=rules.markduplicates.output.bam, + bai=rules.markduplicates.output.bam + ".bai", + output: + "data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.stats", + log: + "data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.stats.log", + resources: + mem="8g", + runtime="2h", + shell: + "samtools stats {input.bam} > {output}" + rule bigwig_neg: - """ - Create a bigwig for negative-strand reads - """ input: - bam=c.patterns['markduplicates']['bam'], - bai=c.patterns['markduplicates']['bam'] + '.bai', - output: c.patterns['bigwig']['neg'] + bam=rules.markduplicates.output.bam, + bai=rules.markduplicates.output.bam + ".bai", + output: + "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig", threads: 8 resources: - mem_mb=gb(16), - runtime=autobump(hours=2) + mem="16g", + runtime="2h", log: - c.patterns['bigwig']['neg'] + 
'.log' + "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig.log", + params: + strand_arg={ + "unstranded": "", + "fr-firststrand": "--filterRNAstrand reverse ", + "fr-secondstrand": "--filterRNAstrand forward ", + }[config["stranded"]], + extra=( + "--minMappingQuality 20 " + "--smoothLength 10 " + "--normalizeUsing BPM " # [disable for test] + ), run: - strand_arg = helpers.strand_arg_lookup( - c, { - 'unstranded': '', - 'fr-firststrand': '--filterRNAstrand reverse ', - 'fr-secondstrand': '--filterRNAstrand forward ', - } - ) shell( - 'bamCoverage ' - '--bam {input.bam} ' - '-o {output} ' - '-p {threads} ' - '{BAMCOVERAGE_ARGS} ' - '{strand_arg} ' - '&> {log}' + "bamCoverage " + "--bam {input.bam} " + "-o {output} " + "-p {threads} " + "{params.extra} " + "{params.strand_arg} " + "&> {log}" ) rule bigwig_pos: - """ - Create a bigwig for postive-strand reads. - """ input: - bam=c.patterns['markduplicates']['bam'], - bai=c.patterns['markduplicates']['bam'] + '.bai', - output: c.patterns['bigwig']['pos'] + bam=rules.markduplicates.output.bam, + bai=rules.markduplicates.output.bam + ".bai", + output: + "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig", threads: 8 resources: - mem_mb=gb(16), - runtime=autobump(hours=2) + mem="16g", + runtime="2h", log: - c.patterns['bigwig']['pos'] + '.log' - + "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig.log", + params: + strand_arg={ + "unstranded": "", + "fr-firststrand": "--filterRNAstrand forward ", + "fr-secondstrand": "--filterRNAstrand reverse ", + }[config["stranded"]], + extra=( + "--minMappingQuality 20 " + "--smoothLength 10 " + "--normalizeUsing BPM " # [disable for test] + ), run: - strand_arg = helpers.strand_arg_lookup( - c, { - 'unstranded': '', - 'fr-firststrand': '--filterRNAstrand forward ', - 'fr-secondstrand': '--filterRNAstrand reverse ', - } - ) shell( - 'bamCoverage ' - '--bam {input.bam} ' - '-o {output} ' - '-p {threads} ' - '{BAMCOVERAGE_ARGS} ' - '{strand_arg} ' - '&> {log}' + "bamCoverage " + "--bam {input.bam} " + "-o {output} " + "-p {threads} " + "{params.extra} " + "{params.strand_arg} " + "&> {log}" ) -def bigwigs_to_merge(wc): - chunk = config['merged_bigwigs'][wc.merged_bigwig_label] - neg_labels = chunk.get('neg', []) - pos_labels = chunk.get('pos', []) - pos_bigwigs = expand( - c.patterns['bigwig']['pos'], - sample=pos_labels - ) - neg_bigwigs = expand( - c.patterns['bigwig']['neg'], - sample=neg_labels) - return pos_bigwigs + neg_bigwigs - -if 'merged_bigwigs' in config: - rule merge_bigwigs: - """ - Merge together bigWigs as specified in the config ("merged_bigwigs" - section). - """ - input: - bigwigs=bigwigs_to_merge, - chromsizes=refdict[c.organism][config['aligner']['tag']]['chromsizes'], - output: - c.patterns['merged_bigwig'] - log: - c.patterns['merged_bigwig'] + '.log' - resources: - mem_mb=gb(16), - runtime=autobump(hours=2) - script: - wrapper_for('average-bigwigs/wrapper.py') - - -rule rnaseq_rmarkdown: - """ - Run and render the RMarkdown file that performs differential expression - """ - input: - featurecounts=utils.flatten(c.targets['featurecounts']), - salmon=utils.flatten(c.targets['salmon']), - - # NOTE: the Rmd will likely need heavy editing depending on the project. 
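Editor's note: with helpers.strand_arg_lookup gone, the mapping from config["stranded"] to per-tool flags is now repeated inline in several rules (featurecounts, collectrnaseqmetrics, kallisto, bigwig_pos/bigwig_neg). Collected in one place for reference, as those per-rule params dicts spell it out:

    # Values copied from the per-rule params in this Snakefile.
    STRAND_FLAGS = {
        "unstranded": {
            "featurecounts": "-s0",
            "collectrnaseqmetrics": "STRAND=NONE",
            "kallisto": "",
            "bigwig_pos": "",
            "bigwig_neg": "",
        },
        "fr-firststrand": {
            "featurecounts": "-s2",
            "collectrnaseqmetrics": "STRAND=SECOND_READ_TRANSCRIPTION_STRAND",
            "kallisto": "--rf-stranded",
            "bigwig_pos": "--filterRNAstrand forward",
            "bigwig_neg": "--filterRNAstrand reverse",
        },
        "fr-secondstrand": {
            "featurecounts": "-s1",
            "collectrnaseqmetrics": "STRAND=FIRST_READ_TRANSCRIPTION_STRAND",
            "kallisto": "--fr-stranded",
            "bigwig_pos": "--filterRNAstrand reverse",
            "bigwig_neg": "--filterRNAstrand forward",
        },
    }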
- rmd='downstream/rnaseq.Rmd', - sampletable=config['sampletable'] - output: - 'downstream/rnaseq.html' - log: - 'downstream/rnaseq.log' - shell: - 'Rscript -e ' - '''"rmarkdown::render('{input.rmd}')" ''' - '> {log} 2>&1' - # [TEST_SETTINGS -1] - -rule flagstat: - input: - bam=c.patterns['markduplicates']['bam'], - bai=c.patterns['markduplicates']['bam'] + '.bai' - output: - c.patterns['samtools']['flagstat'] - log: - c.patterns['samtools']['flagstat'] + '.log' - shell: - 'samtools flagstat {input.bam} > {output}' - - -rule samtools_stats: +rule multiqc: input: - bam=c.patterns['markduplicates']['bam'], - bai=c.patterns['markduplicates']['bam'] + '.bai' + files=( + expand( + rules.fastqc.output.zip, + sample=SAMPLES, + suffix=["_R1.fastq.gz", "_R1.cutadapt.fastq.gz", ".cutadapt.bam"], + ), + expand(rules.markduplicates.output, sample=SAMPLES), + expand(rules.salmon.output, sample=SAMPLES), + expand(rules.kallisto.output, sample=SAMPLES), + expand(rules.preseq.output, sample=SAMPLES), + expand(rules.collectrnaseqmetrics.output, sample=SAMPLES), + expand(rules.samtools_stats.output, sample=SAMPLES), + expand(rules.samtools_flagstat.output, sample=SAMPLES), + expand(rules.samtools_idxstats.output, sample=SAMPLES), + expand(rules.rseqc_infer_experiment.output, sample=SAMPLES), + expand(rules.rseqc_read_distribution.output, sample=SAMPLES), + expand(rules.bigwig_pos.output, sample=SAMPLES), + expand(rules.bigwig_neg.output, sample=SAMPLES), + rules.rrna_libsizes_table.output, + ), + config="config/multiqc_config.yaml", output: - c.patterns['samtools']['stats'] + "data/rnaseq_aggregation/multiqc.html", log: - c.patterns['samtools']['stats'] + '.log' - shell: - 'samtools stats {input.bam} > {output}' - - - -# vim: ft=python + "data/rnaseq_aggregation/multiqc.log", + threads: 1 + resources: + mem="2g", + runtime="2h", + disk="10g", + run: + analysis_directory = set([os.path.dirname(i) for i in input]) + outdir = os.path.dirname(output[0]) + basename = os.path.basename(output[0]) + shell( + "LC_ALL=en_US.utf8 LC_LANG=en_US.utf8 " + "multiqc " + "--quiet " + "--outdir {outdir} " + "--force " + "--filename {basename} " + "--config {input.config} " + "{analysis_directory} " + "&> {log} " + ) diff --git a/workflows/rnaseq/config/config.yaml b/workflows/rnaseq/config/config.yaml index 7b0db18da..2cbd3d66c 100644 --- a/workflows/rnaseq/config/config.yaml +++ b/workflows/rnaseq/config/config.yaml @@ -1,59 +1,27 @@ +fasta: + url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/seq/dm6.small.fa" + postprocess: 'lib.utils.gzipped' + +gtf: + url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/annotation/dm6.small.gtf" + postprocess: 'lib.utils.gzipped' + +rrna: + url: + - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_LSURef_tax_silva_trunc.fasta.gz' + - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_SSURef_Nr99_tax_silva_trunc.fasta.gz' + postprocess: + function: 'lib.utils.filter_fastas' + args: 'Drosophila melanogaster' + + sampletable: 'config/sampletable.tsv' patterns: 'config/rnaseq_patterns.yaml' -# Which key in the `references` dict below to use -organism: 'dmel' - -# If not specified here, use the environment variable REFERENCES_DIR. -references_dir: 'references_data' - # See https://rnabio.org/module-09-appendix/0009/12/01/StrandSettings for more info. 
stranded: 'fr-firststrand' # for dUTP libraries # 'fr-secondstrand' # for ligation libraries # 'unstranded' # for libraries without strand specificity -aligner: - index: 'star' - tag: 'test' - -rrna: - index: 'bowtie2' - tag: 'rRNA' - -gtf: - tag: "test" - -salmon: - tag: "test" - -kallisto: - tag: "test" - -fastq_screen: - - label: rRNA - organism: dmel - tag: test - - label: Fly - organism: dmel - tag: test - -merged_bigwigs: - control_pos: - pos: - - sample1 - - sample2 - treatment_all: - pos: - - sample3 - - sample4 - neg: - - sample3 - - sample4 - -# See the reference config files in the top level of the repo, -# include/reference_configs, for inspiration for more species. - -include_references: - - '../../include/reference_configs/test.yaml' - - '../../include/reference_configs/Drosophila_melanogaster.yaml' +aligner: 'star' diff --git a/workflows/rnaseq/config/multiqc_config.yaml b/workflows/rnaseq/config/multiqc_config.yaml index 3e291495e..0fe650a73 100644 --- a/workflows/rnaseq/config/multiqc_config.yaml +++ b/workflows/rnaseq/config/multiqc_config.yaml @@ -53,7 +53,6 @@ module_order: - '*.cutadapt.fastq.gz_fastqc.zip' path_filters: - '*.fastq.gz_fastqc.zip' - - libsizes_table - rrna_percentages_table - cutadapt - fastqc: diff --git a/workflows/rnaseq/config/rnaseq_patterns.yaml b/workflows/rnaseq/config/rnaseq_patterns.yaml index 5379d0dcb..356811259 100644 --- a/workflows/rnaseq/config/rnaseq_patterns.yaml +++ b/workflows/rnaseq/config/rnaseq_patterns.yaml @@ -3,6 +3,7 @@ strand_check: bam: 'strand_check/{sample}/{sample}.strandedness.bam' tsv: 'strand_check/{sample}/{sample}.strandedness' fastq: 'data/rnaseq_samples/{sample}/{sample}_R{n}.fastq.gz' +sra_fastq: 'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz' cutadapt: 'data/rnaseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz' bam: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam' fastqc: @@ -14,7 +15,9 @@ libsizes: cutadapt: 'data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz.libsize' bam: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.libsize' fastq_screen: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.screen.txt' -featurecounts: 'data/rnaseq_aggregation/featurecounts.txt' +featurecounts: + per_sample: 'data/rnaseq_samples/{sample}/{sample}_featurecounts.txt' + aggregated: 'data/rnaseq_aggregation/featurecounts.txt' libsizes_table: 'data/rnaseq_aggregation/libsizes_table.tsv' libsizes_yaml: 'data/rnaseq_aggregation/libsizes_table_mqc.yaml' rrna_percentages_table: 'data/rnaseq_aggregation/rrna_percentages_table.tsv' diff --git a/workflows/rnaseq/rnaseq_trackhub.py b/workflows/rnaseq/rnaseq_trackhub.py index 912735744..6fe17f80e 100644 --- a/workflows/rnaseq/rnaseq_trackhub.py +++ b/workflows/rnaseq/rnaseq_trackhub.py @@ -9,8 +9,6 @@ """ import os -import sys -sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) import re from pprint import pprint import pandas @@ -22,8 +20,6 @@ from trackhub.upload import upload_hub, stage_hub import argparse -from lib.patterns_targets import RNASeqConfig - ap = argparse.ArgumentParser() ap.add_argument('config', help='Main config.yaml file') ap.add_argument('hub_config', help='Track hub config YAML file') @@ -41,7 +37,6 @@ for cfg in args.additional_configs: update_config(config, yaml.load(open(cfg), Loader=yaml.FullLoader)) -c = RNASeqConfig(config, os.path.join(os.path.dirname(args.config), 'rnaseq_patterns.yaml')) hub, genomes_file, genome, trackdb = default_hub( hub_name=hub_config['hub']['name'], @@ -126,7 +121,7 @@ def 
decide_color(samplename): for direction in 'pos', 'neg': # ASSUMPTION: bigwig filename pattern - bigwig = c.patterns['bigwig'][direction].format(sample=sample) + bigwig = f"data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.{direction}.bigwig" subgroup = df[df.iloc[:, 0] == sample].to_dict('records')[0] subgroup = { diff --git a/wrappers/.gitignore b/wrappers/.gitignore deleted file mode 100644 index ede3cddab..000000000 --- a/wrappers/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -.test* -__pycache__ -.snakemake -.cache -**.snakemake* diff --git a/wrappers/LICENSE b/wrappers/LICENSE deleted file mode 100644 index 17b3ab770..000000000 --- a/wrappers/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2016 lcdb - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/wrappers/README.md b/wrappers/README.md deleted file mode 100644 index 79d134e91..000000000 --- a/wrappers/README.md +++ /dev/null @@ -1 +0,0 @@ -See documentation at http://lcdb-wf.readthedocs.io/en/latest/wrappers.html diff --git a/wrappers/test/conftest.py b/wrappers/test/conftest.py deleted file mode 100644 index d346905e1..000000000 --- a/wrappers/test/conftest.py +++ /dev/null @@ -1,10 +0,0 @@ -import os -import pytest -import tempfile -import shutil -import inspect -from snakemake.shell import shell -from snakemake.utils import makedirs -from lcdblib.snakemake import aligners - -from raw_data_fixtures import * diff --git a/wrappers/test/raw_data_fixtures.py b/wrappers/test/raw_data_fixtures.py deleted file mode 100644 index c19f8601b..000000000 --- a/wrappers/test/raw_data_fixtures.py +++ /dev/null @@ -1,180 +0,0 @@ -""" -Fixtures used for downloading data from the test data repo -""" - -import os -import pytest -from utils import tmpdir_for_func, _download_file, symlink_in_tempdir, run, dpath - -# ---------------------------------------------------------------------------- -# FASTQ files -@pytest.fixture(scope='session') -def sample1_se_fq(tmpdir_factory): - d = tmpdir_for_func(tmpdir_factory) - fn = 'rnaseq_samples/sample1/sample1.small_R1.fastq.gz' - return _download_file(fn, d) - -@pytest.fixture(scope='session') -def sample1_se_tiny_fq(tmpdir_factory): - """ - Single-end FASTQ file with 1010 reads - """ - d = tmpdir_for_func(tmpdir_factory) - fn = 'rnaseq_samples/sample1/sample1.tiny_R1.fastq.gz' - return _download_file(fn, d) - -@pytest.fixture(scope='session') -def sample1_pe_fq(tmpdir_factory): - pair = [] - d = tmpdir_for_func(tmpdir_factory) - for fn in [ - 'rnaseq_samples/sample1/sample1.small_R1.fastq.gz', - 
'rnaseq_samples/sample1/sample1.small_R2.fastq.gz' - ]: - pair.append(_download_file(fn, d)) - return pair - -@pytest.fixture(scope='session') -def sample1_pe_tiny_fq(tmpdir_factory): - pair = [] - d = tmpdir_for_func(tmpdir_factory) - for fn in [ - 'rnaseq_samples/sample1/sample1.tiny_R1.fastq.gz', - 'rnaseq_samples/sample1/sample1.tiny_R2.fastq.gz' - ]: - pair.append(_download_file(fn, d)) - return pair - -# ---------------------------------------------------------------------------- -# BAM files - -@pytest.fixture(scope='session') -def sample1_se_bam(tmpdir_factory): - d = tmpdir_for_func(tmpdir_factory) - fn = 'rnaseq_samples/sample1/sample1.small.single.sorted.bam' - return _download_file(fn, d) - - -@pytest.fixture(scope='session') -def sample1_pe_bam(tmpdir_factory): - d = tmpdir_for_func(tmpdir_factory) - fn = 'rnaseq_samples/sample1/sample1.small.paired.sorted.bam' - return _download_file(fn, d) - - -@pytest.fixture(scope='session') -def sample1_se_tiny_bam(tmpdir_factory): - d = tmpdir_for_func(tmpdir_factory) - fn = 'rnaseq_samples/sample1/sample1.tiny.single.sorted.bam' - return _download_file(fn, d) - - -@pytest.fixture(scope='session') -def sample1_pe_tiny_bam(tmpdir_factory): - d = tmpdir_for_func(tmpdir_factory) - fn = 'rnaseq_samples/sample1/sample1.tiny.paired.sorted.bam' - return _download_file(fn, d) - - -@pytest.fixture(scope='session') -def sample1_se_bam_bai(sample1_se_bam, tmpdir_factory): - """ - Returns both the bam and the bam.bai - """ - snakefile = ''' - rule index: - input: bam='sample1.sorted.bam' - output: bai='sample1.sorted.bam.bai' - wrapper: 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - sample1_se_bam: 'sample1.sorted.bam' - - } - ) - tmpdir = str(tmpdir_factory.mktemp('sample1_se_bam_bai')) - run(dpath('../wrappers/samtools/index'), snakefile, None, input_data_func, tmpdir) - return { - 'bam': os.path.join(tmpdir, 'sample1.sorted.bam'), - 'bai': os.path.join(tmpdir, 'sample1.sorted.bam.bai'), - } - - -@pytest.fixture(scope='session') -def sample1_se_tiny_bam_bai(sample1_se_tiny_bam, tmpdir_factory): - """ - Returns both the bam and the bam.bai - """ - snakefile = ''' - rule index: - input: bam='sample1.sorted.bam' - output: bai='sample1.sorted.bam.bai' - wrapper: 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - sample1_se_tiny_bam: 'sample1.sorted.bam' - - } - ) - tmpdir = str(tmpdir_factory.mktemp('sample1_se_tiny_bam_bai')) - run(dpath('../wrappers/samtools/index'), snakefile, None, input_data_func, tmpdir) - return { - 'bam': os.path.join(tmpdir, 'sample1.sorted.bam'), - 'bai': os.path.join(tmpdir, 'sample1.sorted.bam.bai'), - } - -# ---------------------------------------------------------------------------- -# Annotations - -@pytest.fixture(scope='session') -def transcriptome(tmpdir_factory): - d = tmpdir_for_func(tmpdir_factory) - fn = 'seq/dm6.small.transcriptome.fa' - return _download_file(fn, d) - - -@pytest.fixture(scope='session') -def dm6_fa(tmpdir_factory): - fn = 'seq/dm6.small.fa' - d = tmpdir_for_func(tmpdir_factory) - return _download_file(fn, d) - - -@pytest.fixture(scope='session') -def annotation(tmpdir_factory): - fn = 'annotation/dm6.small.gtf' - d = tmpdir_for_func(tmpdir_factory) - return _download_file(fn, d) - - -@pytest.fixture(scope='session') -def annotation_refflat(tmpdir_factory): - fn = 'annotation/dm6.small.refflat' - d = tmpdir_for_func(tmpdir_factory) - return _download_file(fn, d) - - -@pytest.fixture(scope='session') -def annotation_db(annotation): - import gffutils - 
gffutils.create_db( - data=annotation, dbfn=annotation + '.db', - merge_strategy='merge', - id_spec={'transcript': ['transcript_id', 'transcript_symbol'], - 'gene': ['gene_id', 'gene_symbol']}, - gtf_transcript_key='transcript_id', - gtf_gene_key='gene_id') - return annotation + '.db' - - -@pytest.fixture(scope='session') -def annotation_bed12(annotation_db): - import gffutils - db = gffutils.FeatureDB(annotation_db) - bed12 = '.'.join(annotation_db.strip().split('.')[:-2]) + '.bed12' - with open(bed12, 'w') as handle: - for t in db.features_of_type('transcript'): - handle.write(db.bed12(t, name_field='transcript_id') + '\n') - return bed12 diff --git a/wrappers/test/test_atropos.py b/wrappers/test/test_atropos.py deleted file mode 100644 index f695202ee..000000000 --- a/wrappers/test/test_atropos.py +++ /dev/null @@ -1,156 +0,0 @@ -import pytest -import os -import gzip -from utils import run, dpath, symlink_in_tempdir - - -def test_atropos_simple(sample1_se_tiny_fq, tmpdir): - snakefile = ''' - rule atropos: - input: - fastq='sample1_R1.fastq.gz' - output: - fastq='sample1_R1.trim.fastq.gz' - params: extra='-a AAA' - threads: 2 - wrapper: "file:wrapper" - ''' - input_data_func = symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1_R1.fastq.gz' - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - a = sum(1 for _ in gzip.open('sample1_R1.fastq.gz')) - b = sum(1 for _ in gzip.open('sample1_R1.trim.fastq.gz')) - assert a == b == 4040 - assert os.path.getsize('sample1_R1.fastq.gz') != os.path.getsize('sample1_R1.trim.fastq.gz') - - run(dpath('../wrappers/atropos'), snakefile, check, input_data_func, tmpdir, cores=2) - - -def test_atropos_simple_with_log(sample1_se_tiny_fq, tmpdir): - snakefile = ''' - rule atropos: - input: - fastq='sample1_R1.fastq.gz' - output: - fastq='sample1_R1.trim.fastq.gz' - params: extra='-a AAA' - threads: 2 - log: 'sample1.atropos.log' - wrapper: "file:wrapper" - ''' - input_data_func = symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1_R1.fastq.gz' - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - a = sum(1 for _ in gzip.open('sample1_R1.fastq.gz')) - b = sum(1 for _ in gzip.open('sample1_R1.trim.fastq.gz')) - assert a == b == 4040 - assert 'This is Atropos' in open('sample1.atropos.log').readline() - assert os.path.getsize('sample1_R1.fastq.gz') != os.path.getsize('sample1_R1.trim.fastq.gz') - - run(dpath('../wrappers/atropos'), snakefile, check, input_data_func, tmpdir, cores=2) - - -def test_atropos_se_with_list(sample1_se_tiny_fq, tmpdir): - snakefile = ''' - rule atropos: - input: 'sample1_R1.fastq.gz' - output: 'sample1_R1.trim.fastq.gz' - params: extra='-a AAA' - threads: 2 - wrapper: "file:wrapper" - ''' - input_data_func = symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1_R1.fastq.gz' - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - a = sum(1 for _ in gzip.open('sample1_R1.fastq.gz')) - b = sum(1 for _ in gzip.open('sample1_R1.trim.fastq.gz')) - assert a == b == 4040 - assert os.path.getsize('sample1_R1.fastq.gz') != os.path.getsize('sample1_R1.trim.fastq.gz') - - run(dpath('../wrappers/atropos'), snakefile, check, input_data_func, tmpdir, cores=2) - - -def test_atropos_pe(sample1_pe_tiny_fq, tmpdir): - snakefile = ''' - rule atropos: - input: - R1='sample1_R1.fastq.gz', - R2='sample1_R2.fastq.gz', - output: - R1='sample1_R1.trim.fastq.gz', - R2='sample2_R1.trim.fastq.gz', - 
params: extra='-a AAA' - threads: 2 - log: 'sample1.atropos.log' - wrapper: "file:wrapper" - ''' - input_data_func = symlink_in_tempdir( - { - sample1_pe_tiny_fq[0]: 'sample1_R1.fastq.gz', - sample1_pe_tiny_fq[1]: 'sample1_R2.fastq.gz', - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - a = sum(1 for _ in gzip.open('sample1_R1.fastq.gz')) - b = sum(1 for _ in gzip.open('sample1_R1.trim.fastq.gz')) - assert a == b == 4040 - assert 'This is Atropos' in open('sample1.atropos.log').readline() - assert os.path.getsize('sample1_R1.fastq.gz') != os.path.getsize('sample1_R1.trim.fastq.gz') - - run(dpath('../wrappers/atropos'), snakefile, check, input_data_func, tmpdir, cores=2) - - -def test_atropos_pe_with_list(sample1_pe_tiny_fq, tmpdir): - - snakefile = ''' - rule atropos: - input: 'sample1_R1.fastq.gz', 'sample1_R2.fastq.gz', - output: 'sample1_R1.trim.fastq.gz', 'sample2_R1.trim.fastq.gz', - params: extra='-a AAA' - threads: 2 - log: 'sample1.atropos.log' - wrapper: "file:wrapper" - ''' - input_data_func = symlink_in_tempdir( - { - sample1_pe_tiny_fq[0]: 'sample1_R1.fastq.gz', - sample1_pe_tiny_fq[1]: 'sample1_R2.fastq.gz', - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - a = sum(1 for _ in gzip.open('sample1_R1.fastq.gz')) - b = sum(1 for _ in gzip.open('sample1_R1.trim.fastq.gz')) - assert a == b == 4040 - assert 'This is Atropos' in open('sample1.atropos.log').readline() - assert os.path.getsize('sample1_R1.fastq.gz') != os.path.getsize('sample1_R1.trim.fastq.gz') - - run(dpath('../wrappers/atropos'), snakefile, check, input_data_func, tmpdir, cores=2) diff --git a/wrappers/test/test_bowtie2.py b/wrappers/test/test_bowtie2.py deleted file mode 100644 index 6ee9b76f1..000000000 --- a/wrappers/test/test_bowtie2.py +++ /dev/null @@ -1,95 +0,0 @@ -import os -import pytest -from snakemake.shell import shell -from lcdblib.snakemake import aligners -from utils import run, dpath, symlink_in_tempdir, tmpdir_for_func - - -@pytest.fixture(scope='session') -def bowtie2_indexes(dm6_fa, tmpdir_factory): - d = tmpdir_for_func(tmpdir_factory) - snakefile = ''' - rule bowtie2: - input: fasta='dm6.fa' - output: index=['dm6.1.bt2', 'dm6.2.bt2'] - log: 'bowtie2.log' - wrapper: 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - dm6_fa: 'dm6.fa' - } - ) - - def check(): - assert 'Total time for backward call to driver' in open('bowtie2.log').readlines()[-1] - assert list(shell('bowtie2-inspect dm6 -n', iterable=True)) == ['2L', '2R'] - - run( - dpath('../wrappers/bowtie2/build'), - snakefile, check, input_data_func, d) - return aligners.bowtie2_index_from_prefix(os.path.join(d, 'dm6')) - - -def _dict_of_bowtie2_indexes(bowtie2_indexes, prefix): - d = {} - indexes = aligners.bowtie2_index_from_prefix(prefix) - bowtie2_indexes = sorted(bowtie2_indexes) - indexes = sorted(indexes) - for k, v in zip(bowtie2_indexes, indexes): - d[k] = v - return d - - -def test_bowtie2_align_se(bowtie2_indexes, sample1_se_tiny_fq, tmpdir): - d = _dict_of_bowtie2_indexes(bowtie2_indexes, 'dm6') - indexes = list(d.values()) - snakefile = ''' - rule bowtie2_align: - input: - fastq='sample1_R1.fastq.gz', - index={indexes} - output: - bam='sample1.bam' - log: "bowtie2.log" - wrapper: "file:wrapper" - '''.format(indexes=indexes) - d[sample1_se_tiny_fq] = 'sample1_R1.fastq.gz' - input_data_func = symlink_in_tempdir(d) - - def check(): - assert "overall alignment rate" in open('bowtie2.log').read() - - # should 
have at least some mapped and unmapped - assert int(list(shell('samtools view -c -f 0x04 sample1.bam', iterable=True))[0]) > 0 - assert int(list(shell('samtools view -c -F 0x04 sample1.bam', iterable=True))[0]) > 0 - - run(dpath('../wrappers/bowtie2/align'), snakefile, check, input_data_func, tmpdir) - - -def test_bowtie2_align_se_rm_unmapped(bowtie2_indexes, sample1_se_tiny_fq, tmpdir): - d = _dict_of_bowtie2_indexes(bowtie2_indexes, 'dm6') - indexes = list(d.values()) - snakefile = ''' - rule bowtie2_align: - input: - fastq='sample1_R1.fastq.gz', - index={indexes} - output: - bam='sample1.bam' - params: - samtools_view_extra='-F 0x04' - log: "bowtie2.log" - wrapper: "file:wrapper" - '''.format(indexes=indexes) - d[sample1_se_tiny_fq] = 'sample1_R1.fastq.gz' - input_data_func = symlink_in_tempdir(d) - - def check(): - assert "overall alignment rate" in open('bowtie2.log').read() - - # should have at least some mapped and unmapped - assert int(list(shell('samtools view -c -f 0x04 sample1.bam', iterable=True))[0]) == 0 - assert int(list(shell('samtools view -c -F 0x04 sample1.bam', iterable=True))[0]) > 0 - - run(dpath('../wrappers/bowtie2/align'), snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_cutadapt.py b/wrappers/test/test_cutadapt.py deleted file mode 100644 index 97f5c7f36..000000000 --- a/wrappers/test/test_cutadapt.py +++ /dev/null @@ -1,151 +0,0 @@ -import os -import gzip -from utils import run, dpath, rm, symlink_in_tempdir - -def test_cutadapt_simple(sample1_se_tiny_fq, tmpdir): - snakefile = ''' - rule cutadapt: - input: - fastq='sample1_R1.fastq.gz' - output: - fastq='sample1_R1.trim.fastq.gz' - params: extra='-a AAA' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1_R1.fastq.gz' - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - a = sum(1 for _ in gzip.open('sample1_R1.fastq.gz')) - b = sum(1 for _ in gzip.open('sample1_R1.trim.fastq.gz')) - assert a == b == 4040 - - assert os.path.getsize('sample1_R1.fastq.gz') != os.path.getsize('sample1_R1.trim.fastq.gz') - - run(dpath('../wrappers/cutadapt'), snakefile, check, input_data_func, tmpdir) - - -def test_cutadapt_simple_with_log(sample1_se_tiny_fq, tmpdir): - snakefile = ''' - rule cutadapt: - input: - fastq='sample1_R1.fastq.gz' - output: - fastq='sample1_R1.trim.fastq.gz' - params: extra='-a AAA' - log: 'sample1.cutadapt.log' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1_R1.fastq.gz' - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - a = sum(1 for _ in gzip.open('sample1_R1.fastq.gz')) - b = sum(1 for _ in gzip.open('sample1_R1.trim.fastq.gz')) - assert a == b == 4040 - assert 'This is cutadapt' in open('sample1.cutadapt.log').readline() - - assert os.path.getsize('sample1_R1.fastq.gz') != os.path.getsize('sample1_R1.trim.fastq.gz') - - run(dpath('../wrappers/cutadapt'), snakefile, check, input_data_func, tmpdir) - - -def test_cutadapt_se_with_list(sample1_se_tiny_fq, tmpdir): - snakefile = ''' - rule cutadapt: - input: 'sample1_R1.fastq.gz' - output: 'sample1_R1.trim.fastq.gz' - params: extra='-a AAA' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1_R1.fastq.gz' - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - a = sum(1 for _ in gzip.open('sample1_R1.fastq.gz')) - 
b = sum(1 for _ in gzip.open('sample1_R1.trim.fastq.gz')) - assert a == b == 4040 - - assert os.path.getsize('sample1_R1.fastq.gz') != os.path.getsize('sample1_R1.trim.fastq.gz') - - run(dpath('../wrappers/cutadapt'), snakefile, check, input_data_func, tmpdir) - -def test_cutadapt_pe(sample1_pe_tiny_fq, tmpdir): - snakefile = ''' - rule cutadapt: - input: - R1='sample1_R1.fastq.gz', - R2='sample1_R2.fastq.gz', - output: - R1='sample1_R1.trim.fastq.gz', - R2='sample2_R1.trim.fastq.gz', - params: extra='-a AAA' - log: 'sample1.cutadapt.log' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_pe_tiny_fq[0]: 'sample1_R1.fastq.gz', - sample1_pe_tiny_fq[1]: 'sample1_R2.fastq.gz', - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - a = sum(1 for _ in gzip.open('sample1_R1.fastq.gz')) - b = sum(1 for _ in gzip.open('sample1_R1.trim.fastq.gz')) - assert a == b == 4040 - assert 'This is cutadapt' in open('sample1.cutadapt.log').readline() - - assert os.path.getsize('sample1_R1.fastq.gz') != os.path.getsize('sample1_R1.trim.fastq.gz') - - run(dpath('../wrappers/cutadapt'), snakefile, check, input_data_func, tmpdir) - -def test_cutadapt_pe_with_list(sample1_pe_tiny_fq, tmpdir): - snakefile = ''' - rule cutadapt: - input: 'sample1_R1.fastq.gz', 'sample1_R2.fastq.gz', - output: 'sample1_R1.trim.fastq.gz', 'sample2_R1.trim.fastq.gz', - params: extra='-a AAA' - log: 'sample1.cutadapt.log' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_pe_tiny_fq[0]: 'sample1_R1.fastq.gz', - sample1_pe_tiny_fq[1]: 'sample1_R2.fastq.gz', - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - a = sum(1 for _ in gzip.open('sample1_R1.fastq.gz')) - b = sum(1 for _ in gzip.open('sample1_R1.trim.fastq.gz')) - assert a == b == 4040 - assert 'This is cutadapt' in open('sample1.cutadapt.log').readline() - - assert os.path.getsize('sample1_R1.fastq.gz') != os.path.getsize('sample1_R1.trim.fastq.gz') - - run(dpath('../wrappers/cutadapt'), snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_deeptools.py b/wrappers/test/test_deeptools.py deleted file mode 100644 index cbf876904..000000000 --- a/wrappers/test/test_deeptools.py +++ /dev/null @@ -1,37 +0,0 @@ -import os -import gzip -from utils import run, dpath, rm, symlink_in_tempdir -import pyBigWig - -def test_deeptools_bamCoverage(sample1_se_tiny_bam, sample1_se_tiny_bam_bai, tmpdir): - snakefile = ''' - rule deeptools: - input: - bam='sample1.bam', - bai='sample1.bam.bai' - output: 'sample1.bw', - log: 'deeptools.log' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_bam: 'sample1.bam', - sample1_se_tiny_bam_bai['bai']: 'sample1.bam.bai', - } - ) - - def check(): - bw = pyBigWig.open('sample1.bw') - header_keys = list(bw.header().keys()) - for k in ['maxVal', 'minVal', 'nBasesCovered', 'nLevels', 'sumData', - 'sumSquared', 'version']: - assert k in header_keys - - # bigWig version should be independent of BAM input, so we can check - # the value - assert bw.header()['version'] == 4 - - first_chrom = list(bw.chroms().keys())[0] - assert isinstance(bw.stats(first_chrom)[0], float) - - run(dpath('../wrappers/deeptools/bamCoverage'), snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_demo.py b/wrappers/test/test_demo.py deleted file mode 100644 index dd7be5ee6..000000000 --- a/wrappers/test/test_demo.py +++ /dev/null @@ -1,159 
+0,0 @@ -# This file demonstrates tests for the `demo` wrapper. It is heavily commented, -# and is included as part of the test suite to ensure that it's correct. - -# The `run` function does most of the work. It creates a tempdir, copies over -# input data, Snakefile, and wrapper, runs the Snakefile, and runs -# a user-provided test function against the output. -from utils import run - - -# The `dpath` function figures out the path the wrapper even when in a tempdir -from utils import dpath - -# `symlink_in_tempdir` is a decorator function that lets us easily map fixtures -# to input files expected by our Snakefile. The examples below will demonstrate -# how it works. -from utils import symlink_in_tempdir - - -# A note on fixtures -# ------------------ -# -# py.test implicitly does a `from conftest import *`, so we will have the -# fixtures from that package available here. -# -# Currently we have the fixtures from raw_data_fixtures.py imported into -# conftest.py, which in turn makes them available in this file. -# -# py.test also includes a built-in `tmpdir` fixture which we use here to have -# a nicely-named tmpdir for running the test. -# -# See http://doc.pytest.org/en/latest/fixture.html for more info. - - -# Our first test. The test function names must start with `test_` in order for -# py.test to find them. -def test_demo(sample1_se_tiny_fq, tmpdir): - - # A note on these arguments - # ------------------------- - # - # Test function arguments are expected to be fixtures. The fixture - # `sample1_se_tiny_fq` will be the path to the downloaded example data. See - # conftest.sample1_se_tiny_fq(). - # - # The fixture `tmpdir` (which comes built-in with py.test) will be - # a py.path.local object pointing to a tempdir created just for this test. - # It will match the glob /tmp/pytest-*, and only the last 3 tempdirs are - # retained. - - # Write the snakefile - # ------------------- - # First we write the Snakefile to use in testing. Inputs need to come from - # fixutres. Write whatever filename you'd like; we'll connect the fixture - # to the written filename below. - # - # `snakefile` is typically a triple-quoted string; it will be automatically - # run through textwrap.dedent later so you don't have to worry about - # indentation. - # - # The wrapper will be copied to a subdirectory of the temp dir called, - # appropriately enough, "wrapper". So your snakefile will generally end - # with the line `wrapper: "file:wrapper"`. - snakefile = ''' - rule demo: - input: 'a.fastq.gz' - output: 'b.fastq.gz' - wrapper: "file:wrapper" - ''' - - # Map fixtures to input files - # --------------------------- - # Next we map the fixture sample1_se_tiny_fq (a temp file which has downloaded - # data from the test data repo into a temp dir) to the input file that our - # Snakefile expects. - # - # Keys are paths to downloaded example data (typically downloaded just once - # per py.test session), which is provided by the fixture. The values of the - # dict are paths relative to the Snakefile and must match what is expected - # by the snakefile. - # - # Technically, `symlink_in_tempdir` returns a function that takes a path as - # its argument and symlinks keys over to values within that path. While - # this seems a little convoluted, doing it this way means that we don't - # have to keep track -- or even care -- what the fixture's provided - # filename is, avoiding the need to keep looking back at the fixtures - # module to remember what the filenames are. 
It keeps the input file setup - # logic tightly coupled to the Snakefile, since they're both defined in the - # same function. - # - # So: since the above snakefile expects a.fastq.gz as input, we need to - # make that happen, like this: - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_fq: 'a.fastq.gz' - } - ) - - # Write a test function - # --------------------- - # This is our test function. It will be called after the Snakefile has been - # run and it will be called in the same temp directory in which the - # Snakefile is run, so paths should be relative to the Snakefile. - # - # This function should not accept any arguments. - # - # In this case, the demo wrapper simply copies input to output, so here we - # assert the files are identical. - def check(): - assert open('a.fastq.gz', 'rb').read() == open('b.fastq.gz', 'rb').read() - - # Call `run()` - # ------------ - # Now that we have defined everything, the `run` function does all of the - # work. Note we pass the `tmpdir` fixture here. - # - # (that's because py.test manages tmpdirs for tests, which are in this - # current module, but run() lives in the utils module which won't get - # nicely managed. But run() needs to know where to build the test case, - # hence the need to pass it here) - run(dpath('../wrappers/demo'), snakefile, check, input_data_func, tmpdir) - - - -# This test function shows how to use downloaded paired-end data from -# a different fixture. -def test_demo_pe(sample1_pe_fq, tmpdir): - - # In contrast to the sample1_se_tiny_fq fixture used in the previous function, - # here the paired-end fixture `sample1_pe_fq` is a tuple of path names (see - # conftest.sample1_pe_fq()) - - - # The snakefile reflects what the wrapper expects for PE (see - # wrappers/demo/README.md). - snakefile = ''' - rule demo: - input: - R1='a1.fastq.gz', - R2='a2.fastq.gz' - output: - R1='b1.fastq.gz', - R2='b2.fastq.gz' - wrapper: "file:wrapper" - ''' - - # Map fixture to input files. Again, since this is paired-end we need to - # make sure both files are provided the right filename for testing. 
- input_data_func=symlink_in_tempdir( - { - sample1_pe_fq[0]: 'a1.fastq.gz', - sample1_pe_fq[1]: 'a2.fastq.gz', - } - ) - - def check(): - assert open('a1.fastq.gz', 'rb').read() == open('b1.fastq.gz', 'rb').read() - assert open('a2.fastq.gz', 'rb').read() == open('b2.fastq.gz', 'rb').read() - - run(dpath('../wrappers/demo'), snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_dupradar.py b/wrappers/test/test_dupradar.py deleted file mode 100644 index 6122bd5ca..000000000 --- a/wrappers/test/test_dupradar.py +++ /dev/null @@ -1,49 +0,0 @@ -import os -import pytest -from test_picard import sample1_se_bam_markdups -from utils import symlink_in_tempdir, run, dpath - - -@pytest.fixture(scope='session') -def sample1_se_dupradar(sample1_se_bam_markdups, annotation, tmpdir_factory): - snakefile = ''' - rule dupradar: - input: - bam='sample1.bam', - annotation='dm6.gtf' - output: - density_scatter='sample1.density_scatter.png', - expression_histogram='sample1.expression_histogram.png', - expression_barplot='sample1.expression_barplot.png', - expression_boxplot='sample1.expression_boxplot.png', - multimapping_histogram='sample1.multimapping_histogram.png', - dataframe='sample1.dupradar.tsv', - model='sample1.model.txt', - curve='sample1.curve.txt' - wrapper: - 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - sample1_se_bam_markdups['bam']: 'sample1.bam', - annotation: 'dm6.gtf', - } - ) - tmpdir = str(tmpdir_factory.mktemp('dupradar_fixture')) - run(dpath('../wrappers/dupradar'), snakefile, None, input_data_func, tmpdir, use_conda=False) - mapping = dict( - density_scatter='sample1.density_scatter.png', - expression_histogram='sample1.expression_histogram.png', - expression_barplot='sample1.expression_barplot.png', - expression_boxplot='sample1.expression_boxplot.png', - multimapping_histogram='sample1.multimapping_histogram.png', - dataframe='sample1.dupradar.tsv', - ) - for k, v in mapping.items(): - mapping[k] = os.path.join(tmpdir, v) - return mapping - - -#@pytest.mark.xfail -def test_dupradar(sample1_se_dupradar): - assert open(sample1_se_dupradar['dataframe']).readline().startswith('"ID"\t"geneLength"') diff --git a/wrappers/test/test_fastq_screen.py b/wrappers/test/test_fastq_screen.py deleted file mode 100644 index 5cae9832c..000000000 --- a/wrappers/test/test_fastq_screen.py +++ /dev/null @@ -1,36 +0,0 @@ -import os -import zipfile -from utils import run, dpath, rm, symlink_in_tempdir -from test_bowtie2 import bowtie2_indexes - -def test_fastq_screen(sample1_se_tiny_fq, bowtie2_indexes, tmpdir): - snakefile = ''' - rule fastq_screen: - input: - fastq='sample1_R1.fastq.gz', - dm6={indexes} - output: - txt='sample1_R1_screen.txt' - params: - subset=100000, - aligner='bowtie2' - wrapper: - "file:wrapper" - '''.format(indexes=bowtie2_indexes) - - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1_R1.fastq.gz' - } - ) - - def check(): - with open('sample1_R1_screen.txt') as fh: - res = fh.readlines() - r1 = res[0].strip().split() - r3 = res[2].strip().split() - assert r1[-1] == '100000' - assert r3[0] == 'dm6' - - - run(dpath('../wrappers/fastq_screen'), snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_fastqc.py b/wrappers/test/test_fastqc.py deleted file mode 100644 index 5df5eda9c..000000000 --- a/wrappers/test/test_fastqc.py +++ /dev/null @@ -1,70 +0,0 @@ -import os -import zipfile -from utils import run, dpath, rm, symlink_in_tempdir - -import pytest -from utils import tmpdir_for_func, _download_file - 
-@pytest.fixture(scope='session') -def fastqc(sample1_se_tiny_fq, tmpdir_factory): - snakefile = ''' - rule fastqc: - input: - fastq='sample1_R1.fastq.gz' - output: - html='sample1_R1_fastqc.html', - zip='sample1_R1_fastqc.zip' - wrapper: "file:wrapper"''' - input_data_func = symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1_R1.fastq.gz' - } - ) - tmpdir = str(tmpdir_factory.mktemp('fastqc_fixture')) - run(dpath('../wrappers/fastqc'), snakefile, None, input_data_func, tmpdir) - return os.path.join(tmpdir, 'sample1_R1_fastqc.zip') - - -def test_fastqc(sample1_se_tiny_fq, tmpdir): - snakefile = ''' - rule fastqc: - input: - fastq='sample1_R1.fastq.gz' - output: - html='results/sample1_R1.html', - zip='sample1_R1.zip' - wrapper: "file:wrapper"''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1_R1.fastq.gz' - } - ) - - def check(): - assert '' in open('results/sample1_R1.html').readline() - contents = [ - 'sample1_R1_fastqc/', - 'sample1_R1_fastqc/Icons/', - 'sample1_R1_fastqc/Images/', - 'sample1_R1_fastqc/Icons/fastqc_icon.png', - 'sample1_R1_fastqc/Icons/warning.png', - 'sample1_R1_fastqc/Icons/error.png', - 'sample1_R1_fastqc/Icons/tick.png', - 'sample1_R1_fastqc/summary.txt', - 'sample1_R1_fastqc/Images/per_base_quality.png', - 'sample1_R1_fastqc/Images/per_tile_quality.png', - 'sample1_R1_fastqc/Images/per_sequence_quality.png', - 'sample1_R1_fastqc/Images/per_base_sequence_content.png', - 'sample1_R1_fastqc/Images/per_sequence_gc_content.png', - 'sample1_R1_fastqc/Images/per_base_n_content.png', - 'sample1_R1_fastqc/Images/sequence_length_distribution.png', - 'sample1_R1_fastqc/Images/duplication_levels.png', - 'sample1_R1_fastqc/Images/adapter_content.png', - 'sample1_R1_fastqc/fastqc_report.html', - 'sample1_R1_fastqc/fastqc_data.txt', - 'sample1_R1_fastqc/fastqc.fo' - ] - for i in zipfile.ZipFile('sample1_R1.zip').namelist(): - assert i in contents - - run(dpath('../wrappers/fastqc'), snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_featurecounts.py b/wrappers/test/test_featurecounts.py deleted file mode 100644 index cb3760f39..000000000 --- a/wrappers/test/test_featurecounts.py +++ /dev/null @@ -1,59 +0,0 @@ -import os -import gzip -from utils import run, dpath, rm, symlink_in_tempdir - -def test_featurecounts_se(sample1_se_tiny_bam, annotation, tmpdir): - snakefile = ''' - rule featurecounts: - input: - annotation='dm6.gtf', - bam='sample1.bam' - output: - counts='sample1.counts', - log: 'featurecounts.log' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_bam: 'sample1.bam', - annotation: 'dm6.gtf', - } - ) - - def check(): - assert '//===================' in open('featurecounts.log').read() - assert '# Program:featureCounts' in open('sample1.counts').readline() - assert open('sample1.counts.summary').readline().startswith('Status') - assert sum(1 for _ in open('sample1.counts')) == 169 - - run(dpath('../wrappers/featurecounts'), snakefile, check, input_data_func, tmpdir) - -def test_featurecounts_pe(sample1_pe_tiny_bam, annotation, tmpdir): - snakefile = ''' - rule featurecounts: - input: - annotation='dm6.gtf', - bam='sample1.bam' - output: - counts='sample1.counts', - log: 'featurecounts.log' - params: extra='-p -P -s 1 -B --splitOnly' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_pe_tiny_bam: 'sample1.bam', - annotation: 'dm6.gtf', - } - ) - - def check(): - assert '//===================' in open('featurecounts.log').read() - assert '# 
Program:featureCounts' in open('sample1.counts').readline() - assert open('sample1.counts.summary').readline().startswith('Status') - assert sum(1 for _ in open('sample1.counts')) == 169 - - # TODO: maybe assert that below a certain level are counted when all - # those extra arguments are used? - - run(dpath('../wrappers/featurecounts'), snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_hisat2.py b/wrappers/test/test_hisat2.py deleted file mode 100644 index add7abb07..000000000 --- a/wrappers/test/test_hisat2.py +++ /dev/null @@ -1,120 +0,0 @@ -import os -import pytest -from snakemake.shell import shell -from lcdblib.snakemake import aligners -from utils import run, dpath, symlink_in_tempdir, tmpdir_for_func - - -@pytest.fixture(scope='session') -def hisat2_indexes(dm6_fa, tmpdir_factory): - d = tmpdir_for_func(tmpdir_factory) - snakefile = ''' - rule hisat2: - input: fasta='2L.fa' - output: index=['2L.1.ht2', '2L.2.ht2'] - log: 'hisat.log' - wrapper: 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - dm6_fa: '2L.fa' - } - ) - - def check(): - assert 'Total time for call to driver' in open('hisat.log').readlines()[-1] - assert list(shell('hisat2-inspect 2L -n', iterable=True)) == ['2L', '2R'] - - run( - dpath('../wrappers/hisat2/build'), - snakefile, check, input_data_func, d) - return aligners.hisat2_index_from_prefix(os.path.join(d, '2L')) - - -def _dict_of_hisat2_indexes(hisat2_indexes, prefix): - d = {} - indexes = aligners.hisat2_index_from_prefix(prefix) - hisat2_indexes = sorted(hisat2_indexes) - indexes = sorted(indexes) - for k, v in zip(hisat2_indexes, indexes): - d[k] = v - return d - - -def test_hisat2_align_se(hisat2_indexes, sample1_se_tiny_fq, tmpdir): - d = _dict_of_hisat2_indexes(hisat2_indexes, '2L') - indexes = list(d.values()) - snakefile = ''' - rule hisat2_align: - input: - fastq='sample1_R1.fastq.gz', - index={indexes} - output: - bam='sample1.bam' - log: "hisat2.log" - wrapper: "file:wrapper" - '''.format(indexes=indexes) - d[sample1_se_tiny_fq] = 'sample1_R1.fastq.gz' - input_data_func = symlink_in_tempdir(d) - - def check(): - assert "overall alignment rate" in open('hisat2.log').read() - - # should have at least some mapped and unmapped - assert int(list(shell('samtools view -c -f 0x04 sample1.bam', iterable=True))[0]) > 0 - assert int(list(shell('samtools view -c -F 0x04 sample1.bam', iterable=True))[0]) > 0 - - run(dpath('../wrappers/hisat2/align'), snakefile, check, input_data_func, tmpdir) - - -def test_hisat2_align_se_SRA(hisat2_indexes, tmpdir): - d = _dict_of_hisat2_indexes(hisat2_indexes, '2L') - indexes = list(d.values()) - snakefile = ''' - rule hisat2_align: - input: - index={indexes} - output: - bam='sample1.bam' - params: hisat2_extra='--sra-acc SRR1990338' - log: "hisat2.log" - wrapper: "file:wrapper" - '''.format(indexes=indexes) - input_data_func = symlink_in_tempdir(d) - - def check(): - assert "overall alignment rate" in open('hisat2.log').read() - - # should have at least some mapped and unmapped - assert int(list(shell('samtools view -c -f 0x04 sample1.bam', iterable=True))[0]) > 0 - assert int(list(shell('samtools view -c -F 0x04 sample1.bam', iterable=True))[0]) > 0 - - run(dpath('../wrappers/hisat2/align'), snakefile, check, input_data_func, tmpdir) - - -def test_hisat2_align_se_rm_unmapped(hisat2_indexes, sample1_se_tiny_fq, tmpdir): - d = _dict_of_hisat2_indexes(hisat2_indexes, '2L') - indexes = list(d.values()) - snakefile = ''' - rule hisat2_align: - input: - fastq='sample1_R1.fastq.gz', - 
index={indexes} - output: - bam='sample1.bam' - params: - samtools_view_extra='-F 0x04' - log: "hisat2.log" - wrapper: "file:wrapper" - '''.format(indexes=indexes) - d[sample1_se_tiny_fq] = 'sample1_R1.fastq.gz' - input_data_func = symlink_in_tempdir(d) - - def check(): - assert "overall alignment rate" in open('hisat2.log').read() - - # should have at least some mapped and unmapped - assert int(list(shell('samtools view -c -f 0x04 sample1.bam', iterable=True))[0]) == 0 - assert int(list(shell('samtools view -c -F 0x04 sample1.bam', iterable=True))[0]) > 0 - - run(dpath('../wrappers/hisat2/align'), snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_kallisto.py b/wrappers/test/test_kallisto.py deleted file mode 100644 index 32e32e1bd..000000000 --- a/wrappers/test/test_kallisto.py +++ /dev/null @@ -1,69 +0,0 @@ -import os -import json -import pytest -import pysam -from snakemake.shell import shell -from lcdblib.snakemake import aligners -from utils import run, dpath, rm, symlink_in_tempdir, tmpdir_for_func - - -@pytest.fixture(scope='session') -def kallisto_index(tmpdir_factory, transcriptome): - d = tmpdir_for_func(tmpdir_factory) - snakefile = ''' - rule kallisto: - input: fasta='transcriptome.fa' - output: index='transcriptome.idx' - log: 'log' - wrapper: 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - transcriptome: 'transcriptome.fa', - } - ) - - def check(): - log = open('log').read() - assert '[build] target deBruijn graph' - - run( - dpath('../wrappers/kallisto/index'), - snakefile, check, input_data_func, d) - return os.path.join(d, 'transcriptome.idx') - - -def test_kallisto_quant(tmpdir, sample1_se_tiny_fq, kallisto_index): - snakefile = ''' - rule kallisto_quant: - input: - fastq='sample1.fq.gz', - index='out/transcriptome.idx' - - params: extra='--single --fragment-length=200 --sd=20' - output: - h5='quant/abundance.h5', - tsv='quant/abundance.tsv', - json='quant/run_info.json', - wrapper: 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1.fq.gz', - kallisto_index: 'out/transcriptome.idx', - } - ) - - def check(): - assert sum(1 for _ in open('quant/abundance.tsv')) == 310 - assert open('quant/abundance.tsv').readline() == ( - 'target_id\tlength\teff_length\test_counts\ttpm\n') - keys = ['call', 'index_version', 'n_bootstraps', 'n_processed', 'n_targets', 'start_time'] - d = json.load(open('quant/run_info.json')) - for k in keys: - assert k in d - - - run( - dpath('../wrappers/kallisto/quant'), - snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_multiqc.py b/wrappers/test/test_multiqc.py deleted file mode 100644 index 8f3618075..000000000 --- a/wrappers/test/test_multiqc.py +++ /dev/null @@ -1,48 +0,0 @@ -import pytest -import os -import gzip -from utils import run, dpath, rm, symlink_in_tempdir -from test_fastqc import fastqc - - -def test_multiqc(fastqc, tmpdir): - snakefile = ''' - rule multiqc: - input: 'results/sample1_R1_fastqc.zip' - output: 'multiqc.html' - log: 'log' - params: - analysis_directory='results' - wrapper: 'file:wrapper' - ''' - input_data_func=symlink_in_tempdir( - { - fastqc: 'results/sample1_R1_fastqc.zip', - } - ) - - def check(): - assert '' in open('multiqc.html').readline() - - run(dpath('../wrappers/multiqc'), snakefile, check, input_data_func, tmpdir) - -def test_multiqc_other_dir(fastqc, tmpdir): - snakefile = ''' - rule multiqc: - input: 'results/sample1_R1_fastqc.zip' - output: 'reports/multiqc.html' - log: 'log' - params: - 
analysis_directory='results' - wrapper: 'file:wrapper' - ''' - input_data_func=symlink_in_tempdir( - { - fastqc: 'results/sample1_R1_fastqc.zip', - } - ) - - def check(): - assert '' in open('reports/multiqc.html').readline() - - run(dpath('../wrappers/multiqc'), snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_picard.py b/wrappers/test/test_picard.py deleted file mode 100644 index 659d116b0..000000000 --- a/wrappers/test/test_picard.py +++ /dev/null @@ -1,116 +0,0 @@ -import pytest -import os -import gzip -from utils import run, dpath, rm, symlink_in_tempdir - - -@pytest.fixture(scope='session') -def sample1_se_bam_markdups(sample1_se_bam, tmpdir_factory): - snakefile = ''' - rule markduplicates: - input: - bam='sample1.bam' - output: - bam='sample1.dupsmarked.bam', - metrics='sample1.dupmetrics.txt' - log: 'log' - wrapper: 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - sample1_se_bam: 'sample1.bam', - } - ) - tmpdir = str(tmpdir_factory.mktemp('markduplicates_fixture')) - run(dpath('../wrappers/picard/markduplicates'), snakefile, None, input_data_func, tmpdir, use_conda=True) - return { - 'bam': os.path.join(tmpdir, 'sample1.dupsmarked.bam'), - 'metrics': os.path.join(tmpdir, 'sample1.dupmetrics.txt') - } - - -def test_markduplicates_se(sample1_se_bam_markdups, tmpdir): - assert open(sample1_se_bam_markdups['metrics']).readline().startswith('##') - - -def test_picard_collectrnaseqmetrics_se(sample1_se_tiny_bam, annotation_refflat, tmpdir): - snakefile = ''' - rule collectrnaseqmetrics: - input: - bam='sample1.bam', - refflat='dm6.refflat', - output: - metrics='sample1.metrics' - log: 'log' - params: - extra="STRAND=NONE", - java_args='-Xmx512m' - wrapper: 'file:wrapper' - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_bam: 'sample1.bam', - annotation_refflat: 'dm6.refflat', - } - ) - - def check(): - assert '## METRICS CLASS' in open('sample1.metrics').read() - - run(dpath('../wrappers/picard/collectrnaseqmetrics'), snakefile, check, input_data_func, tmpdir, use_conda=True) - - -def test_picard_collectrnaseqmetrics_se_plot(sample1_se_tiny_bam, annotation_refflat, tmpdir): - snakefile = ''' - rule collectrnaseqmetrics: - input: - bam='sample1.bam', - refflat='dm6.refflat', - output: - metrics='sample1.metrics', - plot='sample1.pdf' - log: 'log' - params: extra="STRAND=NONE CHART=sample1.pdf" - wrapper: 'file:wrapper' - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_bam: 'sample1.bam', - annotation_refflat: 'dm6.refflat', - } - ) - - def check(): - assert '## METRICS CLASS' in open('sample1.metrics').read() - - run(dpath('../wrappers/picard/collectrnaseqmetrics'), snakefile, check, input_data_func, tmpdir, use_conda=True) - - -@pytest.mark.xfail -def test_picard_collectrnaseqmetrics_too_small_heap(sample1_se_tiny_bam, annotation_refflat, tmpdir): - # set the java vm heap size to 128 bytes which should fail. This tests to - # make sure the java args are making it through to the wrapper. 
- snakefile = ''' - rule collectrnaseqmetrics: - input: - bam='sample1.bam', - refflat='dm6.refflat', - output: - metrics='sample1.metrics' - log: 'log' - params: - extra="STRAND=NONE", - java_args='-Xmx128' - wrapper: 'file:wrapper' - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_bam: 'sample1.bam', - annotation_refflat: 'dm6.refflat', - } - ) - - def check(): - assert '## METRICS CLASS' in open('sample1.metrics').read() - - run(dpath('../wrappers/picard/collectrnaseqmetrics'), snakefile, check, input_data_func, tmpdir, use_conda=True) diff --git a/wrappers/test/test_rseqc.py b/wrappers/test/test_rseqc.py deleted file mode 100644 index d97ae9190..000000000 --- a/wrappers/test/test_rseqc.py +++ /dev/null @@ -1,151 +0,0 @@ -import pytest -import os -import gzip -from utils import run, dpath, rm, symlink_in_tempdir -from textwrap import dedent - -def test_infer_experiment(sample1_se_tiny_bam, annotation_bed12, tmpdir): - snakefile = ''' - rule infer_experiment: - input: - bam='sample1_R1.bam', - bed='dm6.bed12' - output: - txt = 'sample1_R1.infer_experiment.txt' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_bam: 'sample1_R1.bam', - annotation_bed12: 'dm6.bed12' - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - expected = dedent("""\ - This is SingleEnd Data - Fraction of reads failed to determine: - Fraction of reads explained by "++,--": - Fraction of reads explained by "+-,-+":""").splitlines(False) - - with open('sample1_R1.infer_experiment.txt', 'r') as handle: - results = handle.read().strip() - for ex in expected: - assert ex in results - - run(dpath('../wrappers/rseqc/infer_experiment'), snakefile, check, input_data_func, tmpdir, use_conda=True) - - -def test_gB_cov(sample1_se_tiny_bam, sample1_se_tiny_bam_bai, annotation_bed12, tmpdir): - snakefile = ''' - rule geneBody_coverage: - input: - bam='sample1_R1.sort.bam', - bai='sample1_R1.sort.bam.bai', - bed='dm6.bed12' - output: txt='sample1_R1.geneBodyCoverage.txt', - r='sample1_R1.geneBodyCoverage.r', - img='sample1_R1.geneBodyCoverage.pdf', - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_bam: 'sample1_R1.sort.bam', - sample1_se_tiny_bam_bai['bai']: 'sample1_R1.sort.bam.bai', - annotation_bed12: 'dm6.bed12' - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - - # R code - with open('sample1_R1.geneBodyCoverage.r', 'r') as handle: - result = handle.readline().split(' ')[0] - - assert result == 'sample1_R1.sort' - - # text - with open('sample1_R1.geneBodyCoverage.txt', 'r') as handle: - result = handle.readlines()[1].split('\t')[0] - - assert result == 'sample1_R1.sort' - - # PDF - assert os.path.exists('sample1_R1.geneBodyCoverage.pdf') - - run(dpath('../wrappers/rseqc/geneBody_coverage'), snakefile, check, input_data_func, tmpdir, use_conda=True) - - -def test_gB_cov_png(sample1_se_tiny_bam, sample1_se_tiny_bam_bai, annotation_bed12, tmpdir): - snakefile = ''' - rule geneBody_coverage: - input: - bam='sample1_R1.sort.bam', - bai='sample1_R1.sort.bam.bai', - bed='dm6.bed12' - output: - txt='sample1_R1.geneBodyCoverage.txt', - r='sample1_R1.geneBodyCoverage.r', - img='sample1_R1.geneBodyCoverage.png', - params: - extra: = '-f png' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_bam: 'sample1_R1.sort.bam', - sample1_se_tiny_bam_bai['bai']: 'sample1_R1.sort.bam.bai', - 
annotation_bed12: 'dm6.bed12' - } - ) - - def check(): - """ Check that the PNG is created """ - assert os.path.exists('sample1_R1.geneBodyCoverage.png') - - -@pytest.mark.skip -def test_tin(sample1_se_tiny_bam, sample1_se_tiny_bam_bai, annotation_bed12, tmpdir): - snakefile = ''' - rule tin: - input: - bam='sample1_R1.sort.bam', - bai='sample1_R1.sort.bam.bai', - bed='dm6.bed12' - output: table='sample1_R1.tin.tsv', - summary='sample1_R1.tin.summary.txt' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_bam: 'sample1_R1.sort.bam', - sample1_se_tiny_bam_bai['bai']: 'sample1_R1.sort.bam.bai', - annotation_bed12: 'dm6.bed12' - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - - # R code - with open('sample1_R1.tin.tsv', 'r') as handle: - result = handle.readline().strip().split('\t') - - assert result == ['geneID', 'chrom', 'tx_start', 'tx_end', 'TIN'] - - # text - with open('sample1_R1.tin.summary.txt', 'r') as handle: - result = handle.readline().strip().split('\t') - - assert result == ['Bam_file', 'TIN(mean)', 'TIN(median)', 'TIN(stdev)'] - - run(dpath('../wrappers/rseqc/tin'), snakefile, check, input_data_func, tmpdir, use_conda=True) - diff --git a/wrappers/test/test_salmon.py b/wrappers/test/test_salmon.py deleted file mode 100644 index 2e3796fa5..000000000 --- a/wrappers/test/test_salmon.py +++ /dev/null @@ -1,83 +0,0 @@ -import os -import pytest -from snakemake.shell import shell -from utils import run, dpath, rm, symlink_in_tempdir, tmpdir_for_func - - -@pytest.fixture(scope='session') -def salmon_index(tmpdir_factory, transcriptome): - d = tmpdir_for_func(tmpdir_factory) - snakefile = ''' - rule salmon: - input: fasta='transcriptome.fa' - output: hash='salmon_index/hash.bin' - log: 'log' - wrapper: 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - transcriptome: 'transcriptome.fa', - } - ) - - def check(): - log = open('log').read() - assert '[info] done building index' in log - - run( - dpath('../wrappers/salmon/index'), - snakefile, check, input_data_func, d) - return os.path.join(d, 'salmon_index') - - -def test_salmon_quant(tmpdir, sample1_se_tiny_fq, salmon_index): - snakefile = ''' - rule salmon_quant: - input: - unmatedReads='sample1.fq.gz', - index=['idx/hash.bin', 'idx/sa.bin'] - output: 'sample1/salmon/quant.sf' - params: extra='--libType A' - log: 'salmon.quant.log' - wrapper: 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1.fq.gz', - salmon_index: 'idx', - } - ) - - def check(): - assert open('sample1/salmon/quant.sf').readline() == ( - 'Name\tLength\tEffectiveLength\tTPM\tNumReads\n') - - run( - dpath('../wrappers/salmon/quant'), - snakefile, check, input_data_func, tmpdir) - -def test_salmon_quant_single_index(tmpdir, sample1_se_tiny_fq, salmon_index): - snakefile = ''' - rule salmon_quant: - input: - unmatedReads='sample1.fq.gz', - index='idx/hash.bin' - output: 'sample1/salmon/quant.sf' - params: extra='--libType A' - log: 'salmon.quant.log' - wrapper: 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1.fq.gz', - salmon_index: 'idx', - } - ) - - def check(): - assert open('sample1/salmon/quant.sf').readline() == ( - 'Name\tLength\tEffectiveLength\tTPM\tNumReads\n') - - run( - dpath('../wrappers/salmon/quant'), - snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_samtools.py b/wrappers/test/test_samtools.py deleted file mode 100644 index 
51ff105af..000000000 --- a/wrappers/test/test_samtools.py +++ /dev/null @@ -1,12 +0,0 @@ -import subprocess as sp -import pytest -from snakemake import shell - - -def test_samtools_sort_and_index(sample1_se_tiny_bam, sample1_se_tiny_bam_bai): - """ - This test is primarily a trigger for the fixtures. - """ - with pytest.raises(sp.CalledProcessError): - shell('samtools view {sample1_se_tiny_bam} 2L:1-100') - shell('samtools view {sample1_se_tiny_bam_bai[bam]} 2L:1-100') diff --git a/wrappers/test/utils.py b/wrappers/test/utils.py deleted file mode 100644 index 74dd396bc..000000000 --- a/wrappers/test/utils.py +++ /dev/null @@ -1,152 +0,0 @@ -""" -Stripped-down version of Snakemake's test framework. -""" - -import sys -import os -from textwrap import dedent -import subprocess as sp -import tempfile -import hashlib -import urllib -import shutil -import shlex -import inspect - -import pytest -from snakemake import snakemake -from snakemake.shell import shell -from snakemake.utils import makedirs - - -SCRIPTPATH = shutil.which('snakemake') - -# test data url -URL = 'https://github.com/lcdb/lcdb-test-data/blob/add-chipseq/data/{}?raw=true' - - -def tmpdir_for_func(factory): - caller = inspect.stack()[1][3] - return str(factory.mktemp(caller)) - - -def _download_file(fn, d): - """ - Intended to be called from a pytest.fixture function. - - `fn` is a path to a file that is used to fill in `URL`. `d` is a tempdir - likely created by the calling function to which the file will be - downloaded. - - The path to the downloaded file is returned. - """ - url = URL.format(fn) - dest = os.path.join(d, fn) - makedirs(os.path.dirname(dest)) - basename = os.path.basename(fn) - shell('wget -q -O- {url} > {dest}') - return dest - - -def dpath(path): - "path relative to this file" - return os.path.realpath(os.path.join(os.path.dirname(__file__), path)) - - -def md5sum(filename): - data = open(filename, 'rb').read() - return hashlib.md5(data).hexdigest() - - -def run(path, snakefile, check=None, input_data_func=None, tmpdir=None, use_conda=False, **params): - """ - Parameters - ---------- - - path : str - Path to a wrapper directory. - - snakefile : str - Contents of a snakefile. `dedent()` will be run on it. - - check : callable or None - After running the snakefile on the input data, this function will be - called while inside the directory. This function is where the actual - tests (assertions etc) should be performed. - - If None, the snakefile will be run but no tests will be performed on - the output. - - input_data_func : None | callable - If not None, then this callable object will be called with - a single argument corresponding to the temp directory. It will be - called after the wrapper and test-case contents have been copied to the - temp dir, but before the test is run. It is expected to create any data - required in whatever directory structure is required. 
- - tmpdir : None or path - - """ - # store any tempdirs here for later deletion - to_clean_up = [] - - - if tmpdir is None: - tmpdir = tempfile.mkdtemp(prefix='.test', dir=os.path.abspath('.')) - else: - tmpdir = str(tmpdir) - try: - # copy over the wrapper - wrapper_dir = os.path.join(tmpdir, 'wrapper') - os.makedirs(wrapper_dir) - cmds = ( - 'find {} -maxdepth 1 -type f -print0 | xargs -0 cp -t {}' - .format(shlex.quote(path), shlex.quote(wrapper_dir)) - ) - sp.call(cmds, shell=True) - - # write the snakefile, filling in the "wrapper" placeholder - with open(os.path.join(tmpdir, 'Snakefile'), 'w') as fout: - fout.write('shell.executable("/bin/bash")\n') - fout.write(dedent(snakefile)) - - # Create the input data - input_data_func(tmpdir) - - success = snakemake(os.path.join(tmpdir, 'Snakefile'), workdir=tmpdir, stats='stats.txt', - snakemakepath=SCRIPTPATH, config={}, use_conda=use_conda, **params) - assert success, 'expected successful execution' - - # Change to the tmpdir and run the test function - if check is not None: - cwd = os.getcwd() - os.chdir(tmpdir) - check() - os.chdir(cwd) - - finally: - for t in to_clean_up: - shutil.rmtree(t) - #shutil.rmtree(tmpdir) - - -def symlink_in_tempdir(mapping): - """ - Returns a function that can be used for the `input_data_func` to utils.run. - - `mapping` is a dict where keys are 'target' and values are 'linkname'. - - It will symlink the data downloaded by the fixture into the temp dir - created for the test case. - """ - def _wrapped(tmpdir): - for k, v in mapping.items(): - _linkname = os.path.join(tmpdir, v) - _target = k - _linkdir = os.path.dirname(_linkname) - shell('mkdir -p {_linkdir} && ln -s {_target} {_linkname}') - return _wrapped - - -def rm(path): - shutil.rmtree(path) diff --git a/wrappers/test_toy.py b/wrappers/test_toy.py deleted file mode 100644 index a8e63a129..000000000 --- a/wrappers/test_toy.py +++ /dev/null @@ -1,100 +0,0 @@ -import os -from textwrap import dedent -import pytest -import utils - -# Each module has a config dict -config = dict() - - -def generic_fixture(key, mapping, factory): - """ - Tries to handle as much of the magic as possible. - - Parameters - ---------- - key : str - Key into the module-level config dict - - mapping : dict - Maps paths from fixtures to input files expected by the snakefile - - tmpdir : str - Path to temporary dir, usually created by utils.tmpdir_for_func - - Returns - ------- - After a successful Snakemake run, returns the dictionary of the config's - `output` key but with paths fixed to be relative to tmpdir. This returned - dict is ready to be used as a fixture by test functions. - """ - conf = config[key] - tmpdir = utils.tmpdir_for_func(factory) - input_data_func = utils.symlink_in_tempdir(mapping) - utils.run(utils.dpath(conf['wrapper']), conf['snakefile'], None, input_data_func, tmpdir) - output = conf['output'].copy() - for k, v in output.items(): - output[k] = os.path.join(tmpdir, v) - return output - - -# In order for the doc generation to find this config info without re-running -# all tests, it needs to be in the module-level dict. It similarly can't be -# added during the fixture function's runtime. -# -# However, the mapping and tmpdir must be provided by the function, so the -# config and the function are tightly coupled. -# -# So we add the item to the dictionary here, right above the function that will -# be using it to keep them tightly coupled in the file. 
-config['hisat2_index'] = dict( - description="Basic example of generating a hisat2 index", - wrapper="../wrappers/hisat2/build", - snakefile=""" - rule hisat2_build: - input: - fasta="2L.fa" - output: - index=expand("hisat2_index/assembly.{n}.ht2", n=range(1,9)) - log: "hisat.log" - wrapper: "file://wrapper" - """, - output={'prefix': 'hisat2_index/assembly'} -) - - -# All the hard work is done in the config and in generic_fixture(). Now we just -# need to set up the correct mapping of fixtures to input files. -@pytest.fixture(scope='module') -def hisat2_index(tmpdir_factory, dm6_fa): - mapping = {dm6_fa: '2L.fa'} - return generic_fixture('hisat2_index', mapping, tmpdir_factory) - -# The actual test. -def test_index(hisat2_index): - assert os.path.exists(hisat2_index['prefix'] + '.1.ht2') - - -def extract_examples_for_wrapper(wrapper): - """ - Returns the examples for the wrapper in markdown format. - - Parameters - ---------- - wrapper : str - Expected to be the value of one of the config dict's `wrapper` keys. - """ - markdown = [] - for k, v in config.items(): - if v['wrapper'] != wrapper: - continue - snakefile = dedent(v['snakefile']) - markdown.append( - dedent( - """ - {} - - ```python""".format(v['description']))) - markdown.append(snakefile) - markdown.append("```") - return "\n".join(markdown) diff --git a/wrappers/wrappers/atropos/README.md b/wrappers/wrappers/atropos/README.md deleted file mode 100644 index 56b28b18e..000000000 --- a/wrappers/wrappers/atropos/README.md +++ /dev/null @@ -1,167 +0,0 @@ -# Wrapper for atropos -[Atropos](https://atropos.readthedocs.io/en/latest/index.html) is a fork of -[Cutadapt](http://cutadapt.readthedocs.io/en/stable/index.html) which finds and -removes adapter sequences, primers, poly-A tails and other types of unwanted -sequence from your high-throughput sequencing reads. - -# Examples - -Minimal usage: - -``` -rule atropos: - input: fastq='{sample}.fastq' - output: fastq='{sample}.trim.fastq' - threads: 4 - wrapper: - "file://path/to/atropos" -``` - -Use an adapters file and quality-trim reads to Q20: - -``` -rule atropos: - input: fastq='{sample}.fastq' - output: fastq='{sample}.trim.fastq' - params: extra="-a file:adapters.fa -q 20" - threads: 4 - wrapper: - "file://path/to/atropos" -``` - -Optionally provide the adapters file as input in order to trigger a re-run if -it has changed. The wrapper only pays attention to `input.fastq`, so adding -another key doesn't affect the wrapper: - -``` -rule atropos: - input: - fastq='{sample}.fastq', - adapters='adapters.fa' - output: fastq='{sample}.trim.fastq' - params: extra="-a file:adapters.fa -q 20" - threads: 4 - wrapper: - "file://path/to/atropos" -``` - -Example of how to use with other output files. Since the wrapper only pays -attention to `output.fastq`, so other output files can be indicated but their -filenames have to be indicated in `params.`: - -``` -rule atropos: - input: - fastq='{sample}.fastq', - adapters='adapters.fa' - output: - fastq='{sample}.trim.fastq', - short='{sample}.trim.too-short.fastq', - untrimmed='{sample}.untrimmed.fastq', - params: - extra=( - "-a file:adapters.fa " - "-q 20 " - "--too-short-output={sample}.trim.too-short.fastq " - "--untrimmed-output={sample}.untrimmed.fastq" - ) - threads: 4 - wrapper: - "file://path/to/atropos" -``` - -You can also run in pair-end mode. 
- -``` -rule atropos: - input: - R1='{sample}_r1.fastq', - R2='{sample}_r2.fastq', - adapters='adapters.fa' - output: - R1='{sample}_r1.trim.fastq', - R2='{sample}_r2.trim.fastq' - params: extra="-a file:adapters.fa -A file:adapters.fa -q 20" - threads: 4 - wrapper: - "file://path/to/atropos" -``` - - -## Input - -All inputs are FASTQ files, and they can be optionally gzipped. - -### Single-end mode: - -fastq : single-end FASTQ file - -### Paired-end mode: - -R1 : Read 1 FASTQ -R2 : Read 2 FASTQ - -See examples above for other input options including adapters. - -## Output - -### Single-end mode: - -fastq : Trimmed FASTQ file. - -### Paired-end mode: - -R1 : trimmed R1 FASTQ file -R2 : trimmed R2 FASTQ file - -See examples above for other output options. - -## Log -If a log file is specified, stdout and stderr will be captured there. - -## Threads -One improvement of atropos over cutadapt is the ability to use threads which -are passed to the `-T` option. - -## Params -Additional parameters can be passed to atropos verbatim by supplying a string -in `params.extra`. - - -## Notes - -To dynamically select PE or SE without using `dynamic` support in snakemake, -you can use a PHONY rule and use a function for `params.R2`, like in this -example: - -```python -def _input_func_atropos(wildcards): - """Determine if the sample is PE or SE""" - flags = some function to pull in se or pe info - if 'PE' in flags: - return {'R1': expand(fastqs['r1'], **wildcards)[0], 'R2': expand(fastqs['r2'], **wildcards)[0]} - else: - return {'R1': expand(fastqs['r1'], **wildcards)[0]} - -def _params_r2_atropos(wildcards): - """function to make temp R2 if pe.""" - flags = some function to pull in se or pe info - if 'PE' in flags: - return expand(patterns['atropos']['r2'], **wildcards)[0] + '.tmp.gz' - else: - return None - -rule atropos: - input: unpack(_input_func_atropos) - output: R1=temp(patterns['atropos']['r1']) - params: R2=_params_r2_atropos - threads: 8 - wrapper: wrapper_for('atropos') - -rule atropos_phony: - input: rules.atropos.output - output: temp(patterns['atropos']['r2']) - shell: """ - mv {output[0]}.tmp.gz {output[0]} - """ -``` diff --git a/wrappers/wrappers/atropos/environment.yaml b/wrappers/wrappers/atropos/environment.yaml deleted file mode 100644 index 314bcf2c4..000000000 --- a/wrappers/wrappers/atropos/environment.yaml +++ /dev/null @@ -1,4 +0,0 @@ -channels: - - bioconda -dependencies: - - atropos ==1.1.5 diff --git a/wrappers/wrappers/atropos/wrapper.py b/wrappers/wrappers/atropos/wrapper.py deleted file mode 100644 index b6af4311b..000000000 --- a/wrappers/wrappers/atropos/wrapper.py +++ /dev/null @@ -1,80 +0,0 @@ -__author__ = "Ryan Dale" -__copyright__ = "Copyright 2016, Ryan Dale" -__email__ = "dalerr@niddk.nih.gov" -__license__ = "MIT" - -from snakemake.shell import shell - -extra = snakemake.params.get('extra', '') -log = snakemake.log_fmt_shell() -inputs = snakemake.input -outputs = snakemake.output - -if isinstance(inputs, dict) and isinstance(outputs, dict): - # Get inputs - in_R1 = inputs.get('R1', None) - in_R2 = inputs.get('R2', None) - in_FASTQ = inputs.get('fastq', None) - - if (in_R1 is None) and (in_FASTQ is not None): - in_R1 = in_FASTQ - elif (in_R1 is None) and (in_FASTQ is None): - raise KeyError('If providing a dictionary for input/output, you must use either ' - '`R1` or `fastq` for the first read. 
If providing a second read you must use `R2`.') - - # Get outputs - out_R1 = outputs.get('R1', None) - out_R2 = outputs.get('R2', snakemake.params.get('R2', None)) - out_FASTQ = outputs.get('fastq', None) - - if (out_R1 is None) and (out_FASTQ is not None): - out_R1 = out_FASTQ - elif (out_R1 is None) and (out_FASTQ is None): - raise KeyError('If providing a dictionary for input/output, you must uese either ' - '`R1` or `fastq` for the first read. If providing a second read you must use `R2`.') - -elif isinstance(inputs, list) and isinstance(outputs, list): - # Get inputs - if len(inputs) == 1: - in_R1 = inputs[0] - in_R2 = None - elif len(inputs) == 2: - in_R1 = sorted(inputs)[0] - in_R2 = sorted(inputs)[1] - else: - raise IndexError("If providing a list for input/output, they must have either 1 or 2 values.") - - # Get outputs - if len(outputs) == 1: - out_R1 = outputs[0] - out_R2 = snakemake.params.get('R2', None) - elif len(outputs) == 2: - out_R1 = sorted(outputs)[0] - out_R2 = sorted(outputs)[1] - else: - raise IndexError("If providing a list for input/output, they must have either 1 or 2 values.") - -# Run paired end if both in_R2 and out_R2 are provided -if (in_R2 is not None) and (out_R2 is not None): - shell( - "atropos trim " - "--threads {snakemake.threads} " - "{extra} " - "-pe1 {in_R1} " - "-pe2 {in_R2} " - "-o {out_R1} " - "-p {out_R2} " - "{log}" - ) -elif (in_R1 is not None) and (out_R1 is not None) and (in_R2 is None) and (out_R2 is None): - shell( - "atropos trim " - "{extra} " - "--threads {snakemake.threads} " - "-se {in_R1} " - "-o {out_R1} " - "{log}" - ) -else: - raise ValueError("Input and Output must match. If you give two value for " - "input you must give two values for output.") diff --git a/wrappers/wrappers/average-bigwigs/README.md b/wrappers/wrappers/average-bigwigs/README.md deleted file mode 100644 index af837c1f0..000000000 --- a/wrappers/wrappers/average-bigwigs/README.md +++ /dev/null @@ -1,75 +0,0 @@ -# Average bigWigs - -Often we'd like to merge multiple bigWigs together for downstream work -(heatmaps, etc) but there's no single tool to do this. This wrapper runs -`bigWigMerge` on the inputs to sum their values, then uses `awk` to divide by -their values and sort the way bedGraphToBigWig wants them. - -The intermediate bedGraph file will be created in ``$TMPDIR``. - -## Examples - -Minimal usage: - -```python -rule average_bigwigs: - input: - bigwigs=[ - 'a.bw', - 'b.bw', - 'c.bw'], - chromsizes='genome.chromsizes' - output: - 'out.bw' - wrapper: - 'file://path/to/wrapper' -``` - -Increase memory used for sorting: - -```python -rule average_bigwigs: - input: - bigwigs=[ - 'a.bw', - 'b.bw', - 'c.bw'], - chromsizes='genome.chromsizes' - output: - 'out.bw' - params: - memory='32G' - wrapper: - 'file://path/to/wrapper' -``` - -Single bigwig just gets symlinked over. - -```python -rule average_bigwigs: - input: - bigwigs='a.bw', - chromsizes='genome.chromsizes' - output: - 'out.bw' - params: - memory='32G' - wrapper: - 'file://path/to/wrapper' -``` - -## Input - -List of bigWig files. - - -## Output - -Single bigWig file created by averaging the inputs - -## Threads -Does not use threads - -## Params - -memory: Passed to `sort` as the `-S` argument. 
diff --git a/wrappers/wrappers/average-bigwigs/environment.yaml b/wrappers/wrappers/average-bigwigs/environment.yaml deleted file mode 100644 index 64dcd1557..000000000 --- a/wrappers/wrappers/average-bigwigs/environment.yaml +++ /dev/null @@ -1,5 +0,0 @@ -channels: - - bioconda -dependencies: - - ucsc-bigwigmerge - - ucsc-bedgraphtobigwig diff --git a/wrappers/wrappers/average-bigwigs/wrapper.py b/wrappers/wrappers/average-bigwigs/wrapper.py deleted file mode 100644 index 94be840ac..000000000 --- a/wrappers/wrappers/average-bigwigs/wrapper.py +++ /dev/null @@ -1,32 +0,0 @@ -import os, sys -sys.path.append(os.path.abspath('../../')) -from lib import utils -import tempfile -from snakemake.shell import shell -# Inspired by http://wresch.github.io/2014/01/31/merge-bigwig-files.html - -# If memory was supplied, we'll use that for sorting. -if 'memory' in snakemake.params: - mem_arg = '-S {snakemake.params.memory}' -else: - mem_arg = '' - -if len(snakemake.input.bigwigs) == 1: - utils.make_relative_symlink(snakemake.input.bigwigs[0], snakemake.output[0]) - -else: - - # bigWigMerge outputs sum; we need to divide each by n. - f = 1.0 / len(snakemake.input.bigwigs) - - tmp = tempfile.NamedTemporaryFile(delete=False).name - tmpdir = tempfile.gettempdir() - - shell( - 'export LC_ALL=C; ' - 'bigWigMerge {snakemake.input.bigwigs} stdout 2> {snakemake.log} ' - """| awk 'BEGIN{{OFS="\t"}}{{$4={f}*$4; print}}' """ - '| sort {mem_arg} -T {tmpdir} -k1,1 -k2,2n > {tmp} ' - '&& bedGraphToBigWig {tmp} {snakemake.input.chromsizes} ' - '{snakemake.output} &>> {snakemake.log}' - ) diff --git a/wrappers/wrappers/combos/merge_and_dedup/README.md b/wrappers/wrappers/combos/merge_and_dedup/README.md deleted file mode 100644 index b768e7d22..000000000 --- a/wrappers/wrappers/combos/merge_and_dedup/README.md +++ /dev/null @@ -1,66 +0,0 @@ -# Merge and deduplicate - -Merges BAM files and then deduplicates the output. However if only one BAM file -is created, the file is simply symlinked. - -This wrapper is often needed in ChIP-seq to merge technical replicates. The -same fragment could have been sequenced in multiple tech reps, resulting in -duplicate reads in the merged output even though each individual BAM already -had duplicates removed. - -This method has an advantage over merging first and then deduping in separate -rules when we want to retain both individual (per tech rep) deduped BAMs as -well as merged deduped BAMs. Since the deduping has already happened once for -each tech rep, we want to avoid doing so again if no merging happens. - -## Examples - -Minimal usage: - -```python -rule merge_and_dedup: - input: 'a1.bam', 'a2.bam' - output: - bam='a-merged.bam', - metrics='a-merged.bam.metrics' - wrapper: - 'file://path/to/wrapper' -``` - -In the following case, a symlink will be created since no merging needs to be -performed on a single file: - -```python -rule merge_and_dedup: - input: 'a1.bam' - output: - bam='a-merged.bam', - metrics='a-merged.bam.metrics' - wrapper: - 'file://path/to/wrapper' -``` - - -## Input - -Single BAM or list of BAMs. - -## Output - -- `bam`: output bam file -- `metrics`: optional output metrics file. Default is to use - `{snakemake.output.bam}.metrics`. - -## Threads - -Threads are passed to `samtools merge`. 
- -## Params - -- `samtools_merge_extra`: addtional args passed verbatim to `samtools merge` - -- `markduplicates_extra`: addtional args passed verbatim to `markduplicates_extra` - -- `java_args`: passed to MarkDuplicates, often used to provide more memory - (e.g., `-Xmx32g`). Be sure to increase the corresponding rule's memory - resource to account for the additional allocation diff --git a/wrappers/wrappers/combos/merge_and_dedup/environment.yaml b/wrappers/wrappers/combos/merge_and_dedup/environment.yaml deleted file mode 100644 index b3e77ddb9..000000000 --- a/wrappers/wrappers/combos/merge_and_dedup/environment.yaml +++ /dev/null @@ -1,7 +0,0 @@ -channels: - - bioconda - - conda-forge - -dependencies: - - picard - - samtools diff --git a/wrappers/wrappers/demo/README.md b/wrappers/wrappers/demo/README.md deleted file mode 100644 index a87fb3aa3..000000000 --- a/wrappers/wrappers/demo/README.md +++ /dev/null @@ -1,69 +0,0 @@ -# Demo wrapper - -This wrapper demonstrates current best-practices. - -The target audience of the wrapper's README should be yourself six months from -now, under a tight deadline, frantically looking for that rule you wrote so you -can copy/paste into a custom Snakefile. - -Examples should come first. There should be at least a minimal example and -a reasonably complicated example. To be complete you can add links to docs, -a brief description of the tool, and example output. - -This demo wrapper simply copies input files to output files. - -## Examples - -Minimal usage: - -```python -rule demo: - input: 'a.txt' - output: 'b.txt' - wrapper: - 'file://path/to/wrapper' -``` - -"paired-end" usage: - -```python -rule demo: - input: - R1='a1.txt', - R2='a2.txt' - output: - R1='b1.txt', - R2='b2.txt' - wrapper: - 'file://path/to/wrapper' -``` - -## Input - -Input file formats for this wrapper can be anything. - -### Single-end mode: - -Expects a single unnamed input file. - -### Paired-end mode: - -Expects two input files with keys `R1` and `R2`. - -## Output - -Output files are simply copies of input. - -### Single-end mode: - -Expects a single unnamed output file - -### Paired-end mode: - -Expects two output files with keys `R1` and `R2`. - -## Threads -Does not use threads - -## Params -Does not use params diff --git a/wrappers/wrappers/demo/environment.yaml b/wrappers/wrappers/demo/environment.yaml deleted file mode 100644 index f56993b24..000000000 --- a/wrappers/wrappers/demo/environment.yaml +++ /dev/null @@ -1,4 +0,0 @@ -channels: - - defaults -dependencies: - - python=3 diff --git a/wrappers/wrappers/demo/wrapper.py b/wrappers/wrappers/demo/wrapper.py deleted file mode 100644 index 158ce4090..000000000 --- a/wrappers/wrappers/demo/wrapper.py +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env python - -from snakemake.shell import shell - -# All wrappers must be able to handle an optional params.extra. -extra = snakemake.params.get('extra', '') - - -# This lets us handle whether to write to a log file or to write to stdout. -# See snakemake.script.log_fmt_shell for details. -log = snakemake.log_fmt_shell() - - -# This demo shows how to handle paired-end and single-end input data as two -# different cases, depending on whether the rule's input included an "R2" key -# or not. 
-paired_end = ( - 'R1' in snakemake.input.keys() and - 'R2' in snakemake.input.keys() -) - -if paired_end: - shell('cp {snakemake.input.R1} {snakemake.output.R1}') - shell('cp {snakemake.input.R2} {snakemake.output.R2}') - -else: - shell("cp {snakemake.input} {snakemake.output} {log}") diff --git a/wrappers/wrappers/dupradar/README.md b/wrappers/wrappers/dupradar/README.md deleted file mode 100644 index 0667bd9cd..000000000 --- a/wrappers/wrappers/dupradar/README.md +++ /dev/null @@ -1,83 +0,0 @@ -# Wrapper for dupRadar - -dupRadar provides an easy way to distinguish between artifactual vs natural -duplicate reads in RNA-Seq data. Prior to dupRadar only global duplication rates -were used and they don't take into account the effect of gene expression levels. -dupRadar relates *duplication rates* and *length normalized read counts* of every -gene to model the dependency of both variables. - -[Link to homepage](https://www.bioconductor.org/packages/release/bioc/html/dupRadar.html) - -[Link to manual](https://www.bioconductor.org/packages/devel/bioc/vignettes/dupRadar/inst/doc/dupRadar.html) - -## Example - -Single-end, not stranded: - -```python -rule dupRadar: - input: - bam='sample1.bam', - annotation='dm6.gtf', - output: - density_scatter='sample1.density_scatter.png', - expression_histogram='sample1.expression_histogram.png', - expression_boxplot='sample1.expression_boxplot.png', - expression_barplot='sample1.expression_barplot.png', - multimapping_histogram='sample1.multimapping_histogram.png', - dataframe='sample1.dupradar.tsv' - wrapper: - wrapper_for('dupRadar') -``` - -Paired-end, stranded: - -```python -rule dupRadar: - input: - bam='{sample_dir}/{sample}/{sample}.cutadapt.hisat2.unique.sort.dedup.bam', - annotation='annotations/dm6.gtf', - output: - density_scatter='sample1.density_scatter.png', - expression_histogram='sample1.expression_histogram.png', - expression_boxplot='sample1.expression_boxplot.png', - expression_barplot='sample1.expression_barplot.png', - dataframe='sample1.dupradar.tsv' - params: - paired=True, - stranded=True - wrapper: - wrapper_for('dupRadar') -``` - -## Input -* `bam`: BAM file with mapped reads has to be duplicate marked using either - Picard or BamUtil - -* `annotation`: GTF file contaning features to count the reads falling on the - features. - -## Output -Output plots are described in the [dupRadar -vignette)[http://bioconductor.org/packages/release/bioc/vignettes/dupRadar/inst/doc/dupRadar.html]. -See that page for descriptions of outputs and how to interpret them. - -* `density_scatter`: expression vs percent duplication -* `expression_boxplot`: expression vs percent duplication, binned into boxes -* `expression_histogram`: standard histogram of expression (RPKM) -* `expression_barplot`: percentage duplication in 5% expression bins. -* `multimapping_histogram`: histogram showing fraction of reads coming from - multimapping reads -* `dataframe`: results from `analyzeDuprates` saved as a TSV for downstream - analysis. Following the vignette, we also add the fraction of multimappers in - each gene as the column `mhRate`. -* `model`: Slope and intercept of the dupsExpFit -* `curve`: Simplified curve of the GLM for downstream plotting - -## Threads -Threads are passed to dupRadar and are in turn passed to featureCounts, which -it calls automatically. - -## Params -* `paired`: True | False. Default False. -* `stranded`: True | False | "reverse". Default False. 
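Neither dupRadar example above lists the `model` or `curve` outputs, and the paired-end example also omits `multimapping_histogram`, even though the wrapper script that follows writes all of these files unconditionally. A fuller sketch covering every output key the wrapper references, for a paired-end, reverse-stranded library (the sample and file names here are illustrative, not from the original README), might look like:

```python
rule dupRadar:
    input:
        bam='sample1.dedup.bam',
        annotation='dm6.gtf',
    output:
        density_scatter='sample1.density_scatter.png',
        expression_histogram='sample1.expression_histogram.png',
        expression_boxplot='sample1.expression_boxplot.png',
        expression_barplot='sample1.expression_barplot.png',
        multimapping_histogram='sample1.multimapping_histogram.png',
        dataframe='sample1.dupradar.tsv',
        model='sample1.dupradar_model.txt',
        curve='sample1.dupradar_curve.txt'
    params:
        paired=True,
        stranded='reverse'
    threads: 8
    wrapper:
        wrapper_for('dupRadar')
```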
diff --git a/wrappers/wrappers/dupradar/environment.yaml b/wrappers/wrappers/dupradar/environment.yaml deleted file mode 100644 index d59b35e11..000000000 --- a/wrappers/wrappers/dupradar/environment.yaml +++ /dev/null @@ -1,10 +0,0 @@ -channels: - - conda-forge - - bioconda - - lcdb -dependencies: - - python=3 - - bioconductor-dupradar - - r-kernsmooth - - r-base >=3.5.1 - - ghostscript diff --git a/wrappers/wrappers/dupradar/wrapper.py b/wrappers/wrappers/dupradar/wrapper.py deleted file mode 100644 index e9ef30d6a..000000000 --- a/wrappers/wrappers/dupradar/wrapper.py +++ /dev/null @@ -1,94 +0,0 @@ -import tempfile -from snakemake.shell import shell -import os, sys -sys.path.append(os.path.abspath('../..')) -from lib import helpers - -extra = snakemake.params.get('extra', '') -try: - log = snakemake.log -except AttributeError: - log = None - -stranded = snakemake.params.get('stranded', False) -try: - stranded_int = {False: 0, True: 1, 'reverse': 2}[stranded] -except KeyError: - raise ValueError('"stranded" must be True|False|"reverse"') - -paired = snakemake.params.get('paired', False) -try: - paired_bool= {True: 'TRUE', False: 'FALSE'}[paired] -except KeyError: - raise ValueError('"paired" must be True or False') - -tempdir = tempfile.mkdtemp() - -# To avoid issues with png() related to X11 and cairo, we can use bitmap() instead. -# (thanks -# http://stackoverflow.com/questions/24999983/ -# r-unable-to-start-device-png-capabilities-has-true-for-png -# #comment52353278_25064603 ) - -script = """ -library(dupRadar) -bam <- "{snakemake.input.bam}" -gtf <- "{snakemake.input.annotation}" -dm <- analyzeDuprates(bam, gtf, {stranded_int}, {paired_bool}, {snakemake.threads}, tmpDir = "{tempdir}") - -dm$mhRate <- (dm$allCountsMulti - dm$allCounts) / dm$allCountsMulti -bitmap(file="{snakemake.output.multimapping_histogram}") -hist(dm$mhRate, breaks=50, main=basename(bam), - xlab="Multimapping rate per gene", ylab="Frequency") -dev.off() - -bitmap(file="{snakemake.output.density_scatter}") -duprateExpDensPlot(dm, main=basename(bam)) -dev.off() - -bitmap(file="{snakemake.output.expression_histogram}") -expressionHist(dm) -dev.off() - -bitmap(file="{snakemake.output.expression_boxplot}") -par(mar=c(10,4,4,2)+.1) -duprateExpBoxplot(dm, main=basename(bam)) -dev.off() - -bitmap(file="{snakemake.output.expression_barplot}") -readcountExpBoxplot(dm) -dev.off() - -write.table(dm, file="{snakemake.output.dataframe}", sep="\\t") - -# The following is from -# https://github.com/ewels/NGI-RNAseq/blob/master/bin/dupRadar.r - -fit <- duprateExpFit(DupMat=dm) -df <- data.frame(intercept=as.numeric(fit$intercept), slope=c(fit$slope)) -cat("# dupRadar model params\\n", file="{snakemake.output.model}") -write.table(df, file="{snakemake.output.model}", sep="\\t", append=TRUE, row.names=FALSE) - -# Get numbers from dupRadar GLM -curve_x <- sort(log10(dm$RPK)) -curve_y = 100*predict(fit$glm, data.frame(x=curve_x), type="response") -# Remove all of the infinite values -infs = which(curve_x %in% c(-Inf,Inf)) -curve_x = curve_x[-infs] -curve_y = curve_y[-infs] -# Reduce number of data points -curve_x <- curve_x[seq(1, length(curve_x), 10)] -curve_y <- curve_y[seq(1, length(curve_y), 10)] -# Convert x values back to real counts -curve_x = 10^curve_x -# Write to file -write.table( - cbind(curve_x, curve_y), - file="{snakemake.output.curve}", - quote=FALSE, row.names=FALSE -) -""".format(**locals()) - -tmp = tempfile.NamedTemporaryFile(delete=False).name -helpers.rscript(script, tmp, log=log) -shell("rm -r {tempdir}") diff 
--git a/wrappers/wrappers/epic2/environment.yaml b/wrappers/wrappers/epic2/environment.yaml deleted file mode 100644 index cacda5daa..000000000 --- a/wrappers/wrappers/epic2/environment.yaml +++ /dev/null @@ -1,8 +0,0 @@ -channels: - - bioconda - - conda-forge -dependencies: - - epic2 - - numpy - - bedtools - - ucsc-bedsort=377 diff --git a/wrappers/wrappers/fastq-dump/environment.yaml b/wrappers/wrappers/fastq-dump/environment.yaml deleted file mode 100644 index 6653b6cc8..000000000 --- a/wrappers/wrappers/fastq-dump/environment.yaml +++ /dev/null @@ -1,5 +0,0 @@ -channels: - - conda-forge - - bioconda -dependencies: - - sra-tools>=3 diff --git a/wrappers/wrappers/fastq-dump/wrapper.py b/wrappers/wrappers/fastq-dump/wrapper.py deleted file mode 100644 index 507efe434..000000000 --- a/wrappers/wrappers/fastq-dump/wrapper.py +++ /dev/null @@ -1,41 +0,0 @@ -from snakemake import shell -output = snakemake.output -log = snakemake.log - -srr = snakemake.params.sampletable.loc[snakemake.wildcards.sample, 'Run'] - -if hasattr(snakemake.params, "limit"): - limit = f'-X {snakemake.params.limit}' -else: - limit = "" - -# Two different paths depending on the layout. In both cases, we -# want to avoid creating the final output until the very end, to -# avoid incomplete downloads. -if snakemake.params.is_paired: - # For PE we need to use --split-files, which also means using - # the slower --gzip - shell( - 'fastq-dump ' - '{srr} ' - '--gzip ' - '--split-files ' - '{limit} ' - '&> {log}' - ) - - # The filenames are predictable, so we can move them as needed. - shell('mv {srr}_1.fastq.gz {output[0]}') - shell('mv {srr}_2.fastq.gz {output[1]}') - -else: - # For SE, we can use the faster stdout | gzip, and move it - # directly when done. - shell( - 'fastq-dump ' - '{srr} ' - '-Z ' - '{limit} ' - '2> {log} | gzip -c > {output[0]}.tmp ' - '&& mv {output[0]}.tmp {output[0]} ' - ) diff --git a/wrappers/wrappers/fastq_screen/README.md b/wrappers/wrappers/fastq_screen/README.md deleted file mode 100644 index efd36a326..000000000 --- a/wrappers/wrappers/fastq_screen/README.md +++ /dev/null @@ -1,61 +0,0 @@ -# Wrapper for fastq_screen - -[`fastq_screen`](http://www.bioinformatics.babraham.ac.uk/projects/fastq_screen) -screens a library of sequences in FASTQ format against a set of sequence -databases identifying the composition of the library and possible contaminants. - -Fastq screen uses a configuration file pointing to different database. For example: - -``` -DATABASE ecoli /data/Escherichia_coli/Bowtie2Index/genome BOWTIE2 -DATABASE hg19 /data/hg19/Bowtie2Index/genome BOWTIE2 -DATABASE mm10 /data/mm10/Bowtie2Index/genome BOWTIE2 -``` - -This configuration file is automatically generated by the wrapper based on -which indexes are given as inputs (see **Example**). Currently the wrapper only -supports bowtie2 and defaults to using a subset of 100000 reads. Which can be -overridden using `params.subset` setting. Furthermore, `params.extra` is -passed arguments verbatim to `fastq_screen`, for example -`extra="--illumina1_3"` or `extra="--bowtie2 '--trim5=8'"`. - -Note that `fastq_screen` hard-codes the output filenames. This wrapper moves -the hard-coded output files to those specified by the rule. Currently the -wrapper does not save png's generated by fastq screen. It does, however, support -the contextual saving of tagged and/or filtered output fastqs from fastq_screen. 
-If desired, combinations of "--tag" and/or "--filter [filter_codes]" should be -provided to the run via the "extra" parameter in the Snakemake rule. The output -fastqs will *not* be tracked by Snakemake. They will be named as -"{snakemake.output.txt}.tagged.fastq.gz" or "{snakemake.output.txt}.tagged_filter.fastq.gz" -respectively. - -## Example: - -``` -rule fastq_screen: - input: - fastq="samples/{sample}.fastq.gz", - ecoli=["/data/Escherichia_coli/Bowtie2Index/genome.1.bt2", "/data/Escherichia_coli/Bowtie2Index/genome.2.bt2"], - hg19=["/data/hg19/Bowtie2Index/genome.1.bt2", "/data/hg19/Bowtie2Index/genome.2.bt2"], - mm10=["/data/mm10/Bowtie2Index/genome.1.bt2", "/data/mm10/Bowtie2Index/genome.2.bt2"] - output: - txt="qc/{sample}.fastq_screen.txt" - params: - subset=100000, - aligner='bowtie2' - threads: 8 - wrapper: - "file:wrapper" -``` - -## Input - -* `fastq` is a FASTQ file, gzipped or not. - -* Additional arguments are used as labels and their values will be used to - generate database location. - -## Output - -`txt`: a text file containing the fraction of reads mapping to each provided -index diff --git a/wrappers/wrappers/fastq_screen/environment.yaml b/wrappers/wrappers/fastq_screen/environment.yaml deleted file mode 100644 index 360a727c3..000000000 --- a/wrappers/wrappers/fastq_screen/environment.yaml +++ /dev/null @@ -1,7 +0,0 @@ -channels: - - conda-forge - - bioconda -dependencies: - - python=3 - - fastq-screen - - bowtie2 diff --git a/wrappers/wrappers/fastq_screen/wrapper.py b/wrappers/wrappers/fastq_screen/wrapper.py deleted file mode 100644 index 9b262cc11..000000000 --- a/wrappers/wrappers/fastq_screen/wrapper.py +++ /dev/null @@ -1,72 +0,0 @@ -import os -from snakemake.shell import shell -import sys -sys.path.append(os.path.abspath('../..')) -from lib import aligners -import tempfile - -__author__ = "Ryan Dale" -__copyright__ = "Copyright 2016, Ryan Dale" -__email__ = "dalerr@niddk.nih.gov" -__license__ = "MIT" - -# Pull in parameters -extra = snakemake.params.get('extra', '') -aligner = snakemake.params.get('aligner', 'bowtie2') -subset = snakemake.params.get('subset', 100000) - -if aligner == 'bowtie2': - parse_index = aligners.prefix_from_bowtie2_index - -# Make log -log = snakemake.log_fmt_shell() - -# snakemake.params.fastq_screen_config can be either a dict or a string. If -# string, interpret as a filename pointing to the fastq_screen config file. -# Otherwise, create a new tempfile out of the contents of the dict: - -tmp = tempfile.NamedTemporaryFile(delete=False).name -with open(tmp, 'w') as fout: - for k, v in snakemake.input.items(): - if k != 'fastq': - label = k - if isinstance(v, str): - v = [v] - index = parse_index(v) - fout.write( - '\t'.join(['DATABASE', label, index, aligner.upper()]) + '\n') - config_file = tmp - -# fastq_screen hard-codes filenames according to this prefix. We will send -# hard-coded output to a temp dir, and then move them later. -tempdir = tempfile.mkdtemp() - -# Note that we assume only R1 is coming in. 
-prefix = os.path.basename(snakemake.input.fastq[0].split('.fastq')[0]) - -shell( - "fastq_screen --outdir {tempdir} " - "--force " - "--aligner {aligner} " - "--conf {config_file} " - "--subset {subset} " - "--threads {snakemake.threads} " - "{extra} " - "{snakemake.input.fastq} " - "{log}" -) - -# Move output to the filenames specified by the rule -shell("cp {tempdir}/{prefix}_screen.txt {snakemake.output.txt}") - -# Check for the output of the --tag option to fastq_screen -if os.path.isfile("{tempdir}/{prefix}.tagged.fastq.gz"): - shell("cp {tempdir}/{prefix}.tagged.fastq.gz {snakemake.output.txt}.tagged.fastq.gz") - -# Check for the output of the --filter XXXXXX option to fastq_screen -if os.path.isfile("{tempdir}/{prefix}.tagged_filter.fastq.gz"): - shell("cp {tempdir}/{prefix}.tagged_filter.fastq.gz {snakemake.output.txt}.tagged_filter.fastq.gz") - -# Clean up temp -shell("rm -r {tempdir}") -shell("rm {tmp}") diff --git a/wrappers/wrappers/fastqc/README.md b/wrappers/wrappers/fastqc/README.md deleted file mode 100644 index 678bf9be0..000000000 --- a/wrappers/wrappers/fastqc/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# Wrapper for FastQC - -[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) performs -quality control for high-throughput sequencing data. - -## Input -FASTQ, SAM, or BAM file. FastQC will auto-detect, but you can also use -`--format` and one of bam, sam, bam_mapped, sam_mapped or fastq in the -params.extra field (see example). - -## Output -- html: an html file containing the report for the sample -- zip: a zip file containing the images and text file of results - -## Threads -Supports threads, passed in as the `--threads` arg - -## Params -Additional parameters can be passed to FastQC verbatim by supplying a string in params.extra. - -# Example - -``` -rule fastqc: - input: 'samples/{sample}.fastq' - output: - html='samples/{sample}.fastqc.html', - zip='samples/{sample}.fastqc.zip' - params: extra="--contaminants adapters.tsv --format fastq" - wrapper: - "file://path/to/fastqc" -``` diff --git a/wrappers/wrappers/fastqc/environment.yaml b/wrappers/wrappers/fastqc/environment.yaml deleted file mode 100644 index 3d0dee627..000000000 --- a/wrappers/wrappers/fastqc/environment.yaml +++ /dev/null @@ -1,9 +0,0 @@ -channels: - - bioconda - - conda-forge -dependencies: - # for fastqc running in minimal containers, which complain about missing - # fonts - - openjdk >=8.0.144 - - font-ttf-dejavu-sans-mono - - fastqc diff --git a/wrappers/wrappers/fastqc/wrapper.py b/wrappers/wrappers/fastqc/wrapper.py deleted file mode 100644 index 32032bbd7..000000000 --- a/wrappers/wrappers/fastqc/wrapper.py +++ /dev/null @@ -1,48 +0,0 @@ -__author__ = "Ryan Dale" -__copyright__ = "Copyright 2016, Ryan Dale" -__email__ = "dalerr@niddk.nih.gov" -__license__ = "MIT" - -import os -from snakemake.shell import shell -from snakemake.utils import makedirs - -# fastqc creates a zip file and an html file but the filename is hard-coded by -# replacing fastq|fastq.gz|fq|fq.gz|bam with _fastqc.zip|_fastqc.html in the -# input file's basename. -# -# So we identify that file and move it to the expected output after fastqc is -# done. - -outfile = os.path.basename(snakemake.input[0]) -outdir = os.path.dirname(snakemake.output.html) -if outdir == '': - outdir = '.' 
- -strip = ['.fastq', '.fq', '.gz', '.bam'] -for s in strip: - outfile = outfile.replace(s, '') -out_zip = os.path.join(outdir, outfile + '_fastqc.zip') -out_html = os.path.join(outdir, outfile + '_fastqc.html') - -extra = snakemake.params.get('extra', '') -log = snakemake.log_fmt_shell() - -shell( - 'fastqc ' - '--threads {snakemake.threads} ' - '--noextract ' - '--quiet ' - '--outdir {outdir} ' - '{extra} ' - '{snakemake.input} ' - '{log} ' -) - -def same_file(x, y): - return os.path.abspath(x) == os.path.abspath(y) - -if not same_file(out_zip,snakemake.output.zip): - shell('mv {out_zip} {snakemake.output.zip}') -if not same_file(out_html, snakemake.output.html): - shell('mv {out_html} {snakemake.output.html}') diff --git a/wrappers/wrappers/macs2/callpeak/README.md b/wrappers/wrappers/macs2/callpeak/README.md deleted file mode 100644 index bafad8381..000000000 --- a/wrappers/wrappers/macs2/callpeak/README.md +++ /dev/null @@ -1,61 +0,0 @@ -# MACS2 - -Wraps the `macs2 callpeak` subprogram to call ChIP-seq peaks on input BAM -files. - -## Examples - -Minimal usage. MACS2 outputs a whole directory; this directory is the dirname -of `output.bed`. Note the specification of the genome size in `params.extra`. - -```python -rule macs2: - input: - treatment='ip.bam', - control='input.bam', - chromsizes='dm6.chromsizes' - output: - bed='out/peaks.bed' - extra: '-g dm' - wrapper: - 'file://path/to/wrapper' -``` - -MACS2 supports multiple ip and input samples (they are concatenated). This also -shows broad peak-calling, asks MACS2 to create scaled bedgraphs, and adds them as -output files so downstream rules can use them: - -```python -rule macs2: - input: - treatment=['ip1.bam', 'ip2.bam'], - control=['input1.bam', 'input2.bam'], - chromsizes='dm6.chromsizes' - output: - bed='out/peaks.bed' - params: extra='-g dm --bdg --SPMR --broad' - wrapper: - 'file://path/to/wrapper' -``` - -## Input - -`treatment`: single BAM or list of BAMs for IP - -`control`: single BAM or list of BAMs for input - -`chromsizes`: Chromsizes table, used to ensure peak boundaries do not extend -outside of chromosome limits. - -## Output - -`bed`: BED file of called peaks. This is symlinked from the -`*_peaks.narrowPeak` or `*_peaks.broadPeak` file created by MACS2. - -Other files are created, these can be added as additional named outputs for use -by downstream rules, however the wrapper only pays attention to -`snakemake.output.bed`. - - -## Params -Additional params in `extra` will be passed verbatim to `macs2 callpeak`. diff --git a/wrappers/wrappers/macs2/callpeak/environment.yaml b/wrappers/wrappers/macs2/callpeak/environment.yaml deleted file mode 100644 index 51d042703..000000000 --- a/wrappers/wrappers/macs2/callpeak/environment.yaml +++ /dev/null @@ -1,8 +0,0 @@ -channels: - - bioconda - - conda-forge -dependencies: - - macs2 - - numpy - - bedtools - - ucsc-bedsort=377 diff --git a/wrappers/wrappers/sicer/README.md b/wrappers/wrappers/sicer/README.md deleted file mode 100644 index 9be29101d..000000000 --- a/wrappers/wrappers/sicer/README.md +++ /dev/null @@ -1,59 +0,0 @@ -# SICER - -Wraps the `sicer` program to call ChIP-seq peaks on input BED files. - -## Examples - -Minimal usage. SICER is the best operating piece of hot garbage you'll ever find. -It has a completely fixed set of input parameters it requires, hard-coded genome -data in SICER/lib/GenomeData.py (submit bug report in bioconda if you need -additions), and it can't be run from the same directory at the same time due to -hard coded output filenames. 
It's a proper mess boss. - -```python -rule sicer: - input: - ip='ip.bed', - control='input.bed', - redundancy_threshold=1, - window_size=200, - fragment_size=150, - effective_genome_fraction=0.75, - gap_size=600, - fdr=0.01 - output: - bed='out/peaks.bed' - wrapper: - 'file://path/to/wrapper' -``` - - -## Input - -`ip`: single BED for IP - -`control`: single BED for input - -`redundancy_threshold`: cutoff count above which duplicates are removed - -`window_size`: SICER resolution; 200 recommended for histones - -`fragment_size`: twice the shift from the beginning to the center of a read - -`effective_genome_fraction`: percentage of mappable genome; only set it here if you want to override the genome build in config.yaml - -`gap_size`: nonnegative integer multiple of window size. used to merge contiguous regions (higher means more liberal merging). - -`fdr`: FDR cutoff for calling significant regions. - -## Output - -`bed`: BED file of called peaks. This is a delicately processed version of `*island.bed` from SICER. - -Other files are created, these can be added as additional named outputs for use -by downstream rules, however the wrapper only pays attention to -`snakemake.output.bed`. - - -## Params -Do not use `extra` for this rule. diff --git a/wrappers/wrappers/sicer/environment.yaml b/wrappers/wrappers/sicer/environment.yaml deleted file mode 100644 index 44cd4d766..000000000 --- a/wrappers/wrappers/sicer/environment.yaml +++ /dev/null @@ -1,10 +0,0 @@ -channels: - - bioconda - - conda-forge -dependencies: - - python=2 - - numpy - - sicer - - bedtools - - ucsc-bedsort=377 - - ucsc-wigtobigwig=377 diff --git a/wrappers/wrappers/sicer/wrapper.py b/wrappers/wrappers/sicer/wrapper.py deleted file mode 100644 index 7fd29a9ed..000000000 --- a/wrappers/wrappers/sicer/wrapper.py +++ /dev/null @@ -1,147 +0,0 @@ -import tempfile -import os -import glob -from snakemake import shell - -logfile = None - -# as SICER's interface is rather strict, this wrapper enforces named variables -# instead of 'extra' arbitrary string - -def get_value(key, key2=None): - """ - Get the value from params.block if it exists, otherwise from params. - - If key2 is not None, it's a different key to extract from the same params.block. - - Raises ValueError if nothing is configured. - """ - if key2 is None: - key2 = key - val = snakemake.params.block.get(key, snakemake.params.get(key)) - else: - val = snakemake.params.block.get(key, snakemake.params.block.get(key2)) - - if val is None: - raise ValueError( - "SICER requires the specification of '{0}'".format(key)) - return val - -redundancy_threshold = get_value('redundancy_threshold') -window_size = get_value('window_size') -fragment_size = get_value('fragment_size') -effective_genome_fraction = get_value('effective_genome_fraction', 'reference_effective_genome_fraction') -gap_size = get_value('gap_size') -fdr = get_value('fdr') -genome_build = get_value('genome_build', 'reference_genome_build') - -outdir, basebed = os.path.split(snakemake.output.bed) -label = snakemake.params.block['label'] - -tmpdir = tempfile.mkdtemp() -cwd = os.getcwd() - -# SICER expects bed input format, not bam as in other peak callers -shell( - 'bamToBed -i {snakemake.input.ip} > {tmpdir}/ip.bed ; ' - 'bamToBed -i {snakemake.input.control} > {tmpdir}/in.bed ' -) - -# SICER emits a single hard-coded file that does not respect output directory. -# So move each run into its own temp directory to avoid collisions with -# other processes. 
-os.chdir(tmpdir) - -shell( - # there is a CI-specific bug, in which the python symlink is not correctly resolved to python2.7; - # so as a really desperate hack, modify SICER's python calls to directly touch 2.7 - """sed 's/^python/$CONDA_PREFIX\/bin\/python2.7/' """ - """$CONDA_PREFIX/share/sicer*/SICER.sh > {tmpdir}/SICER.sh && chmod u+x {tmpdir}/SICER.sh """ -) -shell( - # run SICER - """{tmpdir}/SICER.sh {tmpdir} ip.bed in.bed {tmpdir} """ - """{genome_build} {redundancy_threshold} {window_size} """ - """{fragment_size} {effective_genome_fraction} {gap_size} {fdr} > tmp.output 2>&1 """ -) - -# Move back once the run is complete. -os.chdir(cwd) - -# one of the results files gets converted to the broadPeak format ala macs -resultsfile = glob.glob(os.path.join(tmpdir, '*-islands-summary-FDR*')) -if len(resultsfile) == 1: - hit = resultsfile[0] - basehit = os.path.basename(resultsfile[0]) -elif len(resultsfile) > 1: - raise ValueError( - "Multiple islands-summary-FDR files found in {1}: {0}" - .format(os.listdir(tmpdir), tmpdir) - ) -else: - raise ValueError("No islands-summary-FDR file found in {1}: {0}".format(os.listdir(tmpdir), tmpdir)) - -# "summary graph for [the run] in bedGraph format" -summary_graph = glob.glob(os.path.join(tmpdir, '*-W{0}.graph*'.format(window_size))) -if len(summary_graph) == 1: - summary_graph = summary_graph[0] -else: - raise ValueError("SICER graph output file not found") - -# the bedGraph file above, normalized by library size per million, in wig format -normalized_prefilter_wig = glob.glob(os.path.join(tmpdir, '*-W{0}-normalized.wig'.format(window_size))) -if len(normalized_prefilter_wig) == 1: - normalized_prefilter_wig = normalized_prefilter_wig[0] -else: - raise ValueError("SICER normalized prefilter wig file not found") - -# "summary of all candidate islands with their statistical significance -candidate_islands = glob.glob(os.path.join(tmpdir, '*-W{0}-G{1}-islands-summary'.format(window_size, gap_size))) -if len(candidate_islands) == 1: - candidate_islands = candidate_islands[0] -else: - raise ValueError("SICER candidate islands file not found") - -# "delineation of significant islands" -significant_islands = glob.glob(os.path.join(tmpdir, '*-W{0}-G{1}-FDR*-island.bed'.format(window_size, gap_size))) -if len(significant_islands) == 1: - significant_islands = significant_islands[0] -else: - raise ValueError("SICER significant islands file not found") - -# "library of raw redundancy-removed reads on significant islands -redundancy_removed = glob.glob(os.path.join(tmpdir, '*-W{0}-G{1}-FDR*-islandfiltered.bed'.format(window_size, gap_size))) -if len(redundancy_removed) == 1: - redundancy_removed = redundancy_removed[0] -else: - raise ValueError("SICER redundancy removed library file not found") - -# "wig file for the island-filtered redundancy-removed reads -normalized_postfilter_wig = glob.glob(os.path.join(tmpdir, '*-W{0}-G{1}-FDR*-islandfiltered-normalized.wig'.format(window_size, gap_size))) -if len(normalized_postfilter_wig) == 1: - normalized_postfilter_wig = normalized_postfilter_wig[0] -else: - raise ValueError("SICER normalized postfilter wig file not found") - -shell( - "export LC_COLLATE=C; " - # format the output in broadPeak format - # note that SICER can emit p-values of 0 and in that case this file will contain "inf" entries - """awk -F"\\t" -v lab={label} """ - """'{{printf("%s\\t%d\\t%d\\t%s_peak_%d\\t%d\\t.\\t%g\\t%g\\t%g\\n", $1, """ - """$2, $3-1, lab, NR, -10*log($6)/log(10), $7, -log($6)/log(10), -log($8)/log(10))}}' """ - "{hit} > 
{snakemake.output.bed}.tmp && " - # sort the bed file, just to be sure - "bedSort {snakemake.output.bed}.tmp {snakemake.output.bed} && " - # rename the assorted output files - "mv {resultsfile} {snakemake.output.bed}-islands-summary-significant && " - "mv {summary_graph} {snakemake.output.bed}.graph && " - "wigToBigWig {normalized_prefilter_wig} {snakemake.input.chromsizes} {snakemake.output.bed}-normalized-prefilter.bigWig && " - "wigToBigWig {normalized_postfilter_wig} {snakemake.input.chromsizes} {snakemake.output.bed}-normalized-postfilter.bigWig && " - "mv {candidate_islands} {snakemake.output.bed}-islands-summary && " - "mv {significant_islands} {snakemake.output.bed}-island.bed && " - "mv {redundancy_removed} {snakemake.output.bed}-islandfiltered.bed && " - "mv {tmpdir}/tmp.output {snakemake.output.bed}.log && " - # clean up the temp directory - "rm {snakemake.output.bed}.tmp && rm -Rf {tmpdir}" -) diff --git a/wrappers/wrappers/spp/README.md b/wrappers/wrappers/spp/README.md deleted file mode 100644 index a8eb7c439..000000000 --- a/wrappers/wrappers/spp/README.md +++ /dev/null @@ -1,175 +0,0 @@ -# spp - -Wraps the [`spp`](http://compbio.med.harvard.edu/Supplements/ChIP-seq/) peak-caller. - -This is a rather complicated wrapper. See input and output sections below for -details. - - -## Examples - -Minimal usage: - -```python -rule spp: - input: - ip="ip.bam", - control="control.bam", - chromsizes='dm6.chromsizes' - output: "peaks.bed" - wrapper: - 'file://path/to/wrapper' -``` - -Specify parameters (see below for options): - - -```python -rule spp: - input: - ip="ip.bam", - control="control.bam", - chromsizes='dm6.chromsizes' - output: "peaks.bed" - params: block={'fdr': 0.1} - - wrapper: - 'file://path/to/wrapper' -``` - -Specify additional output files: - -```python -rule spp: - input: - ip="ip.bam", - control="control.bam", - chromsizes='dm6.chromsizes' - output: - bed="peaks.bed" - enrichment_estimates="enrichment_est.bedgraph", - smoothed_enrichment_mle="enrichment_mle.bedgraph", - rdata="image.RData" - params: block={'fdr': 0.1} - log: "spp.log" -``` - -The works, with multiple replicate BAMs to be merged, keeping the tempfiles, -increasing the memory available to MarkDuplicates, all the output files, -adjusting spp params, and using 8 threads for merging and duplicates removal: - - -```python -rule spp: - input: - ip=["ip.bam", "ip2.bam"], - control=["control.bam", "control2.bam", "control3.bam"], - chromsizes='dm6.chromsizes' - output: - bed="peaks.bed" - enrichment_estimates="enrichment_est.bedgraph", - smoothed_enrichment_mle="enrichment_mle.bedgraph", - rdata="image.RData" - log: 'spp.log' - threads: 8 - params: - block={'fdr': 0.1, 'bins': 10}, - java_args='-Xmx64g' - keep_tempfiles=True - log: "spp.log" -``` - -## Input - -`ip`, `control`: BAM files. Duplicates should already be removed. - -`chromsizes`: Chromsizes table, used to ensure peak boundaries do not extend -outside of chromosome limits. - -SPP itself only supports a single BAM file for IP and a single BAM file for -control. However, to support the common case of pooling replicates to gain -coverage, this wrapper does handle multiple BAMs. - -If more than one BAM is provided for either IP or control, the BAMs are merged -and then duplicates are removed from the merged file (to handle reads that -occur in both replicates, which would otherwise cause spp to complain) are -then removed using MarkDuplicates. This merged, deduped BAM is then provided to -SPP. 
- -The merged BAM, merged-and-deduped BAM, and metrics file (from MarkDuplicates) -are created as temp files. The temp filenames are indicated in the log. If you -need these for debugging, set `params: keep_tempfiles=True` to keep them. - -## Output - -The only required output is `bed`. Others, if specified, will trigger their -respective creation. - -`bed`: narrowPeak format. - -`smoothed_enrichment_mle`: BEDGRAPH file (even though SPP calls it a "WIG") of -smoothed enrichment using the `smoothed.enrichment.mle` method from SPP. -Optional, if not specified it will not be created. - -`enrichment_estimates`: BEDGRAPH file (even though SPP calls it a "WIG") of -enrichment estimates using the `get.conservative.fold.enrichment.profile` -function from SPP. Optional, if not specified will not be created. - -`rdata`: Saves an image of the workspace. Handy for debugging. Optional, if not -specified will not be created. - -An R script named after the BED file (`{snakemake.output.bed}.R`) will be -written to the output directory. This can be run from the same directory as the -snakefile was run from for debugging purposes. - -## Threads -We do not run SPP in parallel mode due to trouble with running the `snow` -library on clusters (it seems to crash unexpectedly and intermittently). -However, for multiple BAMs, we pass the threads to samtools and MarkDuplicates. - -## Params - -### wrapper params - -`keep_tempfiles`: bool; if True then tempfiles created by merging and deduping -replicate BAMs will be retained for debugging purposes. - -`java_args`: str; additional args provided to picard, e.g., `java_args="-Xmx64g"` - -### spp params - -Since SPP doesn't have a command-line interface, we can't use the "extra=" -mechanism to pass params verbatim. Instead, the R script created by the wrapper -supports the following parameters, provided as keys to the `block` param to -make it easier to work with the chipseq config format. For example: - -```python -params: - block={'bins': 5, 'fdr': 0.1}, - java_args='-Xmx64g' -``` - -`srange`: tuple; controls the range of lags over which to calculate -cross-correlation. Default is `(50, 500)` - -`bins`: integer; controls how the binding characteristics will be binned. Default -is `5`. - -`tecfilter`: bool; passed to `find.binding.positions` function. Default is True; -set to False to prevent the exclusion of large regions with higher input than -expected. - -`remove_anomalies`: bool; enable/disable the remove.tag.anomalies step. Default -is False (do not remove anomalies). Setting to True can increase the time -dramatically. - -`fdr`: float; false discovery rate when calling peaks. Default is `0.05`. - -`whs`: int. window half-size. Used if the auto-calculated -`binding.characteristics` is NA. Default is `500`. - -`zthr`: float. Z threshold used when adding broad regions. Default is `3`. - -`bandwidth`: int. Bandwidth for smoothing WIG file. Default is `200`. - -`step`: int; step size for smoothing WIG file. Default is `100`. 
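Note that while the examples above pass the spp settings as top-level keys of `block`, the wrapper script that follows looks them up under `block['extra']` before falling back to its built-in defaults. A fuller sketch consistent with that code path (the BAM and output file names here are illustrative) might be:

```python
rule spp:
    input:
        ip=["ip1.bam", "ip2.bam"],
        control=["control1.bam", "control2.bam"],
        chromsizes='dm6.chromsizes'
    output:
        bed="peaks.bed",
        smoothed_enrichment_mle="enrichment_mle.bedgraph",
        rdata="image.RData"
    log: "spp.log"
    threads: 8
    params:
        block={'extra': {'fdr': 0.05, 'bins': 5, 'srange': (50, 500), 'tecfilter': False}},
        java_args='-Xmx32g',
        keep_tempfiles=False
    wrapper:
        'file://path/to/wrapper'
```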
diff --git a/wrappers/wrappers/spp/environment.yaml b/wrappers/wrappers/spp/environment.yaml
deleted file mode 100644
index 42dd8086f..000000000
--- a/wrappers/wrappers/spp/environment.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-channels:
-  - conda-forge
-  - bioconda
-  - defaults
-
-dependencies:
-  - picard
-  - bedtools
-  - samtools
-  - r-spp
-  - r >=3.5.1
diff --git a/wrappers/wrappers/spp/wrapper.py b/wrappers/wrappers/spp/wrapper.py
deleted file mode 100644
index 364c3ba10..000000000
--- a/wrappers/wrappers/spp/wrapper.py
+++ /dev/null
@@ -1,256 +0,0 @@
-from textwrap import dedent
-import tempfile
-from snakemake.shell import shell
-log = snakemake.log_fmt_shell(append=True)
-
-# Since we'll be appending the output from multiple commands to the same log,
-# we want to ensure that the provided log file is empty to start
-if snakemake.log:
-    shell('cat /dev/null > {snakemake.log}')
-
-java_args = snakemake.params.get('java_args', '')
-keep_tempfiles = snakemake.params.get('keep_tempfiles', False)
-
-registered_for_deletion = [
-    snakemake.output.bed + '.tmp',
-    snakemake.output.bed + '.tmp.genome',
-]
-
-
-def merge_and_dedup(bams):
-    """
-    spp only handles one replicate at a time. To support pooled samples, we
-    merge and remove duplicates, storing the result in a tempfile.
-
-    If only one item is provided, return it immediately
-    """
-
-    if len(bams) == 1:
-        return bams
-
-    merged = tempfile.NamedTemporaryFile(delete=False, prefix='merged', suffix='.bam').name
-    merged_and_deduped = tempfile.NamedTemporaryFile(delete=False, prefix='merged_and_duped', suffix='.bam').name
-    metrics = tempfile.NamedTemporaryFile(delete=False, prefix='metrics', suffix='.txt').name
-
-    shell('echo "tempfiles created by merge_and_dedup: {merged} {merged_and_deduped} {metrics}" {log}')
-
-    if not keep_tempfiles:
-        registered_for_deletion.extend([merged, merged_and_deduped, metrics])
-
-    bams = ' '.join(bams)
-    shell(
-        'samtools merge '
-        '-f '
-        '-@ {snakemake.threads} '
-        '{merged} '
-        '{bams} '
-        '{log} '
-    )
-    shell(
-        'picard '
-        '{java_args} '
-        'MarkDuplicates '
-        'INPUT={merged} '
-        'OUTPUT={merged_and_deduped} '
-        'METRICS_FILE={metrics} '
-        'REMOVE_DUPLICATES=true '
-        '{log} '
-    )
-    return merged_and_deduped
-
-
-def Rbool(x):
-    """
-    Convert to R boolean string used to fill in a template
-    """
-    if x:
-        return 'TRUE'
-    return 'FALSE'
-
-
-# ----------------------------------------------------------------------------
-# DEFAULTS
-#
-extra = snakemake.params.block.get('extra', {})
-
-DEFAULTS = {
-    # srange controls the range of lags over which to calculate cross-correlation
-    'srange': (50, 500),
-    # bins controls how the binding characteristics will be binned
-    'bins': 5,
-    # enable/disable the remove.tag.anomalies step
-    'remove_anomalies': False,
-    # false discovery rate when calling peaks
-    'fdr': 0.05,
-    # window half-size. Used if binding.characteristics is NA.
-    'whs': 500,
-    # Z threshold used when adding broad regions.
-    'zthr': 3,
-    # bandwith for smoothing WIG file
-    'bandwidth': 200,
-    # step for smoothing WIG file
-    'step': 100,
-    # Set to False to disable the filtering of large regions with high input signal
-    'tecfilter': True,
-}
-
-params = {}
-for k, v in DEFAULTS.items():
-    v = extra.get(k, v)
-    if isinstance(v, bool):
-        v = Rbool(v)
-    params[k] = v
-
-# ----------------------------------------------------------------------------
-
-# R_template is incrementally built up so that we can intersperse comments and
-# to keep things better organized. It will be filled in with `**locals()` at
-# the end.
-
-ip = merge_and_dedup(snakemake.input.ip)
-control = merge_and_dedup(snakemake.input.control)
-
-
-R_template = """
-library(spp)
-chip.data <- read.bam.tags("{ip}")
-input.data <- read.bam.tags("{control}")
-"""
-
-
-#
-R_template += """
-for (chrom in names(chip.data$tags)){{
-    if (length(chip.data$tags[[chrom]]) < 10){{
-        print(paste("Chromosome", chrom, "has <10 reads; removing from analysis"))
-        chip.data$tags[[chrom]] <- NULL
-        chip.data$quality[[chrom]] <- NULL
-        input.data$tags[[chrom]] <- NULL
-        input.data$quality[[chrom]] <- NULL
-    }}
-}}
-"""
-
-# Use configured srange and bins, if provided. `accept.all.tags=TRUE` is
-# hard-coded since we were getting errors if FALSE.
-R_template += """
-binding.characteristics <- get.binding.characteristics(
-    chip.data,
-    srange=c({params[srange][0]}, {params[srange][1]}),
-    bin={params[bins]},
-    accept.all.tags=TRUE,
-    remove.tag.anomalies={params[remove_anomalies]}
-)
-"""
-
-R_template += """
-# Extract info from binding characteristics
-tag.shift <- round(binding.characteristics$peak$x/2)
-detection.window.halfsize <- binding.characteristics$whs
-if (!is.finite(detection.window.halfsize)){{
-    detection.window.halfsize <- {params[whs]}
-}}
-"""
-
-R_template += """
-# Reset data to tags, and remove any chromosomes with no data.
-# (tags is a list, names are chromosomes and values are integer vectors)
-
-chip.data <- chip.data$tags
-input.data <- input.data$tags
-
-chip.data[sapply(chip.data, is.null)] <- NULL
-input.data[sapply(input.data, is.null)] <- NULL
-"""
-
-
-if 'smoothed_enrichment_mle' in snakemake.output.keys():
-    R_template += dedent("""
-    smoothed.enrichment.estimate <- get.smoothed.enrichment.mle(
-        chip.data,
-        input.data,
-        bandwidth={params[bandwidth]},
-        step={params[step]},
-        tag.shift=tag.shift)
-    writewig(
-        smoothed.enrichment.estimate,
-        "{snakemake.output.smoothed_enrichment_mle}",
-        feature=""
-    )
-    """)
-
-if 'enrichment_estimates' in snakemake.output.keys():
-    R_template += dedent("""
-    enrichment.estimates <- get.conservative.fold.enrichment.profile(
-        chip.data, input.data, fws=500, step=100, alpha=0.01
-    )
-    writewig(enrichment.estimates, "{snakemake.output.enrichment_estimates}", feature="")
-    rm(enrichment.estimates)
-    """)
-
-R_template += """
-# Get peaks
-bp <- find.binding.positions(
-    signal.data=chip.data,
-    control.data=input.data,
-    fdr={params[fdr]},
-    whs=detection.window.halfsize,
-    tec.filter={params[tecfilter]}
-)
-"""
-
-R_template += """
-# Add broad regions to peaks
-bp <- add.broad.peak.regions(
-    chip.data,
-    input.data,
-    bp,
-    window.size=detection.window.halfsize,
-    z.thr={params[zthr]}
-)
-write.narrowpeak.binding(bp, "{snakemake.output.bed}.tmp")
-"""
-
-# Save image for later introspection or debugging
-if 'rdata' in snakemake.output.keys():
-    R_template += dedent("""
-    save.image("{snakemake.output.rdata}")
-    """)
-
-# write the filled-in template to the output directory for later debugging
-script_filename = snakemake.output.bed + '.R'
-with open(script_filename, 'w') as fout:
-    fout.write(R_template.format(**locals()))
-
-# Run it
-shell('Rscript {script_filename} {log}')
-
-# Fix the output file so that it doesn't have negative numbers and so it fits
-# inside the genome
-shell(
-    """awk -F "\\t" '{{OFS="\\t"; print $1, "0", $2}}' """
-    "{snakemake.input.chromsizes} "
-    "> {snakemake.output.bed}.tmp.genome"
-)
-shell(
-    "sort -k1,1 -k2,2n {snakemake.output.bed}.tmp | "
-    """awk -F "\\t" '{{OFS="\\t"; if (($2>0) && ($3>0)) print $0}}' | """
-    "bedtools intersect -a - -b {snakemake.output.bed}.tmp.genome > {snakemake.output.bed}"
-)
-
-# SPP's writewig() adds a header and is space-separated, so this turns it into
-# a proper bedGraph file ready for conversion to bigwig.
-if 'enrichment_estimates' in snakemake.output.keys():
-    shell('grep -v "track" {snakemake.output.enrichment_estimates} '
-          '| sed "s/ /\\t/g" > {snakemake.output.enrichment_estimates}.tmp '
-          '&& mv {snakemake.output.enrichment_estimates}.tmp '
-          '{snakemake.output.enrichment_estimates}')
-
-if 'smoothed_enrichment_mle' in snakemake.output.keys():
-    shell('grep -v "track" {snakemake.output.smoothed_enrichment_mle} '
-          '| sed "s/ /\\t/g" > {snakemake.output.smoothed_enrichment_mle}.tmp '
-          '&& mv {snakemake.output.smoothed_enrichment_mle}.tmp '
-          '{snakemake.output.smoothed_enrichment_mle}')
-
-for fn in registered_for_deletion:
-    shell('rm -v {fn} {log}')
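The final `sort | awk | bedtools intersect` pipeline in the removed wrapper can
be opaque if you have not seen the idiom. The sketch below re-expresses its
effect in plain Python for readers of this diff; it is not part of the wrapper,
and the function name and clipping details are an approximation of what the
shell pipeline does.

```python
# Rough, stand-alone approximation of the wrapper's narrowPeak clean-up step:
# keep only intervals with positive start/end, sort them, and restrict them to
# the bounds given in a chrom.sizes file. Function name is hypothetical.
import csv

def clean_narrowpeak(peaks_tmp, chromsizes, out_bed):
    # chromsizes: two-column "chrom<TAB>length" file, as passed to the wrapper
    sizes = {}
    with open(chromsizes) as f:
        for chrom, length in csv.reader(f, delimiter='\t'):
            sizes[chrom] = int(length)

    with open(peaks_tmp) as fin, open(out_bed, 'w') as fout:
        rows = [r for r in csv.reader(fin, delimiter='\t') if r]
        rows.sort(key=lambda r: (r[0], int(r[1])))   # sort -k1,1 -k2,2n
        for row in rows:
            chrom, start, end = row[0], int(row[1]), int(row[2])
            if start <= 0 or end <= 0:               # awk: ($2>0) && ($3>0)
                continue
            if chrom not in sizes or start >= sizes[chrom]:
                continue                             # entirely outside the genome
            # bedtools intersect against a {chrom: [0, length]} "genome" BED
            # effectively clips intervals to the chromosome bounds.
            row[2] = str(min(end, sizes[chrom]))
            fout.write('\t'.join(row) + '\n')
```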