diff --git a/.circleci/config.yml b/.circleci/config.yml index 5f12fcb0..ea7c773f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -74,6 +74,7 @@ variables: cd workflows/chipseq source activate lcdb-wf-test ./run_test.sh --use-conda -j2 -k -p -r + ./run_test.sh --report python chipseq_trackhub.py config/config.yaml config/hub_config.yaml chipseq-regression-step: &chipseq-regression-step @@ -103,6 +104,7 @@ variables: cd workflows/rnaseq source activate lcdb-wf-test ./run_test.sh --use-conda -j2 -k -p -r + ./run_test.sh --report python rnaseq_trackhub.py config/config.yaml config/hub_config.yaml rnaseq-star-step: &rnaseq-star-step diff --git a/include/WRAPPER_SLURM b/include/WRAPPER_SLURM index 522af2cf..28fa899f 100755 --- a/include/WRAPPER_SLURM +++ b/include/WRAPPER_SLURM @@ -21,6 +21,7 @@ if [[ ! -e logs ]]; then mkdir -p logs; fi --use-conda \ --configfile config/config.yaml \ --latency-wait=300 \ + --report report.html ) > "Snakefile.log" 2>&1 SNAKE_PID=$! diff --git a/lib/patterns_targets.py b/lib/patterns_targets.py index 7d986b79..c6f6a299 100644 --- a/lib/patterns_targets.py +++ b/lib/patterns_targets.py @@ -9,6 +9,7 @@ from . import common from . import chipseq from . import helpers +from . 
import utils HERE = os.path.abspath(os.path.dirname(__file__)) @@ -61,8 +62,86 @@ def __init__(self, config, patterns, workdir=None): self.samples, self.sampletable = common.get_sampletable(self.config) self.refdict, self.conversion_kwargs = common.references_dict(self.config) self.organism = self.config['organism'] - self.patterns = yaml.load(open(patterns), Loader=yaml.FullLoader) - self.is_paired = helpers.detect_layout(self.sampletable) == 'PE' + + # Patterns have one of three structures: + # + # Structure 1 (simple): + # + # salmon: + # pattern: 'data/rnaseq_samples/{sample}/{sample}.salmon/{sample}_quant.sf' + # description: 'Transcripts quantification using Salmon' + # + # Structure 2 (nested 1 deep): + # + # rseqc: + # bam_stat: + # pattern: 'data/rnaseq_samples/{sample}/rseqc/{sample}_bam_stat.txt' + # description: 'RNAseq quality control analysis' + # infer_experiment: + # pattern: 'data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt' + # description: 'Infer layout and strandedness of experiment' + # + # + # Structure 3 (nested 2 deep): + # + # patterns_by_peaks: + # peaks: + # macs2: + # pattern: 'data/chipseq_peaks/macs2/{macs2_run}/peaks.bed' + # description: 'BED file of peaks from macs2 peak-caller' + # + # + # Our job here is to create three objects: + # + # patterns = { + # 'salmon': 'data/rnaseq_samples/{sample}/{sample}.salmon/{sample}_quant.sf', + # 'rseqc': { + # 'bam_stat': 'data/rnaseq_samples/{sample}/rseqc/{sample}_bam_stat.txt', + # 'infer_experiment': 'data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt', + # }, + # 'patterns_by_peaks': { + # 'peaks': { + # 'macs2': 'data/chipseq_peaks/macs2/{macs2_run}/peaks.bed' + # } + # } + # } + # + # rst_files = { + # 'salmon': 'reports/data/rnaseq_samples/sample/sample.salmon/sample_quant.sf.rst', + # 'rseqc': { + # 'bam_stat': 'reports/data/rnaseq_samples/sample/rseqc/sample_bam_stat.txt.rst', + # 'infer_experiment': 
'reports/data/rnaseq_samples/sample/rseqc/sample_infer_experiment.txt.rst', + # }, + # 'patterns_by_peaks': { + # 'peaks': { + # 'macs2': 'reports/data/chipseq_peaks/macs2/macs2_run/peaks.bed.rst' + # } + # } + # } + # + # descriptions = { + # 'salmon': 'Transcripts quantification using Salmon', + # 'rseqc': { + # 'bam_stat': 'RNAseq quality control analysis', + # 'infer_experiment': 'Infer layout and strandedness of experiment', + # }, + # 'patterns_by_peaks': { + # 'peaks': { + # 'macs2': 'BED file of peaks from macs2 peak-caller' + # } + # } + # } + # + + + loaded_patterns = yaml.load(open(patterns), Loader=yaml.FullLoader) + + self._loaded_patterns = loaded_patterns + self.patterns = utils.extract_nested(loaded_patterns, "pattern") + self.descriptions = utils.extract_nested(loaded_patterns, "description") + self.rst_files = utils.map_nested_dicts(self.patterns, utils.pattern_to_rst_file) + + self.is_paired = helpers.detect_layout(self.sampletable) == "PE" if self.is_paired: self.n = [1, 2] else: @@ -92,9 +171,14 @@ def __init__(self, config, patterns, workdir=None): SeqConfig.__init__(self, config, patterns, workdir) self.fill = dict(sample=self.samples, n=self.n) + + # The merged bigwigs have different placeholders and therefore must be + # filled separately. They should also only be included if + # merged_bigwigs has been configured. 
self.patterns_by_aggregation = self.patterns.pop('patterns_by_aggregate', None) self.targets = helpers.fill_patterns(self.patterns, self.fill, zip) + # Then the aggregation if self.patterns_by_aggregation is not None and 'merged_bigwigs' in self.config: self.fill_by_aggregation = dict( @@ -107,6 +191,9 @@ def __init__(self, config, patterns, workdir=None): self.targets.update(self.targets_by_aggregation) self.patterns.update(self.patterns_by_aggregation) + self.rst_files.update(self.rst_files.pop('patterns_by_aggregate')) + self.descriptions.update(self.descriptions.pop('patterns_by_aggregate')) + class ChIPSeqConfig(SeqConfig): def __init__(self, config, patterns, workdir=None): @@ -156,6 +243,8 @@ def __init__(self, config, patterns, workdir=None): self.targets.update(self.targets_by_sample) self.patterns.update(self.patterns_by_sample) + self.descriptions.update(self.descriptions['patterns_by_sample']) + self.rst_files.update(self.rst_files['patterns_by_sample']) # Then the aggregation... self.patterns_by_aggregation = self.patterns.pop('patterns_by_aggregate', None) @@ -170,6 +259,9 @@ def __init__(self, config, patterns, workdir=None): self.targets.update(self.targets_by_aggregation) self.patterns.update(self.patterns_by_aggregation) + self.rst_files.update(self.rst_files.pop('patterns_by_aggregate')) + self.descriptions.update(self.descriptions.pop('patterns_by_aggregate')) + # Then the peaks... # # Note: when adding support for new peak callers, add them here. 
@@ -225,3 +317,5 @@ def __init__(self, config, patterns, workdir=None): self.targets.update(self.targets_for_peaks) self.patterns.update(self.patterns_by_peaks) + self.descriptions.update(self.descriptions['patterns_by_peaks']) + self.rst_files.update(self.rst_files['patterns_by_peaks']) diff --git a/lib/utils.py b/lib/utils.py index 97e352c1..b41229d9 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -1,8 +1,21 @@ import os import contextlib import collections -from collections.abc import Iterable +from collections.abc import Iterable, Mapping from snakemake.shell import shell +from snakemake.io import expand + + +def wrapper_for(path): + return 'file:' + os.path.join('../..','wrappers', 'wrappers', path) + + +def render_r1_r2(pattern): + return expand(pattern, sample='{sample}', n=[1,2]) + + +def r1_only(pattern): + return expand(pattern, sample='{sample}', n=1) @contextlib.contextmanager @@ -50,24 +63,130 @@ def gen(): return results -def test_flatten(): - assert sorted(flatten({ - 'a': { - 'b': { - 'c': ['a', 'b', 'c'], - }, - }, - 'x': ['e', 'f', 'g'], - 'y': { - 'z': 'd' - }, - })) == ['a', 'b', 'c', 'd', 'e', 'f', 'g'] +def map_nested_dicts(d, func, stop_condition=None): + """ + Apply `func` to all values of a nested dictionary - assert flatten('a', True) == 'a' - assert flatten(['a'], True) == 'a' - assert flatten('a') == ['a'] - assert flatten(['a']) == ['a'] + Parameters + ---------- + d : dict + + func : callable + Function to apply to values of d, or, if `stop_condition` is provided, + function to apply to remainder when `stop_condition(d)` is True. + + stop_condition : callable + Mechanism for stopping recursion at a particular level and sending the + results at that point to `func`. Function should accept a dict as its + only input argument. 
+ + Examples + -------- + + Convert leaf values into booleans indicating whether they are less than three: + + >>> d = {'a': {'b': {'target': 1, 'nontarget': 2}}, 'c': {'target': 3}} + >>> res = map_nested_dicts(d, lambda x: x < 3) + >>> assert res == {'a': {'b': {'target': True, 'nontarget': True}}, 'c': {'target': False}} + + + This function will sum values of provided dictionaries: + + >>> def sum_values(x): + ... if isinstance(x, Mapping): + ... return sum(x.values()) + + Since we don't specify a stopping condition which would send a dict to + `sum_values`, only the leaf integers get sent and `sum_values` returns + None for those: + + >>> res = map_nested_dicts(d, sum_values) + >>> assert res == {'a': {'b': {'target': None, 'nontarget': None}}, 'c': {'target': None}}, res + + Here the stopping condition is whether "target" is in the keys, and if so, + the dict for which that is true is sent to `sum_values`: + + >>> def stop1(x): + ... return isinstance(x, Mapping) and 'target' in x.keys() + + >>> res = map_nested_dicts(d, sum_values, stop_condition=stop1) + >>> assert res == {'a': {'b': 3}, 'c': 3}, res + + + Now if we only send dicts with "nontarget" in the keys, values in `b` are + summed but values in `c` are not because nothing there satisfied the + stopping condition: + + >>> def stop2(x): + ... return isinstance(x, Mapping) and 'nontarget' in x.keys() + + >>> res = map_nested_dicts(d, sum_values, stop_condition=stop2) + >>> assert res == {'a': {'b': 3}, 'c': {'target': None}}, res + + """ + if stop_condition and stop_condition(d): + return func(d) + if isinstance(d, Mapping): + return {k: map_nested_dicts(v, func, stop_condition) for k, v in d.items()} + else: + return func(d) + + + +def extract_nested(d, key): + """ + From a nested dict, keep all nesting the same EXCEPT for the leaf dicts, + from which only the provided key will be returned. + + Parameters + ---------- + d : dict + + key : str or hashable type + Key to extract. 
Effectively collapses leaf dictionaries containing this + key into just the value. + + Examples + -------- + + >>> d = {'a': {'b': {'target': 1, 'ignore': 2}}, 'c': {'target': 3}} + >>> result = extract_nested(d, 'target') + >>> assert result == {'a': {'b': 1}, 'c': 3}, result + """ + if not isinstance(d, Mapping): + return d + if key in d: + return d[key] + else: + return {k: extract_nested(v, key) for k,v in d.items()} + + +def pattern_to_rst_file(p): + """ + Convert filename pattern containing wildcards into an RST filename + """ + return os.path.join("reports", p.replace("{", "").replace("}", "")) + ".rst" + + +def write_out_rsts(full_patterns): + """ + Given the full patterns dictionary (containing patterns and descriptions), + write out a corresponding rst file containing the contents of the + description. + + Returns None; the side effect is to create all the necessary rst files. + """ + def stop_condition(x): + return isinstance(x, Mapping) and 'pattern' in x and 'description' in x + + def writer(x): + rst = pattern_to_rst_file(x['pattern']) + desc = x['description'] + with open(rst, 'w') as fout: + fout.write(desc + '\n') + + map_nested_dicts(full_patterns, func=writer, stop_condition=stop_condition) def updatecopy(orig, update_with, keys=None, override=False): """ @@ -186,3 +305,22 @@ def make_relative_symlink(target, linkname): if not os.path.exists(linkdir): shell('mkdir -p {linkdir}') shell('cd {linkdir}; ln -sf {relative_target} {linkbase}') + + +def test_flatten(): + assert sorted(flatten({ + 'a': { + 'b': { + 'c': ['a', 'b', 'c'], + }, + }, + 'x': ['e', 'f', 'g'], + 'y': { + 'z': 'd' + }, + })) == ['a', 'b', 'c', 'd', 'e', 'f', 'g'] + + assert flatten('a', True) == 'a' + assert flatten(['a'], True) == 'a' + assert flatten('a') == ['a'] + assert flatten(['a']) == ['a'] diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index c6ab2da7..30e1bc8d 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -9,6 +9,7 
@@ import numpy as np import pybedtools from lib import common, cluster_specific, utils, helpers, aligners, chipseq from lib.patterns_targets import ChIPSeqConfig +from lib.utils import render_r1_r2, r1_only, wrapper_for # ---------------------------------------------------------------------------- # @@ -41,15 +42,14 @@ c = ChIPSeqConfig( wildcard_constraints: n = '[1,2]' - -def wrapper_for(path): - return 'file:' + os.path.join('../..','wrappers', 'wrappers', path) +report: "workflow.rst" # ---------------------------------------------------------------------------- # RULES # ---------------------------------------------------------------------------- # See "patterns and targets" in the documentation for what's going on here. + final_targets = utils.flatten(( c.targets['bam'], utils.flatten(c.targets['fastqc']), @@ -74,7 +74,19 @@ rule targets: """ Final targets to create """ - input: final_targets + input: final_targets, utils.flatten(c.rst_files) + + +rule report_rst: + """ + Create .rst containing captions for report + """ + input: + 'config/chipseq_patterns.yaml' + output: + list(set(utils.flatten(c.rst_files))) + run: + utils.write_out_rsts(c._loaded_patterns) if 'orig_filename' in c.sampletable.columns: @@ -166,8 +178,6 @@ if 'Run' in c.sampletable.columns and sum(c.sampletable['Run'].str.startswith('S ) -def render_r1_r2(pattern): - return expand(pattern, sample='{sample}', n=[1,2]) rule cutadapt: """ @@ -411,7 +421,7 @@ rule multiqc: ), config='config/multiqc_config.yaml' output: - c.targets['multiqc'] + report(c.targets['multiqc'], caption=c.rst_files['multiqc']) log: c.targets['multiqc'][0] + '.log' run: @@ -527,7 +537,7 @@ rule fingerprint: bais=lambda wc: expand(c.patterns['merged_techreps'] + '.bai', label=wc.ip_label), control_bais=lambda wc: expand(c.patterns['merged_techreps'] + '.bai', label=chipseq.merged_input_for_ip(c.sampletable, wc.ip_label)), output: - plot=c.patterns['fingerprint']['plot'], + plot=report(c.patterns['fingerprint']['plot']), 
raw_counts=c.patterns['fingerprint']['raw_counts'], metrics=c.patterns['fingerprint']['metrics'] threads: 8 @@ -569,7 +579,7 @@ rule sicer: ), chromsizes=refdict[c.organism][config['aligner']['tag']]['chromsizes'], output: - bed=c.patterns['peaks']['sicer'] + bed=report(c.patterns['peaks']['sicer']) log: c.patterns['peaks']['sicer'] + '.log' params: @@ -594,7 +604,7 @@ rule macs2: ), chromsizes=refdict[c.organism][config['aligner']['tag']]['chromsizes'], output: - bed=c.patterns['peaks']['macs2'] + bed=report(c.patterns['peaks']['macs2']) log: c.patterns['peaks']['macs2'] + '.log' params: @@ -620,7 +630,7 @@ rule spp: ), chromsizes=refdict[c.organism][config['aligner']['tag']]['chromsizes'], output: - bed=c.patterns['peaks']['spp'], + bed=report(c.patterns['peaks']['spp']), enrichment_estimates=c.patterns['peaks']['spp'] + '.est.wig', smoothed_enrichment_mle=c.patterns['peaks']['spp'] + '.mle.wig', rdata=c.patterns['peaks']['spp'] + '.RData' @@ -713,7 +723,7 @@ rule plotcorrelation: input: c.targets['multibigwigsummary']['npz'] output: - heatmap=c.targets['plotcorrelation']['heatmap'], + heatmap=report(c.targets['plotcorrelation']['heatmap']), tab=c.targets['plotcorrelation']['tab'] shell: 'plotCorrelation ' diff --git a/workflows/chipseq/config/chipseq_patterns.yaml b/workflows/chipseq/config/chipseq_patterns.yaml index 94130b54..36b1d23c 100644 --- a/workflows/chipseq/config/chipseq_patterns.yaml +++ b/workflows/chipseq/config/chipseq_patterns.yaml @@ -1,56 +1,127 @@ patterns_by_sample: - fastq: 'data/chipseq_samples/{sample}/{sample}_R1.fastq.gz' - cutadapt: 'data/chipseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz' - bam: 'data/chipseq_samples/{sample}/{sample}.cutadapt.bam' + fastq: + pattern: 'data/chipseq_samples/{sample}/{sample}_R1.fastq.gz' + description: 'Original FASTQ file' + cutadapt: + pattern: 'data/chipseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz' + description: 'Trimmed reads from cutadapt' + + bam: + pattern: 
'data/chipseq_samples/{sample}/{sample}.cutadapt.bam' + description: 'Sorted, aligned BAM' fastqc: - raw: 'data/chipseq_samples/{sample}/fastqc/{sample}_R1.fastq.gz_fastqc.zip' - cutadapt: 'data/chipseq_samples/{sample}/fastqc/{sample}_R1.cutadapt.fastq.gz_fastqc.zip' - bam: 'data/chipseq_samples/{sample}/fastqc/{sample}.cutadapt.unique.nodups.bam_fastqc.zip' + raw: + pattern: 'data/chipseq_samples/{sample}/fastqc/{sample}_R1.fastq.gz_fastqc.zip' + description: 'Quality control analysis of raw sequence reads' + cutadapt: + pattern: 'data/chipseq_samples/{sample}/fastqc/{sample}_R1.cutadapt.fastq.gz_fastqc.zip' + description: 'Quality control analysis of raw sequence reads post adaptor trimming' + bam: + pattern: 'data/chipseq_samples/{sample}/fastqc/{sample}.cutadapt.unique.nodups.bam_fastqc.zip' + description: 'Quality control analysis of aligned reads' libsizes: - fastq: 'data/chipseq_samples/{sample}/{sample}_R1.fastq.gz.libsize' - cutadapt: 'data/chipseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz.libsize' - bam: 'data/chipseq_samples/{sample}/{sample}.cutadapt.bam.libsize' - unique: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.bam.libsize' - nodups: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.libsize' - - fastq_screen: 'data/chipseq_samples/{sample}/{sample}.cutadapt.screen.txt' - libsizes_table: 'data/chipseq_aggregation/libsizes_table.tsv' - libsizes_yaml: 'data/chipseq_aggregation/libsizes_table_mqc.yaml' - multiqc: 'data/chipseq_aggregation/multiqc.html' - unique: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.bam' + fastq: + pattern: 'data/chipseq_samples/{sample}/{sample}_R1.fastq.gz.libsize' + description: 'Sample library size using raw sequence reads' + cutadapt: + pattern: 'data/chipseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz.libsize' + description: 'Sample library size using sequence reads post adaptor trimming' + bam: + pattern: 'data/chipseq_samples/{sample}/{sample}.cutadapt.bam.libsize' + 
description: 'Sample library size using aligned reads' + unique: + pattern: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.bam.libsize' + description: 'Sample library size using unique aligned reads' + nodups: + pattern: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.libsize' + description: 'Sample library size using unique, non-duplicate reads' + + fastq_screen: + pattern: 'data/chipseq_samples/{sample}/{sample}.cutadapt.screen.txt' + description: 'fastq screen statistics' + libsizes_table: + pattern: 'data/chipseq_aggregation/libsizes_table.tsv' + description: 'TSV of library sizes' + libsizes_yaml: + pattern: 'data/chipseq_aggregation/libsizes_table_mqc.yaml' + description: 'YAML file of library sizes, for MultiQC' + multiqc: + pattern: 'data/chipseq_aggregation/multiqc.html' + description: 'MultiQC output that aggregates many stages' + + unique: + pattern: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.bam' + description: 'BAM file with multimappers removed' markduplicates: - bam: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam' - metrics: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.metrics' + bam: + pattern: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam' + description: 'BAM file with multimappers removed and duplicates removed' + + metrics: + pattern: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.metrics' + description: 'Metrics file from MarkDuplicates' - merged_techreps: 'data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam' + merged_techreps: + pattern: 'data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam' + description: 'BAM file of merged technical replicates' - bigwig: 'data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig' + bigwig: + pattern: 'data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig' + description: 'bigwig of merged technical replicates' fingerprint: - 
plot: 'data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.png' - raw_counts: 'data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.tab' - metrics: 'data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.metrics' + plot: + pattern: 'data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.png' + description: 'Fingerprint plot' + raw_counts: + pattern: 'data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.tab' + description: 'TSV of results from fingerprint analysis' + metrics: + pattern: 'data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.metrics' + description: 'Metrics file from fingerprint analysis' multibigwigsummary: - npz: 'data/chipseq_aggregation/deeptools/multibigwigsummary_matrix.npz' - tab: 'data/chipseq_aggregation/deeptools/multibigwigsummary.tab' + npz: + pattern: 'data/chipseq_aggregation/deeptools/multibigwigsummary_matrix.npz' + description: 'Compressed numpy array from multibigwigsummary output' + tab: + pattern: 'data/chipseq_aggregation/deeptools/multibigwigsummary.tab' + description: 'TSV of multibigwigsummary results' plotcorrelation: - tab: 'data/chipseq_aggregation/deeptools/plotcorrelation.tab' - heatmap: 'data/chipseq_aggregation/deeptools/correlation_heatmap.png' + tab: + pattern: 'data/chipseq_aggregation/deeptools/plotcorrelation.tab' + description: 'TSV of correlation results' + heatmap: + pattern: 'data/chipseq_aggregation/deeptools/correlation_heatmap.png' + description: 'Heatmap of correlations between samples' patterns_by_peaks: peaks: - macs2: 'data/chipseq_peaks/macs2/{macs2_run}/peaks.bed' - spp: 'data/chipseq_peaks/spp/{spp_run}/peaks.bed' - sicer: 'data/chipseq_peaks/sicer/{sicer_run}/peaks.bed' + macs2: + pattern: 'data/chipseq_peaks/macs2/{macs2_run}/peaks.bed' + description: 'BED file of peaks from macs2 peak-caller' + spp: + pattern: 'data/chipseq_peaks/spp/{spp_run}/peaks.bed' + description: 'BED file of peaks 
from spp peak-caller' + sicer: + pattern: 'data/chipseq_peaks/sicer/{sicer_run}/peaks.bed' + description: 'BED file of domains from SICER domain caller' bigbed: - macs2: 'data/chipseq_peaks/macs2/{macs2_run}/peaks.bigbed' - spp: 'data/chipseq_peaks/spp/{spp_run}/peaks.bigbed' - sicer: 'data/chipseq_peaks/sicer/{sicer_run}/peaks.bigbed' + macs2: + pattern: 'data/chipseq_peaks/macs2/{macs2_run}/peaks.bigbed' + description: 'bigBed file of peaks from macs2 peak-caller' + spp: + pattern: 'data/chipseq_peaks/spp/{spp_run}/peaks.bigbed' + description: 'bigBed file of peaks from spp peak-caller' + sicer: + pattern: 'data/chipseq_peaks/sicer/{sicer_run}/peaks.bigbed' + description: 'bigBed file of domains from SICER domain caller' patterns_by_aggregate: - merged_bigwig: 'data/chipseq_aggregation/merged_bigwigs/{merged_bigwig_label}.bigwig' + merged_bigwig: + pattern: 'data/chipseq_aggregation/merged_bigwigs/{merged_bigwig_label}.bigwig' + description: 'Merged bigwigs as specified in config' diff --git a/workflows/chipseq/workflow.rst b/workflows/chipseq/workflow.rst new file mode 100644 index 00000000..0240ddcf --- /dev/null +++ b/workflows/chipseq/workflow.rst @@ -0,0 +1,4 @@ +lcdb-wf is a collection of snakemake workflows and tools for common high-throughput sequencing analysis, along with associated infrastructure. + +See docs at https://lcdb.github.io/lcdb-wf. 
+ diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index e4bc5c50..14348303 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -8,6 +8,9 @@ import tempfile import pandas as pd from lib import common, cluster_specific, utils, helpers, aligners from lib.patterns_targets import RNASeqConfig +from lib.utils import render_r1_r2, r1_only, wrapper_for + +import copy # ---------------------------------------------------------------------------- # @@ -38,13 +41,11 @@ wildcard_constraints: n = '[1,2]' -def wrapper_for(path): - return 'file:' + os.path.join('../..','wrappers', 'wrappers', path) - # ---------------------------------------------------------------------------- # RULES # ---------------------------------------------------------------------------- +report: "workflow.rst" # See "patterns and targets" in the documentation for what's going on here. final_targets = utils.flatten(( @@ -71,17 +72,25 @@ if 'merged_bigwigs' in config: final_targets.extend(utils.flatten(c.targets['merged_bigwig'])) -def render_r1_r2(pattern, r1_only=False): - return expand(pattern, sample='{sample}', n=c.n) - -def r1_only(pattern): - return expand(pattern, sample='{sample}', n=1) - rule targets: """ Final targets to create """ - input: final_targets + input: + final_targets, utils.flatten(c.rst_files) + + +rule report_rst: + """ + Create .rst containing captions for report + """ + input: + 'config/rnaseq_patterns.yaml' + output: + utils.flatten(c.rst_files) + run: + utils.write_out_rsts(c._loaded_patterns) + if 'orig_filename' in c.sampletable.columns: @@ -168,7 +177,9 @@ rule cutadapt: input: fastq=render_r1_r2(c.patterns['fastq']) output: - fastq=render_r1_r2(c.patterns['cutadapt']) + fastq=report(render_r1_r2(c.patterns['cutadapt']), caption= + c.rst_files['cutadapt'], + category="fastq") log: render_r1_r2(c.patterns['cutadapt'])[0] + '.log' threads: 6 @@ -219,7 +230,7 @@ if config['aligner']['index'] == 'hisat2': Map reads with HISAT2 """ 
input: - fastq=common.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + fastq=render_r1_r2(c.patterns['cutadapt']), index=[c.refdict[c.organism][config['aligner']['tag']]['hisat2']] output: bam=c.patterns['bam'] @@ -259,7 +270,7 @@ if config['aligner']['index'] == 'star': Map reads with STAR """ input: - fastq=common.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + fastq=render_r1_r2(c.patterns['cutadapt']), index=[c.refdict[c.organism][config['aligner']['tag']]['star']] output: bam=c.patterns['bam'] @@ -298,7 +309,6 @@ if config['aligner']['index'] == 'star': '&& rm {prefix}Aligned.out.sam ' ) - rule rRNA: """ Map reads with bowtie2 to the rRNA reference @@ -340,7 +350,7 @@ rule fastq_count: input: fastq='{sample_dir}/{sample}/{sample}{suffix}.fastq.gz' output: - count='{sample_dir}/{sample}/{sample}{suffix}.fastq.gz.libsize' + count='{sample_dir}/{sample}/{sample}{suffix}.fastq.gz.libsize', shell: 'zcat {input} | echo $((`wc -l`/4)) > {output}' @@ -352,7 +362,8 @@ rule bam_count: input: bam='{sample_dir}/{sample}/{suffix}.bam' output: - count='{sample_dir}/{sample}/{suffix}.bam.libsize' + count='{sample_dir}/{sample}/{suffix}.bam.libsize', + shell: 'samtools view -c {input} > {output}' @@ -388,7 +399,9 @@ rule fastq_screen: **fastq_screen_references(), fastq=r1_only(rules.cutadapt.output.fastq), output: - txt=c.patterns['fastq_screen'] + txt=report(c.patterns['fastq_screen'], + caption=c.rst_files['fastq_screen'], + category="fastq screen") log: c.patterns['fastq_screen'] + '.log' params: subset=100000 @@ -436,8 +449,12 @@ rule rrna_libsizes_table: rrna=c.targets['rrna']['libsize'], fastq=c.targets['libsizes']['cutadapt'] output: - json=c.patterns['rrna_percentages_yaml'], - tsv=c.patterns['rrna_percentages_table'] + json=report(c.patterns['rrna_percentages_yaml'], + caption=c.rst_files['rrna_percentages_yaml'], + category='Libsizes'), + tsv=report(c.patterns['rrna_percentages_table'], + caption=c.rst_files['rrna_percentages_table'], + 
category='Libsizes'), run: def rrna_sample(f): return helpers.extract_wildcards(c.patterns['rrna']['libsize'], f)['sample'] @@ -486,8 +503,13 @@ rule libsizes_table: input: utils.flatten(c.targets['libsizes']) output: - json=c.patterns['libsizes_yaml'], - tsv=c.patterns['libsizes_table'] + json=report(c.patterns['libsizes_yaml'], + caption=c.rst_files['libsizes_yaml'], + category='Libsizes'), + tsv=report(c.patterns['libsizes_table'], + caption=c.rst_files['libsizes_table'], + category='Libsizes'), + run: def sample(f): return os.path.basename(os.path.dirname(f)) @@ -554,12 +576,12 @@ rule multiqc: utils.flatten(c.targets['collectrnaseqmetrics']) ), config='config/multiqc_config.yaml' - output: c.targets['multiqc'] - log: c.targets['multiqc'][0] + '.log' + output: report(c.patterns['multiqc'], caption=c.rst_files['multiqc'], category='QC') + log: c.patterns['multiqc'] + '.log' run: analysis_directory = set([os.path.dirname(i) for i in input]) - outdir = os.path.dirname(c.targets['multiqc'][0]) - basename = os.path.basename(c.targets['multiqc'][0]) + outdir = os.path.dirname(c.patterns['multiqc']) + basename = os.path.basename(c.patterns['multiqc']) shell( 'LC_ALL=en_US.UTF.8 LC_LANG=en_US.UTF-8 ' 'multiqc ' @@ -581,7 +603,9 @@ rule markduplicates: bam=c.patterns['bam'] output: bam=c.patterns['markduplicates']['bam'], - metrics=c.patterns['markduplicates']['metrics'] + metrics=c.patterns['markduplicates']['metrics'], +# caption=c.rst_files['markduplicates']['metrics'], +# category='Metrics') log: c.patterns['markduplicates']['bam'] + '.log' params: @@ -608,8 +632,12 @@ rule collectrnaseqmetrics: bam=c.patterns['bam'], refflat=c.refdict[c.organism][config['gtf']['tag']]['refflat'] output: - metrics=c.patterns['collectrnaseqmetrics']['metrics'], - pdf=c.patterns['collectrnaseqmetrics']['pdf'] + metrics=report(c.patterns['collectrnaseqmetrics']['metrics'], + caption=c.rst_files['collectrnaseqmetrics']['metrics'], + category='Metrics'), + 
pdf=report(c.patterns['collectrnaseqmetrics']['pdf'], + caption=c.rst_files['collectrnaseqmetrics']['pdf'], + category='Metrics'), params: # NOTE: Be careful with the memory here; make sure you have enough # and/or it matches the resources you're requesting in the cluster @@ -647,7 +675,10 @@ rule preseq: input: bam=c.patterns['bam'] output: - c.patterns['preseq'] + report(c.patterns['preseq'], + caption=c.rst_files['preseq'], + category='Metrics'), + shell: 'preseq ' 'c_curve ' @@ -663,14 +694,31 @@ rule dupRadar: bam=rules.markduplicates.output.bam, annotation=c.refdict[c.organism][config['gtf']['tag']]['gtf'], output: - density_scatter=c.patterns['dupradar']['density_scatter'], - expression_histogram=c.patterns['dupradar']['expression_histogram'], - expression_boxplot=c.patterns['dupradar']['expression_boxplot'], - expression_barplot=c.patterns['dupradar']['expression_barplot'], - multimapping_histogram=c.patterns['dupradar']['multimapping_histogram'], - dataframe=c.patterns['dupradar']['dataframe'], - model=c.patterns['dupradar']['model'], - curve=c.patterns['dupradar']['curve'], + density_scatter=report(c.patterns['dupradar']['density_scatter'], + caption=c.rst_files['dupradar']['density_scatter'], + category='dupRadar'), + expression_histogram=report(c.patterns['dupradar']['expression_histogram'], + caption=c.rst_files['dupradar']['expression_histogram'], + category='dupRadar'), + expression_boxplot=report(c.patterns['dupradar']['expression_boxplot'], + caption=c.rst_files['dupradar']['expression_boxplot'], + category='dupRadar'), + expression_barplot=report(c.patterns['dupradar']['expression_barplot'], + caption=c.rst_files['dupradar']['expression_barplot'], + category='dupRadar'), + multimapping_histogram=report(c.patterns['dupradar']['multimapping_histogram'], + caption=c.rst_files['dupradar']['multimapping_histogram'], + category='dupRadar'), + dataframe=report(c.patterns['dupradar']['dataframe'], + caption=c.rst_files['dupradar']['dataframe'], + 
category='dupRadar'), + model=report(c.patterns['dupradar']['model'], + caption=c.rst_files['dupradar']['model'], + category='dupRadar'), + curve=report(c.patterns['dupradar']['curve'], + caption=c.rst_files['dupradar']['curve'], + category='dupRadar'), + log: c.patterns['dupradar']['dataframe'] + '.log' script: wrapper_for('dupradar/wrapper.py') @@ -684,7 +732,9 @@ rule salmon: fastq=common.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), index=c.refdict[c.organism][config['salmon']['tag']]['salmon'], output: - c.patterns['salmon'] + report(c.patterns['salmon'], + caption=c.rst_files['salmon'], + category='quantification') params: index_dir=os.path.dirname(c.refdict[c.organism][config['salmon']['tag']]['salmon']), outdir=os.path.dirname(c.patterns['salmon']) @@ -731,6 +781,7 @@ rule salmon: '-r {input.fastq} ' '&> {log}' ) + shell('mv {params.outdir}/quant.sf {output}') rule rseqc_bam_stat: @@ -866,7 +917,9 @@ if 'merged_bigwigs' in config: bigwigs=bigwigs_to_merge, chromsizes=refdict[c.organism][config['aligner']['tag']]['chromsizes'], output: - c.patterns['merged_bigwig'] + report(c.patterns['merged_bigwig'], caption=( + c.rst_files['merged_bigwig']), + category='bigwig') log: c.patterns['merged_bigwig'] + '.log' script: diff --git a/workflows/rnaseq/config/rnaseq_patterns.yaml b/workflows/rnaseq/config/rnaseq_patterns.yaml index 7943810e..979de1ac 100644 --- a/workflows/rnaseq/config/rnaseq_patterns.yaml +++ b/workflows/rnaseq/config/rnaseq_patterns.yaml @@ -1,48 +1,145 @@ -fastq: 'data/rnaseq_samples/{sample}/{sample}_R{n}.fastq.gz' -cutadapt: 'data/rnaseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz' -bam: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam' +fastq: + pattern: 'data/rnaseq_samples/{sample}/{sample}_R{n}.fastq.gz' + description: 'Raw sequence reads' + +cutadapt: + pattern: 'data/rnaseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz' + description: 'Sequence reads trimmed of adaptors' + +bam: + pattern: 
'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam' + description: 'Binary file containing sequence alignment data' + fastqc: - raw: 'data/rnaseq_samples/{sample}/fastqc/{sample}_R1.fastq.gz_fastqc.zip' - cutadapt: 'data/rnaseq_samples/{sample}/fastqc/{sample}_R1.cutadapt.fastq.gz_fastqc.zip' - bam: 'data/rnaseq_samples/{sample}/fastqc/{sample}.cutadapt.bam_fastqc.zip' + raw: + pattern: 'data/rnaseq_samples/{sample}/fastqc/{sample}_R1.fastq.gz_fastqc.zip' + description: 'Quality control analysis of raw sequence reads' + cutadapt: + pattern: 'data/rnaseq_samples/{sample}/fastqc/{sample}_R1.cutadapt.fastq.gz_fastqc.zip' + description: 'Quality control analysis of raw sequence reads post adaptor trimming' + bam: + pattern: 'data/rnaseq_samples/{sample}/fastqc/{sample}.cutadapt.bam_fastqc.zip' + description: 'Quality control analysis of aligned reads' + libsizes: - fastq: 'data/rnaseq_samples/{sample}/{sample}_R1.fastq.gz.libsize' - cutadapt: 'data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz.libsize' - bam: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.libsize' -fastq_screen: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.screen.txt' -featurecounts: 'data/rnaseq_aggregation/featurecounts.txt' -libsizes_table: 'data/rnaseq_aggregation/libsizes_table.tsv' -libsizes_yaml: 'data/rnaseq_aggregation/libsizes_table_mqc.yaml' -rrna_percentages_table: 'data/rnaseq_aggregation/rrna_percentages_table.tsv' -rrna_percentages_yaml: 'data/rnaseq_aggregation/rrna_percentages_table_mqc.yaml' + fastq: + pattern: 'data/rnaseq_samples/{sample}/{sample}_R1.fastq.gz.libsize' + description: 'Sample library size using raw sequence reads' + cutadapt: + pattern: 'data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz.libsize' + description: 'Sample library size using sequence reads post adaptor trimming' + bam: + pattern: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.libsize' + description: 'Sample library size using aligned reads' + +fastq_screen: + pattern: 
'data/rnaseq_samples/{sample}/{sample}.cutadapt.screen.txt' + description: 'fastq screen statistics' + +featurecounts: + pattern: 'data/rnaseq_aggregation/featurecounts.txt' + description: 'Transcript quantification' + +libsizes_table: + pattern: 'data/rnaseq_aggregation/libsizes_table.tsv' + description: 'Aggregated libraries size per chromosome' + +libsizes_yaml: + pattern: 'data/rnaseq_aggregation/libsizes_table_mqc.yaml' + description: 'Config for aggregation of libraries size per chromosome' + +rrna_percentages_table: + pattern: 'data/rnaseq_aggregation/rrna_percentages_table.tsv' + description: 'Aggregated rRNA percentage per library' + +rrna_percentages_yaml: + pattern: 'data/rnaseq_aggregation/rrna_percentages_table_mqc.yaml' + description: 'Config for aggregation of rRNA percentage per library' + rrna: - bam: 'data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam' - libsize: 'data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam.libsize' -multiqc: 'data/rnaseq_aggregation/multiqc.html' + bam: + pattern: 'data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam' + description: 'Binary file containing sequence aligning to rRNA' + libsize: + pattern: 'data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam.libsize' + description: 'Sample library sizes aligning to rRNA' + +multiqc: + pattern: 'data/rnaseq_aggregation/multiqc.html' + description: 'Aggregated Quality Control analysis' + markduplicates: - bam: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam' - metrics: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.metrics' + bam: + pattern: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam' + description: 'Binary file containing sequence alignment data for duplicate reads' + metrics: + pattern: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.metrics' + description: 'Metrics details of duplicate reads' + collectrnaseqmetrics: - metrics: 
'data/rnaseq_samples/{sample}/{sample}.collectrnaseqmetrics.metrics' - pdf: 'data/rnaseq_samples/{sample}/{sample}.collectrnaseqmetrics.pdf' + metrics: + pattern: 'data/rnaseq_samples/{sample}/{sample}.collectrnaseqmetrics.metrics' + description: 'Details of RNAseq metrics' + pdf: + pattern: 'data/rnaseq_samples/{sample}/{sample}.collectrnaseqmetrics.pdf' + description: 'Summary pdf of RNAseq metrics' + dupradar: - density_scatter: 'data/rnaseq_samples/{sample}/dupradar/{sample}_density_scatter.png' - expression_histogram: 'data/rnaseq_samples/{sample}/dupradar/{sample}_expression_histogram.png' - expression_boxplot: 'data/rnaseq_samples/{sample}/dupradar/{sample}_expression_boxplot.png' - expression_barplot: 'data/rnaseq_samples/{sample}/dupradar/{sample}_expression_barplot.png' - multimapping_histogram: 'data/rnaseq_samples/{sample}/dupradar/{sample}_multimapping_histogram.png' - dataframe: 'data/rnaseq_samples/{sample}/dupradar/{sample}_dataframe.tsv' - model: 'data/rnaseq_samples/{sample}/dupradar/{sample}_model.txt' - curve: 'data/rnaseq_samples/{sample}/dupradar/{sample}_curve.txt' -preseq: 'data/rnaseq_samples/{sample}/{sample}_preseq_c_curve.txt' -salmon: 'data/rnaseq_samples/{sample}/{sample}.salmon/quant.sf' + density_scatter: + pattern: 'data/rnaseq_samples/{sample}/dupradar/{sample}_density_scatter.png' + description: 'Density scatterplot for Duplication rate quality control' + expression_histogram: + pattern: 'data/rnaseq_samples/{sample}/dupradar/{sample}_expression_histogram.png' + description: 'Expression histogram for Duplication rate quality control' + expression_boxplot: + pattern: 'data/rnaseq_samples/{sample}/dupradar/{sample}_expression_boxplot.png' + description: 'Expression boxplot for Duplication rate quality control' + expression_barplot: + pattern: 'data/rnaseq_samples/{sample}/dupradar/{sample}_expression_barplot.png' + description: 'Expression barplot for Duplication rate quality control' + multimapping_histogram: + pattern: 
'data/rnaseq_samples/{sample}/dupradar/{sample}_multimapping_histogram.png' + description: 'Histogram of multimapping for Duplication rate quality control' + dataframe: + pattern: 'data/rnaseq_samples/{sample}/dupradar/{sample}_dataframe.tsv' + description: 'Results table for Duplication rate quality control' + model: + pattern: 'data/rnaseq_samples/{sample}/dupradar/{sample}_model.txt' + description: 'Model used in Duplication rate quality control' + curve: + pattern: 'data/rnaseq_samples/{sample}/dupradar/{sample}_curve.txt' + description: 'Curve for Duplication rate quality control' + +preseq: + pattern: 'data/rnaseq_samples/{sample}/{sample}_preseq_c_curve.txt' + description: 'Preseq' + +salmon: + pattern: 'data/rnaseq_samples/{sample}/{sample}.salmon/{sample}_quant.sf' + description: 'Transcripts quantification using Salmon' + rseqc: - bam_stat: 'data/rnaseq_samples/{sample}/rseqc/{sample}_bam_stat.txt' - infer_experiment: 'data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt' + bam_stat: + pattern: 'data/rnaseq_samples/{sample}/rseqc/{sample}_bam_stat.txt' + description: 'RNAseq quality control analysis' + infer_experiment: + pattern: 'data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt' + description: 'Infer layout and strandedness of experiment' + bigwig: - pos: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig' - neg: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig' + pos: + pattern: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig' + description: 'bigwig file for positive strand relative to sequence reads' + neg: + pattern: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig' + description: 'bigwig file for negative strand relative to sequence reads' + downstream: - rnaseq: 'downstream/rnaseq.html' + rnaseq: + pattern: 'downstream/rnaseq.html' + description: 'Results summary' + patterns_by_aggregate: - merged_bigwig: 
'data/rnaseq_aggregation/merged_bigwigs/{merged_bigwig_label}.bigwig' + merged_bigwig: + pattern: 'data/rnaseq_aggregation/merged_bigwigs/{merged_bigwig_label}.bigwig' + description: 'Merged sense and antisense bigwigs for unstranded libraries' diff --git a/workflows/rnaseq/downstream/rnaseq.Rmd b/workflows/rnaseq/downstream/rnaseq.Rmd index 4a86ed3b..d565141e 100644 --- a/workflows/rnaseq/downstream/rnaseq.Rmd +++ b/workflows/rnaseq/downstream/rnaseq.Rmd @@ -139,7 +139,7 @@ colData$featurecounts.path <- sapply( # you've changed anything there you will need to change it here as well. colData$salmon.path <- sapply( colData$samplename, - function (x) file.path('..', 'data', 'rnaseq_samples', x, paste0(x, '.salmon'), 'quant.sf') + function (x) file.path('..', 'data', 'rnaseq_samples', x, paste0(x, '.salmon'), paste0(x, '_quant.sf')) ) # NOTE: Factor columns------------------------------------------------------ diff --git a/workflows/rnaseq/workflow.rst b/workflows/rnaseq/workflow.rst new file mode 100644 index 00000000..a7b0d8c0 --- /dev/null +++ b/workflows/rnaseq/workflow.rst @@ -0,0 +1,54 @@ +lcdb-wf is a collection of snakemake workflows and tools for common high-throughput sequencing analysis, along with associated infrastructure. + +See docs at https://lcdb.github.io/lcdb-wf. + +RNASEQ workflow + +This workflow is used for RNA-seq and RNA-seq-like analysis (like euRNA-seq, RIP-seq or small RNA-seq). + +This workflow can use references created by the references workflow with no need to run the references workflow separately. 
This workflow performs the following tasks: + +- Build a HISAT2 index +- Build a salmon transcriptome index +- Download a GTF annotation +- Convert the GTF to refflat format +- Trim reads with cutadapt +- Align with HISAT2 +- Run FastQC on raw, trimmed, and aligned reads +- Align reads to rRNA using bowtie2 to evaluate rRNA contamination +- Count reads in genes with featureCounts +- Run dupRadar and preseq to assess library complexity +- Check for evidence of cross-contamination using fastq_screen on multiple configured genomes +- Assess transcript coverage with Picard CollectRnaSeqMetrics +- Build bigWigs (optionally strand-specific) from BAM files +- Optionally merge bigWigs as defined by config +- Aggregate QC results using MultiQC. Includes custom tables for library sizes and rRNA contamination +- Run various QC and differential expression analyses. This is performed in an RMarkdown file that runs a standard DESeq2 differential expression analysis along with diagnostic plots, exported tables of differentially expressed genes for each contrast, and downstream GO analysis using clusterProfiler. This file is run and rendered into an output HTML file. +- Construct and upload a track hub of scaled coverage bigWigs for each sample that can be viewed in the UCSC Genome Browser + + +Configurations used: + +{% for items in snakemake.config %} + {% if items != 'references'%} + - {{ items }} : {{snakemake.config[items]}} + {% elif items == 'references' %} + - References: + + {% for sublista in snakemake.config[items] %} + - {{sublista}} : + + {% for sublistb in snakemake.config[items][sublista] %} + - {{ sublistb }} : + + {% for sublistc in snakemake.config[items][sublista][sublistb] %} + + - {{sublistc}} : {{snakemake.config[items][sublista][sublistb][sublistc]}} + + {% endfor %} + {% endfor %} + {% endfor %} + {% endif %} +{% endfor %} + +