[WIP] add reports to rnaseq-wf #210

Draft · wants to merge 17 commits into base: v1.5rc
2 changes: 2 additions & 0 deletions .circleci/config.yml
@@ -74,6 +74,7 @@ variables:
cd workflows/chipseq
source activate lcdb-wf-test
./run_test.sh --use-conda -j2 -k -p -r
./run_test.sh --report
python chipseq_trackhub.py config/config.yaml config/hub_config.yaml

chipseq-regression-step: &chipseq-regression-step
@@ -103,6 +104,7 @@ variables:
cd workflows/rnaseq
source activate lcdb-wf-test
./run_test.sh --use-conda -j2 -k -p -r
./run_test.sh --report
python rnaseq_trackhub.py config/config.yaml config/hub_config.yaml

rnaseq-star-step: &rnaseq-star-step
1 change: 1 addition & 0 deletions include/WRAPPER_SLURM
@@ -21,6 +21,7 @@ if [[ ! -e logs ]]; then mkdir -p logs; fi
--use-conda \
--configfile config/config.yaml \
--latency-wait=300 \
--report report.html
) > "Snakefile.log" 2>&1

SNAKE_PID=$!
98 changes: 96 additions & 2 deletions lib/patterns_targets.py
@@ -9,6 +9,7 @@
from . import common
from . import chipseq
from . import helpers
from . import utils

HERE = os.path.abspath(os.path.dirname(__file__))

@@ -61,8 +62,86 @@ def __init__(self, config, patterns, workdir=None):
self.samples, self.sampletable = common.get_sampletable(self.config)
self.refdict, self.conversion_kwargs = common.references_dict(self.config)
self.organism = self.config['organism']
self.patterns = yaml.load(open(patterns), Loader=yaml.FullLoader)
self.is_paired = helpers.detect_layout(self.sampletable) == 'PE'

# Patterns have one of three structures:
#
# Structure 1 (simple):
#
# salmon:
# pattern: 'data/rnaseq_samples/{sample}/{sample}.salmon/{sample}_quant.sf'
# description: 'Transcript quantification using Salmon'
#
# Structure 2 (nested 1 deep):
#
# rseqc:
# bam_stat:
# pattern: 'data/rnaseq_samples/{sample}/rseqc/{sample}_bam_stat.txt'
# description: 'RNAseq quality control analysis'
# infer_experiment:
# pattern: 'data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt'
# description: 'Infer layout and strandedness of experiment'
#
#
# Structure 3 (nested 2 deep):
#
# patterns_by_peaks:
# peaks:
# macs2:
# pattern: 'data/chipseq_peaks/macs2/{macs2_run}/peaks.bed'
# description: 'BED file of peaks from macs2 peak-caller'
#
#
# Our job here is to create three objects:
#
# patterns = {
# 'salmon': 'data/rnaseq_samples/{sample}/{sample}.salmon/{sample}_quant.sf',
# 'rseqc': {
# 'bam_stat': 'data/rnaseq_samples/{sample}/rseqc/{sample}_bam_stat.txt',
# 'infer_experiment': 'data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt',
# },
# 'patterns_by_peaks': {
# 'peaks': {
# 'macs2': 'data/chipseq_peaks/macs2/{macs2_run}/peaks.bed'
# }
# }
# }
#
# rst_files = {
# 'salmon': 'reports/data/rnaseq_samples/sample/sample.salmon/sample_quant.sf.rst',
# 'rseqc': {
# 'bam_stat': 'reports/data/rnaseq_samples/sample/rseqc/sample_bam_stat.txt.rst',
# 'infer_experiment': 'reports/data/rnaseq_samples/sample/rseqc/sample_infer_experiment.txt.rst',
# },
# 'patterns_by_peaks': {
# 'peaks': {
# 'macs2': 'reports/data/chipseq_peaks/macs2/macs2_run/peaks.bed.rst'
# }
# }
#
# descriptions = {
# 'salmon': 'Transcript quantification using Salmon',
# 'rseqc': {
# 'bam_stat': 'RNAseq quality control analysis',
# 'infer_experiment': 'Infer layout and strandedness of experiment',
# },
# 'patterns_by_peaks': {
# 'peaks': {
# 'macs2': 'BED file of peaks from macs2 peak-caller'
# }
# }
# }
#
#


loaded_patterns = yaml.load(open(patterns), Loader=yaml.FullLoader)

self._loaded_patterns = loaded_patterns
self.patterns = utils.extract_nested(loaded_patterns, "pattern")
self.descriptions = utils.extract_nested(loaded_patterns, "description")
self.rst_files = utils.map_nested_dicts(self.patterns, utils.pattern_to_rst_file)
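# As a concrete sketch, the three calls above behave like this on the
# Structure 2 example (illustrative values, doctest-style):
#
# >>> loaded = {'rseqc': {'bam_stat': {
# ...     'pattern': 'data/{sample}.txt', 'description': 'QC'}}}
# >>> utils.extract_nested(loaded, 'pattern')
# {'rseqc': {'bam_stat': 'data/{sample}.txt'}}
# >>> utils.extract_nested(loaded, 'description')
# {'rseqc': {'bam_stat': 'QC'}}
# >>> utils.map_nested_dicts(
# ...     utils.extract_nested(loaded, 'pattern'), utils.pattern_to_rst_file)
# {'rseqc': {'bam_stat': 'reports/data/sample.txt.rst'}}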

self.is_paired = helpers.detect_layout(self.sampletable) == "PE"
if self.is_paired:
self.n = [1, 2]
else:
@@ -92,9 +171,14 @@ def __init__(self, config, patterns, workdir=None):
SeqConfig.__init__(self, config, patterns, workdir)

self.fill = dict(sample=self.samples, n=self.n)

# The merged bigwigs have different placeholders and therefore must be
# filled separately. They should also only be included if merged_bigwigs
# has been configured.
self.patterns_by_aggregation = self.patterns.pop('patterns_by_aggregate', None)
self.targets = helpers.fill_patterns(self.patterns, self.fill, zip)


# Then the aggregation
if self.patterns_by_aggregation is not None and 'merged_bigwigs' in self.config:
self.fill_by_aggregation = dict(
@@ -107,6 +191,9 @@ def __init__(self, config, patterns, workdir=None):
self.targets.update(self.targets_by_aggregation)
self.patterns.update(self.patterns_by_aggregation)

self.rst_files.update(self.rst_files.pop('patterns_by_aggregate'))
self.descriptions.update(self.descriptions.pop('patterns_by_aggregate'))


class ChIPSeqConfig(SeqConfig):
def __init__(self, config, patterns, workdir=None):
@@ -156,6 +243,8 @@ def __init__(self, config, patterns, workdir=None):

self.targets.update(self.targets_by_sample)
self.patterns.update(self.patterns_by_sample)
self.descriptions.update(self.descriptions['patterns_by_sample'])
self.rst_files.update(self.rst_files['patterns_by_sample'])

# Then the aggregation...
self.patterns_by_aggregation = self.patterns.pop('patterns_by_aggregate', None)
@@ -170,6 +259,9 @@ def __init__(self, config, patterns, workdir=None):
self.targets.update(self.targets_by_aggregation)
self.patterns.update(self.patterns_by_aggregation)

self.rst_files.update(self.rst_files.pop('patterns_by_aggregate'))
self.descriptions.update(self.descriptions.pop('patterns_by_aggregate'))

# Then the peaks...
#
# Note: when adding support for new peak callers, add them here.
@@ -225,3 +317,5 @@ def __init__(self, config, patterns, workdir=None):

self.targets.update(self.targets_for_peaks)
self.patterns.update(self.patterns_by_peaks)
self.descriptions.update(self.descriptions['patterns_by_peaks'])
self.rst_files.update(self.rst_files['patterns_by_peaks'])
172 changes: 155 additions & 17 deletions lib/utils.py
@@ -1,8 +1,21 @@
import os
import contextlib
import collections
from collections.abc import Iterable
from collections.abc import Iterable, Mapping
from snakemake.shell import shell
from snakemake.io import expand


def wrapper_for(path):
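"""
Return a 'file:' path to a wrapper directory (resolved relative to the
workflow; the exact layout is assumed from the '../..' components).
"""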
return 'file:' + os.path.join('../..', 'wrappers', 'wrappers', path)


def render_r1_r2(pattern):
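"""
Fill the {n} wildcard with both 1 and 2 (R1 and R2), leaving the
{sample} wildcard as a placeholder. Returns a list of two patterns.
"""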
return expand(pattern, sample='{sample}', n=[1,2])


def r1_only(pattern):
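"""
Fill the {n} wildcard with 1 only (R1), leaving the {sample} wildcard
as a placeholder.
"""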
return expand(pattern, sample='{sample}', n=1)


@contextlib.contextmanager
@@ -50,24 +63,130 @@ def gen():
return results


def test_flatten():
assert sorted(flatten({
'a': {
'b': {
'c': ['a', 'b', 'c'],
},
},
'x': ['e', 'f', 'g'],
'y': {
'z': 'd'
},
})) == ['a', 'b', 'c', 'd', 'e', 'f', 'g']

assert flatten('a', True) == 'a'
assert flatten(['a'], True) == 'a'
assert flatten('a') == ['a']
assert flatten(['a']) == ['a']


def map_nested_dicts(d, func, stop_condition=None):
"""
Apply `func` to all values of a nested dictionary.

Parameters
----------
d : dict

func : callable
Function to apply to values of d, or, if `stop_condition` is provided,
function to apply to remainder when `stop_condition(d)` is True.

stop_condition : callable
Mechanism for stopping recursion at a particular level and sending the
results at that point to `func`. Function should accept a dict as its
only input argument.

Examples
--------

Convert leaf values into boolean indicating if they are less than three:

>>> d = {'a': {'b': {'target': 1, 'nontarget': 2}}, 'c': {'target': 3}}
>>> res = map_nested_dicts(d, lambda x: x < 3)
>>> assert res == {'a': {'b': {'target': True, 'nontarget': True}}, 'c': {'target': False}}


The helper below sums the values of a dict:

>>> def sum_values(x):
... if isinstance(x, Mapping):
... return sum(x.values())

Since we don't specify a stopping condition that would send a dict to
`sum_values`, only the leaf integers get sent, and `sum_values` returns
None for them:

>>> res = map_nested_dicts(d, sum_values)
>>> assert res == {'a': {'b': {'target': None, 'nontarget': None}}, 'c': {'target': None}}, res

Here the stopping condition is whether "target" is in the keys; each dict
for which that is true is sent to `sum_values`:

>>> def stop1(x):
... return isinstance(x, Mapping) and 'target' in x.keys()

>>> res = map_nested_dicts(d, sum_values, stop_condition=stop1)
>>> assert res == {'a': {'b': 3}, 'c': 3}, res


Now if we only send dicts with "nontarget" in the keys, values in `b` are
summed but values in `c` are not, because nothing there satisfies the
stopping condition:

>>> def stop2(x):
... return isinstance(x, Mapping) and 'nontarget' in x.keys()

>>> res = map_nested_dicts(d, sum_values, stop_condition=stop2)
>>> assert res == {'a': {'b': 3}, 'c': {'target': None}}, res

"""
if stop_condition and stop_condition(d):
return func(d)
if isinstance(d, Mapping):
return {k: map_nested_dicts(v, func, stop_condition) for k, v in d.items()}
else:
return func(d)



def extract_nested(d, key):
"""
From a nested dict, keep all nesting the same EXCEPT for the leaf dicts,
from which only the provided key will be returned.

Parameters
----------
d : dict

key : str or hashable type
Key to extract. Effectively collapses leaf dictionaries containing this
key into just the value.

Examples
--------

>>> d = {'a': {'b': {'target': 1, 'ignore': 2}}, 'c': {'target': 3}}
>>> result = extract_nested(d, 'target')
>>> assert result == {'a': {'b': 1}, 'c': 3}, result
"""
if not isinstance(d, Mapping):
return d
if key in d:
return d[key]
else:
return {k: extract_nested(v, key) for k, v in d.items()}


def pattern_to_rst_file(p):
"""
Convert filename pattern containing wildcards into an RST filename
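
Example:

>>> pattern_to_rst_file('data/{sample}/{sample}.salmon/{sample}_quant.sf')
'reports/data/sample/sample.salmon/sample_quant.sf.rst'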
"""
return os.path.join("reports", p.replace("{", "").replace("}", "")) + ".rst"


def write_out_rsts(full_patterns):
"""
Given the full patterns dictionary (containing patterns and descriptions),
write out a corresponding rst file for each pattern, containing the
contents of its description.

Returns None; the side effect is to create all the necessary rst files.
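
Example (illustrative values; files are written as a side effect):

>>> full = {'salmon': {'pattern': 'data/{sample}.sf',
...         'description': 'Transcript quantification'}}
>>> write_out_rsts(full)  # doctest: +SKIP

This would create reports/data/sample.sf.rst containing the description
text.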
"""
def stop_condition(x):
return isinstance(x, Mapping) and 'pattern' in x and 'description' in x

def writer(x):
rst = pattern_to_rst_file(x['pattern'])
desc = x['description']
# Ensure the destination directory under reports/ exists before writing
os.makedirs(os.path.dirname(rst), exist_ok=True)
with open(rst, 'w') as fout:
fout.write(desc + '\n')

map_nested_dicts(full_patterns, func=writer, stop_condition=stop_condition)

def updatecopy(orig, update_with, keys=None, override=False):
"""
@@ -186,3 +305,22 @@ def make_relative_symlink(target, linkname):
if not os.path.exists(linkdir):
shell('mkdir -p {linkdir}')
shell('cd {linkdir}; ln -sf {relative_target} {linkbase}')


def test_flatten():
assert sorted(flatten({
'a': {
'b': {
'c': ['a', 'b', 'c'],
},
},
'x': ['e', 'f', 'g'],
'y': {
'z': 'd'
},
})) == ['a', 'b', 'c', 'd', 'e', 'f', 'g']

assert flatten('a', True) == 'a'
assert flatten(['a'], True) == 'a'
assert flatten('a') == ['a']
assert flatten(['a']) == ['a']