[WIP] add reports to rnaseq-wf #210

Draft · wants to merge 17 commits into base: v1.5rc
2 changes: 2 additions & 0 deletions .circleci/config.yml
@@ -74,6 +74,7 @@ variables:
cd workflows/chipseq
source activate lcdb-wf-test
./run_test.sh --use-conda -j2 -k -p -r
./run_test.sh --report
python chipseq_trackhub.py config/config.yaml config/hub_config.yaml

chipseq-regression-step: &chipseq-regression-step
@@ -103,6 +104,7 @@ variables:
cd workflows/rnaseq
source activate lcdb-wf-test
./run_test.sh --use-conda -j2 -k -p -r
./run_test.sh --report
python rnaseq_trackhub.py config/config.yaml config/hub_config.yaml

rnaseq-star-step: &rnaseq-star-step
1 change: 1 addition & 0 deletions include/WRAPPER_SLURM
@@ -21,6 +21,7 @@ if [[ ! -e logs ]]; then mkdir -p logs; fi
--use-conda \
--configfile config/config.yaml \
--latency-wait=300 \
--report report.html
) > "Snakefile.log" 2>&1

SNAKE_PID=$!
98 changes: 96 additions & 2 deletions lib/patterns_targets.py
@@ -9,6 +9,7 @@
from . import common
from . import chipseq
from . import helpers
from . import utils

HERE = os.path.abspath(os.path.dirname(__file__))

@@ -61,8 +62,86 @@ def __init__(self, config, patterns, workdir=None):
self.samples, self.sampletable = common.get_sampletable(self.config)
self.refdict, self.conversion_kwargs = common.references_dict(self.config)
self.organism = self.config['organism']
self.patterns = yaml.load(open(patterns), Loader=yaml.FullLoader)
self.is_paired = helpers.detect_layout(self.sampletable) == 'PE'

# Patterns have one of three structures:
#
# Structure 1 (simple):
#
# salmon:
# pattern: 'data/rnaseq_samples/{sample}/{sample}.salmon/{sample}_quant.sf'
# description: 'Transcript quantification using Salmon'
#
# Structure 2 (nested 1 deep):
#
# rseqc:
# bam_stat:
# pattern: 'data/rnaseq_samples/{sample}/rseqc/{sample}_bam_stat.txt'
# description: 'RNAseq quality control analysis'
# infer_experiment:
# pattern: 'data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt'
# description: 'Infer layout and strandedness of experiment'
#
#
# Structure 3 (nested 2 deep):
#
# patterns_by_peaks:
# peaks:
# macs2:
# pattern: 'data/chipseq_peaks/macs2/{macs2_run}/peaks.bed'
# description: 'BED file of peaks from macs2 peak-caller'
#
#
# Our job here is to create three objects:
#
# patterns = {
# 'salmon': 'data/rnaseq_samples/{sample}/{sample}.salmon/{sample}_quant.sf',
# 'rseqc': {
# 'bam_stat': 'data/rnaseq_samples/{sample}/rseqc/{sample}_bam_stat.txt',
# 'infer_experiment': 'data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt',
# },
# 'patterns_by_peaks': {
# 'peaks': {
# 'macs2': 'data/chipseq_peaks/macs2/{macs2_run}/peaks.bed'
# }
# }
# }
#
# rst_files = {
# 'salmon': 'reports/data/rnaseq_samples/sample/sample.salmon/sample_quant.sf.rst',
# 'rseqc': {
# 'bam_stat': 'reports/data/rnaseq_samples/sample/rseqc/sample_bam_stat.txt.rst',
# 'infer_experiment': 'reports/data/rnaseq_samples/sample/rseqc/sample_infer_experiment.txt.rst',
# },
# 'patterns_by_peaks': {
# 'peaks': {
# 'macs2': 'reports/data/chipseq_peaks/macs2/macs2_run/peaks.bed.rst'
# }
# }
#
# descriptions = {
# 'salmon': 'Transcript quantification using Salmon',
# 'rseqc': {
# 'bam_stat': 'RNAseq quality control analysis',
# 'infer_experiment': 'Infer layout and strandedness of experiment',
# },
# 'patterns_by_peaks': {
# 'peaks': {
# 'macs2': 'BED file of peaks from macs2 peak-caller'
# }
# }
# }
#
#


loaded_patterns = yaml.load(open(patterns), Loader=yaml.FullLoader)

self._loaded_patterns = loaded_patterns
self.patterns = utils.extract_nested(loaded_patterns, "pattern")
self.descriptions = utils.extract_nested(loaded_patterns, "description")
self.rst_files = utils.map_nested_dicts(self.patterns, utils.pattern_to_rst_file)
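# As a concrete sketch, the three calls above behave like this on the
# Structure 2 example (illustrative values, doctest-style):
#
# >>> loaded = {'rseqc': {'bam_stat': {
# ...     'pattern': 'data/{sample}.txt', 'description': 'QC'}}}
# >>> utils.extract_nested(loaded, 'pattern')
# {'rseqc': {'bam_stat': 'data/{sample}.txt'}}
# >>> utils.extract_nested(loaded, 'description')
# {'rseqc': {'bam_stat': 'QC'}}
# >>> utils.map_nested_dicts(
# ...     utils.extract_nested(loaded, 'pattern'), utils.pattern_to_rst_file)
# {'rseqc': {'bam_stat': 'reports/data/sample.txt.rst'}}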

self.is_paired = helpers.detect_layout(self.sampletable) == "PE"
if self.is_paired:
self.n = [1, 2]
else:
@@ -92,9 +171,14 @@ def __init__(self, config, patterns, workdir=None):
SeqConfig.__init__(self, config, patterns, workdir)

self.fill = dict(sample=self.samples, n=self.n)

# The merged bigwigs have different placeholders and therefore must be
# filled separately. They should also only be included if merged_bigwigs
# has been configured.
self.patterns_by_aggregation = self.patterns.pop('patterns_by_aggregate', None)
self.targets = helpers.fill_patterns(self.patterns, self.fill, zip)


# Then the aggregation
if self.patterns_by_aggregation is not None and 'merged_bigwigs' in self.config:
self.fill_by_aggregation = dict(
@@ -107,6 +191,9 @@ def __init__(self, config, patterns, workdir=None):
self.targets.update(self.targets_by_aggregation)
self.patterns.update(self.patterns_by_aggregation)

self.rst_files.update(self.rst_files.pop('patterns_by_aggregate'))
self.descriptions.update(self.descriptions.pop('patterns_by_aggregate'))


class ChIPSeqConfig(SeqConfig):
def __init__(self, config, patterns, workdir=None):
@@ -156,6 +243,8 @@ def __init__(self, config, patterns, workdir=None):

self.targets.update(self.targets_by_sample)
self.patterns.update(self.patterns_by_sample)
self.descriptions.update(self.descriptions['patterns_by_sample'])
self.rst_files.update(self.rst_files['patterns_by_sample'])

# Then the aggregation...
self.patterns_by_aggregation = self.patterns.pop('patterns_by_aggregate', None)
@@ -170,6 +259,9 @@ def __init__(self, config, patterns, workdir=None):
self.targets.update(self.targets_by_aggregation)
self.patterns.update(self.patterns_by_aggregation)

self.rst_files.update(self.rst_files.pop('patterns_by_aggregate'))
self.descriptions.update(self.descriptions.pop('patterns_by_aggregate'))

# Then the peaks...
#
# Note: when adding support for new peak callers, add them here.
@@ -225,3 +317,5 @@ def __init__(self, config, patterns, workdir=None):

self.targets.update(self.targets_for_peaks)
self.patterns.update(self.patterns_by_peaks)
self.descriptions.update(self.descriptions['patterns_by_peaks'])
self.rst_files.update(self.rst_files['patterns_by_peaks'])
172 changes: 155 additions & 17 deletions lib/utils.py
@@ -1,8 +1,21 @@
import os
import contextlib
import collections
from collections.abc import Iterable
from collections.abc import Iterable, Mapping
from snakemake.shell import shell
from snakemake.io import expand


def wrapper_for(path):
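"""
Return a 'file:' path to a wrapper directory (resolved relative to the
workflow; the exact layout is assumed from the '../..' components).
"""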
return 'file:' + os.path.join('../..', 'wrappers', 'wrappers', path)


def render_r1_r2(pattern):
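"""
Fill the {n} wildcard with both 1 and 2 (R1 and R2), leaving the
{sample} wildcard as a placeholder. Returns a list of two patterns.
"""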
return expand(pattern, sample='{sample}', n=[1,2])


def r1_only(pattern):
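"""
Fill the {n} wildcard with 1 only (R1), leaving the {sample} wildcard
as a placeholder.
"""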
return expand(pattern, sample='{sample}', n=1)


@contextlib.contextmanager
@@ -50,24 +63,130 @@ def gen():
return results


def test_flatten():
assert sorted(flatten({
'a': {
'b': {
'c': ['a', 'b', 'c'],
},
},
'x': ['e', 'f', 'g'],
'y': {
'z': 'd'
},
})) == ['a', 'b', 'c', 'd', 'e', 'f', 'g']

assert flatten('a', True) == 'a'
assert flatten(['a'], True) == 'a'
assert flatten('a') == ['a']
assert flatten(['a']) == ['a']


def map_nested_dicts(d, func, stop_condition=None):
"""
Apply `func` to all values of a nested dictionary.

Parameters
----------
d : dict

func : callable
Function to apply to values of d, or, if `stop_condition` is provided,
function to apply to remainder when `stop_condition(d)` is True.

stop_condition : callable
Mechanism for stopping recursion at a particular level and sending the
results at that point to `func`. Function should accept a dict as its
only input argument.

Examples
--------

Convert leaf values into boolean indicating if they are less than three:

>>> d = {'a': {'b': {'target': 1, 'nontarget': 2}}, 'c': {'target': 3}}
>>> res = map_nested_dicts(d, lambda x: x < 3)
>>> assert res == {'a': {'b': {'target': True, 'nontarget': True}}, 'c': {'target': False}}


The helper below sums the values of a dict:

>>> def sum_values(x):
... if isinstance(x, Mapping):
... return sum(x.values())

Since we don't specify a stopping condition that would send a dict to
`sum_values`, only the leaf integers get sent, and `sum_values` returns
None for them:

>>> res = map_nested_dicts(d, sum_values)
>>> assert res == {'a': {'b': {'target': None, 'nontarget': None}}, 'c': {'target': None}}, res

Here the stopping condition is whether "target" is in the keys; each dict
for which that is true is sent to `sum_values`:

>>> def stop1(x):
... return isinstance(x, Mapping) and 'target' in x.keys()

>>> res = map_nested_dicts(d, sum_values, stop_condition=stop1)
>>> assert res == {'a': {'b': 3}, 'c': 3}, res


Now if we only send dicts with "nontarget" in the keys, values in `b` are
summed but values in `c` are not, because nothing there satisfies the
stopping condition:

>>> def stop2(x):
... return isinstance(x, Mapping) and 'nontarget' in x.keys()

>>> res = map_nested_dicts(d, sum_values, stop_condition=stop2)
>>> assert res == {'a': {'b': 3}, 'c': {'target': None}}, res

"""
if stop_condition and stop_condition(d):
return func(d)
if isinstance(d, Mapping):
return {k: map_nested_dicts(v, func, stop_condition) for k, v in d.items()}
else:
return func(d)



def extract_nested(d, key):
"""
From a nested dict, keep all nesting the same EXCEPT for the leaf dicts,
from which only the provided key will be returned.

Parameters
----------
d : dict

key : str or hashable type
Key to extract. Effectively collapses leaf dictionaries containing this
key into just the value.

Examples
--------

>>> d = {'a': {'b': {'target': 1, 'ignore': 2}}, 'c': {'target': 3}}
>>> result = extract_nested(d, 'target')
>>> assert result == {'a': {'b': 1}, 'c': 3}, result
"""
if not isinstance(d, Mapping):
return d
if key in d:
return d[key]
else:
return {k: extract_nested(v, key) for k, v in d.items()}


def pattern_to_rst_file(p):
"""
Convert filename pattern containing wildcards into an RST filename
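
Example:

>>> pattern_to_rst_file('data/{sample}/{sample}.salmon/{sample}_quant.sf')
'reports/data/sample/sample.salmon/sample_quant.sf.rst'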
"""
return os.path.join("reports", p.replace("{", "").replace("}", "")) + ".rst"


def write_out_rsts(full_patterns):
"""
Given the full patterns dictionary (containing patterns and descriptions),
write out a corresponding rst file for each pattern, containing the
contents of its description.

Returns None; the side effect is to create all the necessary rst files.
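
Example (illustrative values; files are written as a side effect):

>>> full = {'salmon': {'pattern': 'data/{sample}.sf',
...         'description': 'Transcript quantification'}}
>>> write_out_rsts(full)  # doctest: +SKIP

This would create reports/data/sample.sf.rst containing the description
text.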
"""
def stop_condition(x):
return isinstance(x, Mapping) and 'pattern' in x and 'description' in x

def writer(x):
rst = pattern_to_rst_file(x['pattern'])
desc = x['description']
# Ensure the destination directory under reports/ exists before writing
os.makedirs(os.path.dirname(rst), exist_ok=True)
with open(rst, 'w') as fout:
fout.write(desc + '\n')

map_nested_dicts(full_patterns, func=writer, stop_condition=stop_condition)

def updatecopy(orig, update_with, keys=None, override=False):
"""
@@ -186,3 +305,22 @@ def make_relative_symlink(target, linkname):
if not os.path.exists(linkdir):
shell('mkdir -p {linkdir}')
shell('cd {linkdir}; ln -sf {relative_target} {linkbase}')


def test_flatten():
assert sorted(flatten({
'a': {
'b': {
'c': ['a', 'b', 'c'],
},
},
'x': ['e', 'f', 'g'],
'y': {
'z': 'd'
},
})) == ['a', 'b', 'c', 'd', 'e', 'f', 'g']

assert flatten('a', True) == 'a'
assert flatten(['a'], True) == 'a'
assert flatten('a') == ['a']
assert flatten(['a']) == ['a']