diff --git a/.github/workflows/publish-to-test-pypi.yml b/.github/workflows/CD.yml similarity index 73% rename from .github/workflows/publish-to-test-pypi.yml rename to .github/workflows/CD.yml index 456a3ec3..0f1d8ac3 100644 --- a/.github/workflows/publish-to-test-pypi.yml +++ b/.github/workflows/CD.yml @@ -1,21 +1,21 @@ -name: Publish Python 🐍 distributions 📦 to PyPI and TestPyPI +name: CapCruncher CD -on: +on: push: branches: - master - develop - + jobs: build-n-publish: name: Build and publish Python 🐍 distributions 📦 to PyPI and TestPyPI - runs-on: ubuntu-18.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@master - - name: Set up Python 3.9 + - name: Set up Python 3.10 uses: actions/setup-python@v1 with: - python-version: 3.9 + python-version: "3.10" - name: Install pypa/build run: >- python -m @@ -31,7 +31,7 @@ jobs: --outdir dist/ . - name: Publish distribution 📦 to Test PyPI - uses: pypa/gh-action-pypi-publish@master + uses: pypa/gh-action-pypi-publish@v1.8.8 with: password: ${{ secrets.TEST_PYPI_API_TOKEN }} repository_url: https://test.pypi.org/legacy/ @@ -39,6 +39,6 @@ jobs: - name: Publish distribution 📦 to PyPI if: startsWith(github.ref, 'refs/tags') - uses: pypa/gh-action-pypi-publish@master + uses: pypa/gh-action-pypi-publish@v1.8.8 with: - password: ${{ secrets.PYPI_API_TOKEN }} \ No newline at end of file + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml new file mode 100644 index 00000000..d55cd88a --- /dev/null +++ b/.github/workflows/CI.yml @@ -0,0 +1,86 @@ +name: CapCruncher CI + +on: [push] + +env: + CACHE_NUMBER: 1 + +jobs: + install-and-test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10"] + + steps: + - uses: actions/checkout@v2 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Install OS dependencies + run: | + sudo apt-get update + sudo apt-get install libcurl4-openssl-dev + + - name: Restore bowtie2 cache + uses: actions/cache@v2 + with: + path: tests/data/data_for_pipeline_run/chr14_bowtie2_indicies/*.bt2 + key: ${{ env.CACHE_NUMBER }} + id: bowtie2_cache + + - name: Get Date + id: get-date + run: echo "today=$(/bin/date -u '+%Y%m%d')" >> $GITHUB_OUTPUT + shell: bash + + - name: Setup Mambaforge + uses: conda-incubator/setup-miniconda@v2 + with: + miniforge-variant: Mambaforge + miniforge-version: latest + activate-environment: test + use-mamba: true + + - name: Cache Conda env + uses: actions/cache@v2 + with: + path: ${{ env.CONDA }}/envs + key: conda-${{ runner.os }}--${{ runner.arch }}--${{ + steps.get-date.outputs.today }}-${{ + hashFiles('environment.yml') }}-${{ env.CACHE_NUMBER}} + env: + # Increase this value to reset cache if etc/example-environment.yml has not changed + CACHE_NUMBER: 0 + id: cache-conda-env + + - name: Update environment + run: mamba env update -n test -f environment.yml + if: steps.cache-conda-env.outputs.cache-hit != 'true' + + - name: Check installed packages + shell: bash -el {0} + run: | + mamba info + + - name: Install the package + shell: bash -el {0} + run: | + pip install .[stats,plotting,experimental] + + - name: Test with pytest and generate report + shell: bash -el {0} + run: | + pip install pytest-cov pytest-lazy-fixture pytest-order pytest-xdist + pytest -vv -s --log-cli-level info --cov=./ --cov-report=xml --cores 2 + + - name: Upload Coverage to Codecov + uses: codecov/codecov-action@v3 + with: + env_vars: OS,PYTHON + 
fail_ci_if_error: true + files: ./coverage.xml + verbose: true diff --git a/.github/workflows/python-template.yml b/.github/workflows/python-template.yml deleted file mode 100644 index ce948cdb..00000000 --- a/.github/workflows/python-template.yml +++ /dev/null @@ -1,77 +0,0 @@ -name: Python package - -on: [push] - -env: - CACHE_NUMBER: 0 - -jobs: - build: - - runs-on: ubuntu-latest - strategy: - matrix: - python-version: [3.8] - - steps: - - - uses: actions/checkout@v2 - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - name: Install OS dependencies - run: | - sudo apt-get update - sudo apt-get install libcurl4-openssl-dev - - - name: Lint with flake8 - run: | - pip install flake8 - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - - name: Restore bowtie2 cache - uses: actions/cache@v2 - with: - path: tests/data/data_for_pipeline_run/chr14_bowtie2_indicies/ - key: ${{ hashFiles('tests/data/data_for_pipeline_run/chr14.fa.gz') }}-${{ env.CACHE_NUMBER }} - id: bowtie2_cache - - - name: Set up conda using miniforge - uses: conda-incubator/setup-miniconda@v2 - with: - environment-file: environment.yml - miniforge-version: latest - miniforge-variant: Mambaforge - python-version: ${{ matrix.python-version }} - use-mamba: true - - - name: Install testing dependencies - shell: bash -l {0} - run: | - mamba env update -n test -f testing.yml - - - name: Check installed packages - shell: bash -l {0} - run: | - mamba info - - - name: Install the package - shell: bash -l {0} - run: | - pip install .[plotting] - - - name: Test with pytest and generate report - shell: bash -l {0} - run: | - pytest -vv -s --log-cli-level debug --cov=./ --cov-report=xml - - - name: Upload Coverage to Codecov - uses: codecov/codecov-action@v1 - with: - verbose: true # optional (default = false) diff --git a/.gitignore b/.gitignore index a667e2fe..ea6ad319 100644 --- a/.gitignore +++ b/.gitignore @@ -1,15 +1,23 @@ -*.pyc -*.json +_build/ +.DS_Store +.env +.ipynb_checkpoints +.pytest_cache .vscode +*.bed.bgz +*.bt2 +*.ipynb +*.pyc **.vscode -.pytest_cache -.env -docs/_build/ -dist/ build/ capcruncher.egg-info/ +dist/ +docs/_build/ pkgs/ -_build/ -.DS_Store -*.bt2 -tests/data/data_for_pipeline_run/mm9_chr14_genes.bed.bgz +*.tar.gz +*.bed.bgz +_version.py +*.log +cliff.toml +.coverage.* +coverage.xml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..214d336d --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,36 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 # Use the ref you want to point at + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-case-conflict + - id: check-merge-conflict + - id: check-symlinks + - id: check-json + - id: debug-statements + - id: check-yaml + - id: check-added-large-files + + - repo: https://github.com/charliermarsh/ruff-pre-commit + # Ruff version. 
+ rev: 'v0.0.206' + hooks: + - id: ruff + args: ["--force-exclude", "--ignore", "E501", "--ignore", "E402", "--ignore", "F401", "--fix"] + + - repo: https://github.com/psf/black + rev: 22.12.0 + hooks: + - id: black + # It is recommended to specify the latest version of Python + # supported by your project here, or alternatively use + # pre-commit's default_language_version, see + # https://pre-commit.com/#top_level-default_language_version + language_version: python3.10 + + - repo: https://github.com/snakemake/snakefmt + rev: v0.8.0 # Replace by any tag/version ≥0.2.4 : https://github.com/snakemake/snakefmt/releases + hooks: + - id: snakefmt + args: [] diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 2500911f..b6d9e9e7 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -20,4 +20,3 @@ python: - requirements: docs/requirements.txt - method: pip path: . - diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index de75a469..00000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,99 +0,0 @@ -# Changelog - -All notable changes to this project will be documented in this file. - -## [0.2.3] - 2022-08-19 - -### Bug Fixes - -- Fixes Tiled-C filtering error due to typo in index specification for remove_dual_capture_fragments -- Fixed bug when annotating Tiled-C data ([#166](https://github.com/sims-lab/CapCruncher/pull/166)) with the sorted option that caused no data to be annotated as as a viewpoint or exclusion. -- Fixed a bug with Tiled-C slice filtering ([#166](https://github.com/sims-lab/CapCruncher/pull/166)) that caused slices to be erronously filtered. -- Fixed a bug with counting reporters in batches (this occurs when counting >1x106 slices per viewpoint) ([#166](https://github.com/sims-lab/CapCruncher/pull/166)) -- Fixed a bug when merging and storing Tri-C or Tiled-C data ([#166](https://github.com/sims-lab/CapCruncher/pull/166)) using "capcruncher reporters store merge" or "capcruncher reporters store bins". This functionally has been re-written and now appears to work correctly. -- Fixed a bug with plotting matrices using capcrunchers plotting capabilities ([#166](https://github.com/sims-lab/CapCruncher/pull/166)). -- Fixes bug where all slices are removed from parquet files (reporter) outputs due to an upgraded dask version (the pyarrow-dataset engine has been removed). This corrects the all dataframes are empty error occurring while generating reporter statistics. - -### Features - -- Added option to normalised CCMatrix data using ice followed by a scaling factor. -- Reporter merging (paraquet) has been re-written to use pyarrow directly and is now faster and better able to split datasets into smaller files for more efficient parallel querying. - - -### Miscellaneous Tasks - -- Updated version to 0.2.3 ([#165](https://github.com/sims-lab/CapCruncher/pull/165)) -- Pin conda dependency versions to speed up environment solver ([#167](https://github.com/sims-lab/CapCruncher/pull/167)) - -## [0.2.2] - 2022-03-03 - -### Bug Fixes - -- Fixes error when generating a report for a single sample([#149](https://github.com/sims-lab/CapCruncher/pull/149)) -- Prevent cluster errors due to lack of allocated memory. 
([#153](https://github.com/sims-lab/CapCruncher/pull/153)) -- Enabled max memory adjustment for alignment filtering ([#156](https://github.com/sims-lab/CapCruncher/pull/156)) -- Fix dask client shutdown related errors ([#154](https://github.com/sims-lab/CapCruncher/pull/154)) -- Fix issue with missing report text file ([#157](https://github.com/sims-lab/CapCruncher/pull/157)) - -### Features - -- Added CLI utility ti get viewpoint coordinates from capture oligos ([#144](https://github.com/sims-lab/CapCruncher/pull/144)) -- Enables split directly to gzip compressed fastq to reduce the disk space required ([#146](https://github.com/sims-lab/CapCruncher/pull/146)) -- Improve utility of the reporters-compare CLI module ([#145](https://github.com/sims-lab/CapCruncher/pull/145)) -- Reduced disk space required by fastq files ([#150](https://github.com/sims-lab/CapCruncher/pull/150)) -- Enabled biasing of duplicate removal towards trans reporters ([#151](https://github.com/sims-lab/CapCruncher/pull/151)) - -### Miscellaneous Tasks - -- Cache conda env to speed up workflow ([#143](https://github.com/sims-lab/CapCruncher/pull/143)) -- Publish development builds (test-pypi) and releases (pypi) ([#152](https://github.com/sims-lab/CapCruncher/pull/152)) -- Pre-release checks ([#159](https://github.com/sims-lab/CapCruncher/pull/159)) -- Prepare for release 0.2.2 - -### Build - -- Add issue and feature request templates ([#147](https://github.com/sims-lab/CapCruncher/pull/147)) - -## [0.2.0] - 2022-02-08 - -### Bug Fixes - -- Fixed packaging long description. ([#115](https://github.com/sims-lab/CapCruncher/pull/115)) -- Added missing dependencies (seaborn and trackhub) to setup.cfg -- Fixed bug after updating pyyaml to latest version ([#122](https://github.com/sims-lab/CapCruncher/pull/122)) -- Re-partition reporter slices after filtering ([#124](https://github.com/sims-lab/CapCruncher/pull/124)) -- Fixed help option not being displayed when running capcruncher pipeline ([#129](https://github.com/sims-lab/CapCruncher/pull/129)) -- Fix issue with tasks going over their allotted number of cores. ([#133](https://github.com/sims-lab/CapCruncher/pull/133)) -- Fixes error during deduplication when using gzip compression ([#134](https://github.com/sims-lab/CapCruncher/pull/134)) -- Prevents excessive memory usage during duplicate removal ([#136](https://github.com/sims-lab/CapCruncher/pull/136)) -- Fix link common cooler tables ([#137](https://github.com/sims-lab/CapCruncher/pull/137)) -- Fixed an issue when no data exists for a viewpoint for a given sample ([#139](https://github.com/sims-lab/CapCruncher/pull/139)) - -### Co-authored-by - -- Kevin Rue-Albrecht -- Alsmith151 <49727900+alsmith151@users.noreply.github.com> -- Alsmith151 <49727900+alsmith151@users.noreply.github.com> - -### Documentation - -- Update README ([#120](https://github.com/sims-lab/CapCruncher/pull/120)) - -### Features - -- Moved all configuration from setup.py to setup.cfg. ([#114](https://github.com/sims-lab/CapCruncher/pull/114)) -- Reporter counting is now performed in parallel on separate partitions before collating. ([#117](https://github.com/sims-lab/CapCruncher/pull/117)) -- Enables pileup normalisation using a set of regions supplied as a bed file ([#121](https://github.com/sims-lab/CapCruncher/pull/121)) -- Enabled the use of custom filtering orders ([#119](https://github.com/sims-lab/CapCruncher/pull/119)) -- Capability to normalise pileups (bedgraphs/bigwigs) by a set of supplied regions. 
([#125](https://github.com/sims-lab/CapCruncher/pull/125)) -- Expand the number of viewpoints that can be processed ([#128](https://github.com/sims-lab/CapCruncher/pull/128)) -- Enable optional compression during fastq split and deduplicate ([#131](https://github.com/sims-lab/CapCruncher/pull/131)) -- Reduces disk space required by pipeline by removing intermediate files ([#135](https://github.com/sims-lab/CapCruncher/pull/135)) -- Reduce disk space taken up by reporters (slices and counts) ([#138](https://github.com/sims-lab/CapCruncher/pull/138)) -- Improved report generation and fixed multiple report related bugs ([#132](https://github.com/sims-lab/CapCruncher/pull/132)) -- Revert without_cluster for reporter comparisons ([#140](https://github.com/sims-lab/CapCruncher/pull/140)) - -### REMOVED - -- Get_yaml.py and test_run_params.py -- Validate_annotations.py \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..682ddf4e --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,4 @@ +global-include *.smk +global-include Snakefile +global-include annotation_defaults.json +recursive-include capcruncher/pipeline/config * diff --git a/README.md b/README.md index 02ebd53f..621c9807 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ ## Analysis software for Capture-C, Tri-C and Tiled-C data. -CapCruncher is a tool designed to automate the processing of Capture-C, Tri-C and Tiled-C data from FASTQ files, the package is written in python and consists of an end-to-end data processing pipline together with a supporting command line interface to enable finer grained control. The pipeline provided is fast, robust and scales from a laptop to a computational cluster. +CapCruncher is a tool designed to automate the processing of Capture-C, Tri-C and Tiled-C data from FASTQ files, the package is written in python and consists of an end-to-end data processing pipline together with a supporting command line interface to enable finer grained control. The pipeline provided is fast, robust and scales from a laptop to a computational cluster. For further information see the [documentation](https://capcruncher.readthedocs.io/en/latest/) @@ -22,9 +22,9 @@ For further information see the [documentation](https://capcruncher.readthedocs. - Fixes Tiled-C filtering error due to typo in index specification for remove_dual_capture_fragments - Fixed bug when annotating Tiled-C data ([#166](https://github.com/sims-lab/CapCruncher/pull/166)) with the sorted option that caused no data to be annotated as as a viewpoint or exclusion. - Fixed a bug with Tiled-C slice filtering ([#166](https://github.com/sims-lab/CapCruncher/pull/166)) that caused slices to be erronously filtered. -- Fixed a bug with counting reporters in batches (this occurs when counting >1x106 slices per viewpoint) ([#166](https://github.com/sims-lab/CapCruncher/pull/166)) +- Fixed a bug with counting reporters in batches (this occurs when counting >1x106 slices per viewpoint) ([#166](https://github.com/sims-lab/CapCruncher/pull/166)) - Fixed a bug when merging and storing Tri-C or Tiled-C data ([#166](https://github.com/sims-lab/CapCruncher/pull/166)) using "capcruncher reporters store merge" or "capcruncher reporters store bins". This functionally has been re-written and now appears to work correctly. -- Fixed a bug with plotting matrices using capcrunchers plotting capabilities ([#166](https://github.com/sims-lab/CapCruncher/pull/166)). 
+- Fixed a bug with plotting matrices using capcrunchers plotting capabilities ([#166](https://github.com/sims-lab/CapCruncher/pull/166)). - Fixes bug where all slices are removed from parquet files (reporter) outputs due to an upgraded dask version (the pyarrow-dataset engine has been removed). This corrects the all dataframes are empty error occurring while generating reporter statistics. ### Features @@ -36,4 +36,4 @@ For further information see the [documentation](https://capcruncher.readthedocs. ### Miscellaneous Tasks - Updated version to 0.2.3 ([#165](https://github.com/sims-lab/CapCruncher/pull/165)) -- Pin conda dependency versions to speed up environment solver ([#167](https://github.com/sims-lab/CapCruncher/pull/167)) \ No newline at end of file +- Pin conda dependency versions to speed up environment solver ([#167](https://github.com/sims-lab/CapCruncher/pull/167)) diff --git a/capcruncher/__init__.py b/capcruncher/__init__.py index 90c6f79d..8efa5ae9 100644 --- a/capcruncher/__init__.py +++ b/capcruncher/__init__.py @@ -1 +1 @@ -from capcruncher import cli \ No newline at end of file +__all__ = ["api", "cli"] diff --git a/capcruncher/api/__init__.py b/capcruncher/api/__init__.py new file mode 100644 index 00000000..675ba0d7 --- /dev/null +++ b/capcruncher/api/__init__.py @@ -0,0 +1 @@ +__all__ = ["annotate", "count", "deduplicate", "digest", "filter", "io", "pileup", "plotting", "statistics", "storage"] diff --git a/capcruncher/api/annotate.py b/capcruncher/api/annotate.py new file mode 100644 index 00000000..50a5b45e --- /dev/null +++ b/capcruncher/api/annotate.py @@ -0,0 +1,96 @@ +import warnings +from typing import Union + +import pandas as pd +import pybedtools +import pyranges as pr +import ray +from loguru import logger + +from capcruncher.utils import convert_bed_to_pr + +warnings.simplefilter("ignore", category=RuntimeWarning) + + +@ray.remote +class BedFileIntersection: + def __init__( + self, + bed_a: Union[str, pybedtools.BedTool, pr.PyRanges], + bed_b: Union[str, pybedtools.BedTool, pr.PyRanges], + name: str = "b", + action: str = "get", + fraction: float = 1e-9, + ): + + self.a = bed_a + self.b = bed_b + self.name = name + self.action = action + self.fraction = fraction + + self.pr_a = convert_bed_to_pr(self.a, ignore_ray_objrefs=True) + + import logging + + logging.basicConfig(level=logging.INFO) + self.logger = logging.getLogger(__name__) + + def _get_intersection(self, pr_b: pr.PyRanges): + + intersection = ( + self.pr_a.join( + pr_b, + report_overlap=True, + ) + .assign("frac", lambda df: df.eval("Overlap / (End - Start)")) + .subset(lambda df: df["frac"] >= self.fraction) + .as_df() + ) + + dtype = pd.CategoricalDtype(pr_b.df["Name"].unique()) + + intersection_data = ( + intersection.set_index("Name")["Name_b"].astype(dtype).rename(self.name) + ) + + return intersection_data + + def _count_intersection(self, pr_b: pr.PyRanges): + + intersection_data = ( + self.pr_a.coverage(pr_b) + .df.query(f"NumberOverlaps > 0 and FractionOverlaps >= {self.fraction}") + .set_index("Name")["NumberOverlaps"] + .rename(self.name) + ) + + return intersection_data + + def intersection(self): + + try: + + pr_b = convert_bed_to_pr(self.b) + + if self.action == "get": + _intersection = self._get_intersection(pr_b) + elif self.action == "count": + _intersection = self._count_intersection(pr_b) + + except (OSError, IndexError, FileNotFoundError, StopIteration, AssertionError): + + self.logger.warning( + f"Could not intersect {self.b} using {self.action} method." 
+ ) + _intersection = pd.Series( + data=pd.NA, + index=self.pr_a.df["Name"], + name=self.name, + dtype=object, + ) + + return _intersection + + def __repr__(self): + return f"{self.name} intersection" diff --git a/capcruncher/tools/count.py b/capcruncher/api/count.py similarity index 80% rename from capcruncher/tools/count.py rename to capcruncher/api/count.py index 52f06e68..f856be3f 100644 --- a/capcruncher/tools/count.py +++ b/capcruncher/api/count.py @@ -1,62 +1,58 @@ - -from typing import Union -import pandas as pd -from collections import defaultdict import itertools -import xopen import os +from collections import defaultdict + +import pandas as pd +from loguru import logger from tqdm import tqdm -import logging def get_fragment_combinations(df: pd.DataFrame): - return [sorted(comb) for comb in itertools.combinations(df["restriction_fragment"], 2)] + return [ + sorted(comb) for comb in itertools.combinations(df["restriction_fragment"], 2) + ] def subsample_reporters_from_df(df: pd.DataFrame, subsample: float): - logging.info("Subsampling data") + logger.info("Subsampling data") if isinstance(subsample, float): subsample_options = {"frac": subsample} elif isinstance(subsample, int): subsample_options = {"n": subsample} # Generate a subsample of fragments and slice these from the reporter dataframe - df_reporters = df[ - df["parent_id"].isin(df["parent_id"].sample(**subsample_options)) - ] + df_reporters = df[df["parent_id"].isin(df["parent_id"].sample(**subsample_options))] return df_reporters def remove_exclusions_from_df(df: pd.DataFrame): - #TODO: remove this slight dtype hack + # TODO: remove this slight dtype hack df = df.astype({"viewpoint": str}) - #df.loc[:, "viewpoint"] = df.loc[:, "viewpoint"].astype(str) + # df.loc[:, "viewpoint"] = df.loc[:, "viewpoint"].astype(str) return df.query("viewpoint != exclusion") - def preprocess_reporters_for_counting(df: pd.DataFrame, **kwargs): # Need to remove any restiction fragments that are not in the digested genome df_reporters = df.query("restriction_fragment != -1") if kwargs.get("remove_exclusions"): - logging.info("Removing excluded regions") + logger.info("Removing excluded regions") df_reporters = remove_exclusions_from_df(df_reporters) # Remove the capture site if kwargs.get("remove_viewpoints"): - logging.info("Removing viewpoints") - df_reporters = df_reporters.query('capture_count == 0') + logger.info("Removing viewpoints") + df_reporters = df_reporters.query("capture_count == 0") # Subsample at the fragment level if kwargs.get("subsample"): df_reporters = subsample_reporters_from_df(df_reporters, kwargs["subsample"]) - - return df_reporters + return df_reporters def count_re_site_combinations( @@ -82,7 +78,9 @@ def count_re_site_combinations( # For each set of ligated fragments for ii, (group_name, frag) in enumerate(tqdm(groups)): - for rf1, rf2 in itertools.combinations(frag[column], 2): # Get fragment combinations + for rf1, rf2 in itertools.combinations( + frag[column], 2 + ): # Get fragment combinations # TODO: Notice a high amount of multicaptures (same oligo) not being removed. # Need to track this down but for now will explicitly prevent the same bin appearing twice. 
if not rf1 == rf2: @@ -99,12 +97,12 @@ def get_counts_from_tsv_by_batch(reporters: os.PathLike, chunksize: int, **kwarg ligated_rf_counts = defaultdict(int) for ii, df_reporters in enumerate(df_reporters_iterator): - logging.info(f"Processing chunk #{ii+1} of {chunksize} slices") + logger.info(f"Processing chunk #{ii+1} of {chunksize} slices") reporters = preprocess_reporters_for_counting(df_reporters, **kwargs) fragments = reporters.groupby("parent_id") - logging.info("Counting") + logger.info("Counting") ligated_rf_counts = count_re_site_combinations( fragments, column="restriction_fragment", counts=ligated_rf_counts ) @@ -119,9 +117,9 @@ def get_counts_from_tsv(reporters: os.PathLike, **kwargs): reporters = preprocess_reporters_for_counting(df_reporters, **kwargs) fragments = reporters.groupby("parent_id") - logging.info("Counting") + logger.info("Counting") ligated_rf_counts = count_re_site_combinations( fragments, column="restriction_fragment" ) - return ligated_rf_counts \ No newline at end of file + return ligated_rf_counts diff --git a/capcruncher/api/deduplicate.py b/capcruncher/api/deduplicate.py new file mode 100644 index 00000000..e71b374e --- /dev/null +++ b/capcruncher/api/deduplicate.py @@ -0,0 +1,248 @@ +import functools +from loguru import logger +import multiprocessing +import os +import queue +from collections import namedtuple +from multiprocessing import Process +from typing import Iterable, Tuple + +import pandas as pd +import ujson +import xxhash +from capcruncher.utils import get_file_type, save_dict + + +class ReadDeduplicationParserProcess(Process): + """ + Process subclass for parsing fastq file(s) into a hashed {id:sequence} json format. + + Attributes: + inq: Input read queue + outq: Output read queue (Not currently used) + hash_seed: Seed for xxhash64 algorithm to ensure consistency + save_hash_dict_path: Path to save hashed dictionary + """ + + def __init__( + self, + inq: multiprocessing.Queue, + hash_seed: int = 42, + output_path: os.PathLike = "parsed.json", + ): + """ + Args: + inq (multiprocessing.SimpleQueue): Input queue for fastq reads. + outq (multiprocessing.SimpleQueue): Output queue for processed reads. + Only used if part of a pipeline + hash_seed (int, optional): Seed to use for hashing. Defaults to 42. + output_path (os.PathLike, optional): Path to save hashed reads. + """ + + self.inq = inq + self.hash_seed = hash_seed + self.output_path = output_path + + super(ReadDeduplicationParserProcess, self).__init__() + + def run(self): + """Processes fastq reads from multiple files and generates a hashed json dict. + + Dictionary is hashed and in the format {(read 1 name + read 2 name): (s1 + s2)} + + Output path is specified by save_hashed_dict_path. 
+ + """ + + hash_seed = self.hash_seed + hash_function = functools.partial(xxhash.xxh64_intdigest, seed=hash_seed) + records = dict() + + while True: + + try: + reads = self.inq.get(block=True, timeout=0.01) + + if reads: + + for read_set in reads: + hash_sequence = hash_function( + "".join([r.sequence for r in read_set]) + ) + hash_id = hash_function("".join([r.name for r in read_set])) + records[hash_id] = hash_sequence + + else: + break + + except queue.Empty: + continue + + output_format = get_file_type(self.output_path) + save_dict(records, self.output_path, output_format) + + +RemovalStatistics = namedtuple( + "RemovalStatistics", ["reads_total", "reads_unique", "reads_removed"] +) + + +class ReadDuplicateRemovalProcess(Process): + """ + Process subclass for parsing fastq file(s) and removing identified duplicates. + + Attributes: + inq: Input read queue + outq: Output queue for deduplicated reads. + duplicated_ids: Concatenated read ids to remove from input fastq files. + statq: Output queue for statistics. + reads_total: Number of fastq reads processed. + reads_unique: Number of non-duplicated reads output. + hash_seed: Seed for xxhash algorithm. Same as ReadDuplicationParserProcess. + """ + + def __init__( + self, + inq: multiprocessing.Queue, + outq: multiprocessing.Queue, + stats_tx: multiprocessing.Pipe, + duplicated_ids: set, + hash_seed: int = 42, + hash_read_name: bool = True, + ): + """ + Args: + inq (multiprocessing.SimpleQueue): Input queue for reads to be deduplicated. + outq (multiprocessing.SimpleQueue): Output queue for deduplicated reads. + duplicated_ids (set): Hashed read ids to be removed if encountered. + statq (multiprocessing.Queue, optional): Output queue for statistics. + hash_seed (int, optional): Seed for xxhash algorithm. Defaults to 42. + """ + + self.inq = inq + self.outq = outq + self.hash_seed = hash_seed + self.duplicated_ids = duplicated_ids + + # Misc + self.hash_read_name = hash_read_name + + # Stats + self.stats_tx = stats_tx + self.reads_total = 0 + self.reads_unique = 0 + + super(ReadDuplicateRemovalProcess, self).__init__() + + def run(self): + + """Performs read deduplication based on sequence. + + Unique reads are placed on outq and deduplication stats are placed on statq. 
+ + """ + + hash_seed = self.hash_seed + hash_read_name = self.hash_read_name + hash_function = functools.partial(xxhash.xxh64_intdigest, seed=hash_seed) + duplicated_ids = self.duplicated_ids + reads_unique = list() + + while True: + + try: + reads = self.inq.get(block=True, timeout=0.01) + + if reads: + for read_glob in reads: + + hash_id = hash_function("".join([r.name for r in read_glob])) + + if hash_id not in duplicated_ids: + if hash_read_name: + for r in read_glob: + r.name = str(hash_function(r.name)) + + reads_unique.append(read_glob) + + self.reads_total += len(reads) + self.reads_unique += len(reads_unique) + self.outq.put(reads_unique.copy()) + reads_unique.clear() + + else: + break + + except queue.Empty: + continue + + stats = RemovalStatistics( + self.reads_total, self.reads_unique, self.reads_total - self.reads_unique + ) + self.stats_tx.send(stats) + + +def remove_duplicates_from_parquet( + slices: Iterable, duplicated_ids: pd.Series, output: os.PathLike +) -> Tuple[int, int]: + + import dask.dataframe as dd + import pyarrow.dataset as ds + + if not duplicated_ids.empty: + duplicates = set(duplicated_ids.values) + else: + duplicates = set() + + n_reads_total = ( + dd.read_parquet(slices, columns=["parent_id"], engine="pyarrow")["parent_id"] + .nunique() + .compute() + ) + + logger.info("Loading and filtering slices") + + # Load and filter data + slice_dataset = ds.dataset( + list(slices), + format="parquet", + ) + + slice_dataset_scanner = slice_dataset.scanner( + filter=~ds.field("parent_id").isin(duplicates) + ) + + logger.info("Writing unique slices") + ds.write_dataset( + slice_dataset_scanner, output, format="parquet", partitioning_flavor="hive" + ) + + n_reads_unique = ( + dd.read_parquet(output, columns=["parent_id"], engine="pyarrow")["parent_id"] + .nunique() + .compute() + ) + return (n_reads_total, n_reads_unique) + + +def read_duplicated_ids(path: os.PathLike): + + from xopen import xopen + + file_type = get_file_type(path) + + if file_type == "json": + with xopen.xopen(path, "r") as r: + ids_duplicated = {int(x) for x in ujson.load(r)} + + elif file_type == "hdf5": + + try: + ids_duplicated = pd.read_hdf(path, key="/duplicated_ids") + except KeyError: + ids_duplicated = pd.Series(data=["NO_DATA"], name="/duplicated_ids") + + elif file_type == "pickle": + ids_duplicated = pd.read_pickle(path) + + return ids_duplicated diff --git a/capcruncher/tools/digest.py b/capcruncher/api/digest.py similarity index 90% rename from capcruncher/tools/digest.py rename to capcruncher/api/digest.py index 0a53d089..8aab3c70 100644 --- a/capcruncher/tools/digest.py +++ b/capcruncher/api/digest.py @@ -1,14 +1,53 @@ -import logging +from loguru import logger import queue import re import pysam import multiprocessing import numpy as np -from typing import Iterable, Union, List -from capcruncher.tools.statistics import DigestionStats +from typing import Iterable, List +from capcruncher.api.statistics import DigestionStats import pandas as pd +def get_re_site(recognition_site: str = None) -> str: + + """ + Obtains the recogniton sequence for a supplied restriction enzyme or correctly + formats a supplied recognition sequence. + + Args: + cut_sequence - DNA sequence to use for fasta digestion e.g. "GATC" + restriction_enzyme - Name of restriction enzyme e.g. DpnII (case insensitive) + + Returns: + recognition sequence e.g. 
"GATC" + + Raises: + ValueError: Error if restriction_enzyme is not in known enzymes + + """ + + known_enzymes = { + "dpnii": "GATC", + "mboi": "GATC", + "hindiii": "AAGCTT", + "ecori": "GAATTC", + "nlaiii": "CATG", + } + + if re.match(r"[GgAaTtCc]+", recognition_site): # matches a DNA sequence + cutsite = recognition_site.upper() # Just uppercase convert and return + + elif recognition_site.lower() in known_enzymes: + cutsite = known_enzymes[recognition_site.lower()] + + else: + logger.error("No restriction site or recognised enzyme provided") + raise ValueError("No restriction site or recognised enzyme provided") + + return cutsite + + class DigestedChrom: """ Performs in slico digestion of fasta files. @@ -273,7 +312,7 @@ def __init__( self.read_type = digestion_kwargs.get("read_type", "flashed") self._stat_container = DigestionStats - if not "cutsite" in digestion_kwargs: + if "cutsite" not in digestion_kwargs: raise KeyError("Cutsite is required to be present in digestion arguments") def _digest_reads(self, reads, **digestion_kwargs): @@ -375,7 +414,9 @@ def run(self): try: df_stats = pd.concat(dframes_stats) df_stats_aggregated = ( - df_stats.groupby(["read_type", "read_number", "unfiltered", "filtered"]) + df_stats.groupby( + ["read_type", "read_number", "unfiltered", "filtered"] + ) .sum() .reset_index() ) diff --git a/capcruncher/tools/filter.py b/capcruncher/api/filter.py similarity index 93% rename from capcruncher/tools/filter.py rename to capcruncher/api/filter.py index 4bf626f8..2cb44ee8 100644 --- a/capcruncher/tools/filter.py +++ b/capcruncher/api/filter.py @@ -1,4 +1,4 @@ -import logging +from loguru import logger import pandas as pd import os import numpy as np @@ -79,7 +79,16 @@ def __init__( """ self._has_required_columns = self._required_columns_present(slices) - self.slices = slices.sort_values(["parent_read", "slice"]) + + # Tweak format slices dataframe to be consistent + self.slices = slices.sort_values(["parent_read", "slice"]).assign( + blacklist=lambda df: df["blacklist"].astype(float), + restriction_fragment=lambda df: df["restriction_fragment"].astype( + pd.Int64Dtype() + ), + capture_count=lambda df: df["capture_count"].fillna(0), + exclusion_count=lambda df: df["exclusion_count"].fillna(0), + ) if filter_stages: self.filter_stages = self._extract_filter_stages(filter_stages) @@ -92,7 +101,6 @@ def __init__( self.current_filter = "" def _required_columns_present(self, df) -> bool: - columns_required = [ "parent_id", "slice_name", @@ -112,7 +120,7 @@ def _required_columns_present(self, df) -> bool: ] for col in columns_required: - if not col in df.columns: + if col not in df.columns: raise KeyError(f'Required column "{col}" not in slices dataframe') return True @@ -130,7 +138,6 @@ def _extract_filter_stages(self, filter_stages) -> dict: elif os.path.exists(filter_stages) and ( ".yaml" in filter_stages or ".yml" in filter_stages ): - import yaml with open(filter_stages, "r") as f: @@ -144,7 +151,7 @@ def _extract_filter_stages(self, filter_stages) -> dict: all_filters = itertools.chain.from_iterable(filters.values()) for filt in all_filters: - if not filt in self.filters: + if filt not in self.filters: raise AttributeError( f"Required filter: {filt} not present. Check for correct spelling and format." 
) @@ -259,14 +266,19 @@ def filter_slices(self, output_slices=False, output_location="."): for stage, filters in self.filter_stages.items(): for filt in filters: - - self.current_filter = filt - # Call all of the filters in the filter_stages dict in order - logging.info(f"Filtering slices: {filt}") - getattr(self, filt)() # Gets and calls the selected method - logging.info(f"Completed: {filt}") - logging.info(f"Number of slices: {self.slices.shape[0]}") - logging.info(f'Number of reads: {self.slices["parent_read"].nunique()}') + try: + self.current_filter = filt + # Call all of the filters in the filter_stages dict in order + logger.info(f"Filtering slices: {filt}") + getattr(self, filt)() # Gets and calls the selected method + logger.info(f"Completed: {filt}") + logger.info(f"Number of slices: {self.slices.shape[0]}") + logger.info( + f'Number of reads: {self.slices["parent_read"].nunique()}' + ) + except Exception as e: + logger.error(f"Exception {e} raised during {filt} filtering") + raise e if output_slices == "filter": self.slices.to_csv(os.path.join(output_location, f"{filt}.tsv.gz")) @@ -335,7 +347,6 @@ def remove_duplicate_slices(self): """ - frags_deduplicated = ( self.slices.groupby("parent_id") .agg(coords=("coordinates", "|".join)) @@ -359,7 +370,6 @@ def remove_duplicate_slices_pe(self): if ( self.slices["pe"].iloc[:100].str.contains("pe").sum() > 1 ): # at least one un-flashed - fragments_partial = ( self.slices.groupby("parent_id") .agg(coords=("coordinates", "|".join)) @@ -393,19 +403,23 @@ def remove_excluded_slices(self): """Removes any slices in the exclusion region (default 1kb) (V. Common)""" slices_with_viewpoint = self.slices_with_viewpoint - slices_passed = slices_with_viewpoint.loc[lambda df: (df["exclusion_count"] < 1) | (df["exclusion"] != df["viewpoint"])] - + slices_passed = slices_with_viewpoint.loc[ + lambda df: (df["exclusion_count"] < 1) + | (df["exclusion"] != df["viewpoint"]) + ] + self.slices = self.slices.loc[ lambda df: df["parent_id"].isin(slices_passed["parent_id"]) ] def remove_blacklisted_slices(self): """Removes slices marked as being within blacklisted regions""" - self.slices = self.slices.loc[lambda df: (df["blacklist"] == 0) | (df["blacklist"].isna())] + self.slices = self.slices.loc[ + lambda df: (df["blacklist"] == 0) | (df["blacklist"].isna()) + ] @property def slices_with_viewpoint(self): - slices = self.slices.set_index("parent_id") captures = self.captures.set_index("parent_id") return ( @@ -514,11 +528,6 @@ def fragments(self) -> pd.DataFrame: ) ) - # df["unique_capture_sites"] = ( - # df["unique_capture_sites"] - 1 - # ) # nunique identifies '.' as a capture site - # df["unique_exclusions"] = df["unique_exclusions"] - 1 # as above - # Add the number of reporters to the dataframe. # Only consider a reporter if at least one capture slice is present # in the fragment. 
@@ -681,12 +690,11 @@ def remove_non_reporter_fragments(self): Removes the fragment if it has no reporter slices present (Common) """ - fragments_partial = self.slices.groupby("parent_id").agg( n_capture=("capture_count", "sum"), n_mapped=("mapped", "sum"), n_blacklist=("blacklist", "sum"), - n_exclusions=("exclusion_count", "sum"), + n_exclusions=("exclusion_count", lambda ser: ser.sum()), ) fragments_with_reporters = fragments_partial.query( @@ -734,7 +742,15 @@ def remove_viewpoint_adjacent_restriction_fragments(self, n_adjacent: int = 1): """ - slices_with_viewpoint = self.slices_with_viewpoint + slices_with_viewpoint = self.slices_with_viewpoint[ + [ + "restriction_fragment", + "capture", + "capture_count", + "viewpoint", + "parent_id", + ] + ] # Create a per viewpoint dataframe of adjacent fragment ranges restriction_fragments_viewpoint = ( @@ -760,9 +776,9 @@ def remove_viewpoint_adjacent_restriction_fragments(self, n_adjacent: int = 1): "(exclusion_start <= restriction_fragment <= exclusion_end) and (capture_count == 0)" ) - self.slices = ( - self.slices.loc[lambda df: ~df["parent_id"].isin(excluded_slices["parent_id"])] - ) + self.slices = self.slices.loc[ + lambda df: ~df["parent_id"].isin(excluded_slices["parent_id"]) + ] class TriCSliceFilter(CCSliceFilter): @@ -798,7 +814,6 @@ class TriCSliceFilter(CCSliceFilter): filter_stats (pd.DataFrame): Provides statistics of read filtering.""" def __init__(self, slices, filter_stages=None, **sample_kwargs): - if filter_stages: self.filter_stages = filter_stages else: @@ -874,7 +889,6 @@ class TiledCSliceFilter(SliceFilter): """ def __init__(self, slices, filter_stages=None, **sample_kwargs): - if not filter_stages: filter_stages = { "pre-filtering": [ @@ -892,11 +906,11 @@ def __init__(self, slices, filter_stages=None, **sample_kwargs): "remove_duplicate_slices", "remove_duplicate_slices_pe", ], - "has_reporter": ["remove_orphan_slices"], + "has_reporter": ["remove_orphan_slices", "remove_religation"], } super(TiledCSliceFilter, self).__init__(slices, filter_stages, **sample_kwargs) - + @property def captures(self) -> pd.DataFrame: """ @@ -933,7 +947,6 @@ def fragments(self) -> pd.DataFrame: @property def slice_stats(self): - slices = self.slices.copy() if slices.empty: # Deal with empty dataframe i.e. 
no valid slices for col in slices: @@ -961,7 +974,6 @@ def slice_stats(self): ) return stats_df - @property def cis_or_trans_stats(self) -> pd.DataFrame: """ @@ -982,7 +994,6 @@ def cis_or_trans_stats(self) -> pd.DataFrame: for capture_site, df_cap in self.slices.query("capture_count == 1").groupby( "capture" ): - capture_chrom = df_cap.iloc[0]["chrom"] df_primary_capture = df_cap.groupby( "parent_read" @@ -1052,3 +1063,13 @@ def remove_dual_capture_fragments(self): .loc[~multicapture_fragments] .reset_index() ) + + def remove_religation(self): + frag_comp = ( + self.slices.sort_values(["restriction_fragment"]) + .groupby("parent_id")["restriction_fragment"] + .transform("diff") + .fillna(-1) + ) + not_religated = (frag_comp > 1) | (frag_comp == -1) + self.slices = self.slices.loc[not_religated] diff --git a/capcruncher/tools/io.py b/capcruncher/api/io.py similarity index 95% rename from capcruncher/tools/io.py rename to capcruncher/api/io.py index 14a9dd79..d8c04acc 100644 --- a/capcruncher/tools/io.py +++ b/capcruncher/api/io.py @@ -1,5 +1,5 @@ import glob -import logging +from loguru import logger import multiprocessing import os import pathlib @@ -9,15 +9,14 @@ import traceback from typing import Literal, Union -import numpy as np import pandas as pd import pysam import tqdm -from capcruncher.tools.count import ( +from capcruncher.api.count import ( get_fragment_combinations, preprocess_reporters_for_counting, ) -from capcruncher.utils import get_timing, hash_column +from capcruncher.utils import get_timing from pysam import FastxFile from xopen import xopen import xxhash @@ -84,17 +83,17 @@ def run(self): if read_counter % self.read_buffer == 0 and not read_counter == 0: self.outq.put(buffer.copy()) buffer.clear() - logging.info(f"{read_counter} reads parsed (batch)") + logger.info(f"{read_counter} reads parsed (batch)") rc = read_counter else: rc = read_counter self.outq.put(buffer) # Deal with remainder self.outq.put_nowait(None) # Poison pill to terminate queue - logging.info(f"{rc} reads parsed (final)") + logger.info(f"{rc} reads parsed (final)") except Exception as e: - logging.info(f"Reader failed with exception: {e}") + logger.info(f"Reader failed with exception: {e}") raise finally: @@ -143,7 +142,7 @@ def run(self): self.outq.put("END") - except Exception as e: + except Exception: traceback.format_exc() self.outq.put("END") @@ -229,7 +228,7 @@ def run(self): reads = self.inq.get() self.n_files_written += 1 - except Exception as e: + except Exception: traceback.format_exc() @@ -364,7 +363,7 @@ def parse_alignment(aln) -> CCAlignment: multimapped = 1 else: multimapped = 0 - + return CCAlignment( slice_id=slice_id, slice_name=slice_name, @@ -407,6 +406,7 @@ def parse_bam(bam): """ # Load reads into dataframe + logger.info("Parsing BAM file") df_bam = pd.DataFrame( [ parse_alignment(aln) @@ -415,6 +415,7 @@ def parse_bam(bam): ) # Perform dtype conversions + logger.info("Converting dtypes") df_bam["chrom"] = df_bam["chrom"].astype("category") pe_category = pd.CategoricalDtype(["flashed", "pe"]) df_bam["pe"] = df_bam["pe"].astype( @@ -422,6 +423,8 @@ def parse_bam(bam): ) # Only the one type present so need to include both df_bam.set_index(["slice_name", "chrom", "start"], inplace=True) + + logger.info("Finished parsing BAM file") return df_bam @@ -576,7 +579,7 @@ def run(self): df = self._select_by_viewpoint_batch(viewpoints_to_find) for vp, df_vp in df.groupby("viewpoint"): if not df_vp.empty: - logging.info(f"Queuing {vp} for counting") + logger.info(f"Queuing {vp} for 
counting") self.outq.put((vp, df_vp)) elif ( @@ -591,7 +594,9 @@ def run(self): for vp, df_vp in df.groupby("viewpoint"): if not df_vp.empty: vp_partitions[vp] += 1 - logging.info(f"Queuing {vp} partition {vp_partitions[vp]} for counting") + logger.info( + f"Queuing {vp} partition {vp_partitions[vp]} for counting" + ) self.outq.put((vp, df_vp)) except queue.Empty: @@ -706,9 +711,9 @@ def run(self): if self.restriction_fragment_map and self.viewpoint_path: - from capcruncher.tools.storage import create_cooler_cc - - logging.info(f"Making temporary cooler for {vp}") + from capcruncher.api.storage import create_cooler_cc + + logger.info(f"Making temporary cooler for {vp}") create_cooler_cc( output_prefix=self.path, pixels=df, @@ -786,9 +791,9 @@ def run(self): if self.restriction_fragment_map and self.viewpoint_path: - from capcruncher.tools.storage import create_cooler_cc + from capcruncher.api.storage import create_cooler_cc - logging.info(f"Making temporary cooler for {vp}") + logger.info(f"Making temporary cooler for {vp}") create_cooler_cc( output_prefix=path, pixels=df, diff --git a/capcruncher/tools/pileup.py b/capcruncher/api/pileup.py similarity index 84% rename from capcruncher/tools/pileup.py rename to capcruncher/api/pileup.py index 6c2378ec..d3429200 100644 --- a/capcruncher/tools/pileup.py +++ b/capcruncher/api/pileup.py @@ -1,13 +1,14 @@ import pandas as pd import numpy as np -from pandas.core.base import DataError from pybedtools import BedTool import cooler -from typing import Literal, Union -from capcruncher.tools.storage import CoolerBinner +from typing import Literal +from capcruncher.api.storage import CoolerBinner from capcruncher.utils import is_valid_bed -import os -import logging +from loguru import logger +import ray +import re +import pyranges as pr class CoolerBedGraph: @@ -21,7 +22,13 @@ class CoolerBedGraph: """ - def __init__(self, uri: str, sparse: bool = True, only_cis: bool = False, region_to_limit: str = None): + def __init__( + self, + uri: str, + sparse: bool = True, + only_cis: bool = False, + region_to_limit: str = None, + ): """ Args: uri (str): Path to cooler group in hdf5 file. 
@@ -30,15 +37,14 @@ def __init__(self, uri: str, sparse: bool = True, only_cis: bool = False, region self._sparse = sparse self._only_cis = only_cis - logging.info(f"Reading {uri}") + logger.info(f"Reading {uri}") self._cooler = cooler.Cooler(uri) self.viewpoint_name = self._cooler.info["metadata"]["viewpoint_name"] self._viewpoint_bins = self._cooler.info["metadata"]["viewpoint_bins"] self.viewpoint_chrom = self._cooler.info["metadata"]["viewpoint_chrom"][0] self.n_cis_interactions = self._cooler.info["metadata"]["n_cis_interactions"] - logging.info(f"Processing {self.viewpoint_name}") + logger.info(f"Processing {self.viewpoint_name}") - if only_cis: self._bins = self._cooler.bins().fetch(self.viewpoint_chrom) viewpoint_chrom_bins = self._bins["name"] @@ -50,7 +56,7 @@ def __init__(self, uri: str, sparse: bool = True, only_cis: bool = False, region ) ) self._bins = self._cooler.bins().fetch(self.viewpoint_chrom) - + elif region_to_limit: self._pixels = self._cooler.pixels().fetch(region_to_limit) self._bins = self._cooler.bins().fetch(region_to_limit) @@ -59,14 +65,18 @@ def __init__(self, uri: str, sparse: bool = True, only_cis: bool = False, region self._pixels = self._cooler.pixels()[:] # TODO: Avoid this if possible as reading all bins into memory self._bins = self._cooler.bins()[:] - + # Ensure name column is present - self._bins = self._bins.assign(name=lambda df: df.index) if not "name" in self._bins.columns else self._bins + self._bins = ( + self._bins.assign(name=lambda df: df.index) + if "name" not in self._bins.columns + else self._bins + ) self._reporters = None def _get_reporters(self): - logging.info("Extracting reporters") + logger.info("Extracting reporters") concat_ids = pd.concat([self._pixels["bin1_id"], self._pixels["bin2_id"]]) concat_ids_filt = concat_ids.loc[lambda ser: ser.isin(self._viewpoint_bins)] pixels = self._pixels.loc[concat_ids_filt.index] @@ -91,9 +101,10 @@ def _get_reporters(self): ) def extract_bedgraph( - self, normalisation: Literal["raw", "n_cis", "region"] = "raw", **norm_kwargs) -> pd.DataFrame: + self, normalisation: Literal["raw", "n_cis", "region"] = "raw", **norm_kwargs + ) -> pd.DataFrame: - logging.info("Generating bedgraph") + logger.info("Generating bedgraph") df_bdg = ( self._bins.merge( self.reporters, @@ -106,10 +117,9 @@ def extract_bedgraph( ) if not normalisation == "raw": - logging.info("Normalising bedgraph") + logger.info("Normalising bedgraph") self.normalise_bedgraph(df_bdg, method=normalisation, **norm_kwargs) - return df_bdg @property @@ -148,9 +158,7 @@ def normalise_bedgraph( self._normalise_by_regions(bedgraph, scale_factor, region) def _normalise_by_n_cis(self, bedgraph, scale_factor: float): - bedgraph["count"] = ( - bedgraph["count"] / self.n_cis_interactions - ) * scale_factor + bedgraph["count"] = (bedgraph["count"] / self.n_cis_interactions) * scale_factor def _normalise_by_regions(self, bedgraph, scale_factor: float, regions: str): @@ -180,7 +188,6 @@ def _normalise_by_regions(self, bedgraph, scale_factor: float, regions: str): bedgraph["count"] = (bedgraph["count"] / total_counts_in_region) * scale_factor - class CoolerBedGraphWindowed(CoolerBedGraph): def __init__( self, @@ -368,3 +375,39 @@ def __truediv__(self, other): else: return NotImplementedError() + + +@ray.remote +def cooler_to_bedgraph( + clr: str, regions_of_interest: str = None, viewpoint_distance: int = None, **kwargs +) -> pd.DataFrame: + + if viewpoint_distance: + viewpoint_coords = cooler.Cooler(clr).info["metadata"]["viewpoint_coords"][0] + 
viewpoint_coords = re.split("[:-]", viewpoint_coords) + region_to_limit = f"{viewpoint_coords[0]}:{int(viewpoint_coords[1]) - viewpoint_distance}-{int(viewpoint_coords[1]) + viewpoint_distance}" + else: + region_to_limit = None + + norm = kwargs.get("normalisation", "raw") + scale_factor = kwargs.get("scale_factor", 1e6) + region = kwargs.get("region", None) + + bedgraph = CoolerBedGraph(clr, region_to_limit=region_to_limit).extract_bedgraph( + normalisation=norm, + scale_factor=scale_factor, + region=region, + ) + + if regions_of_interest: + pr_roi = pr.read_bed(regions_of_interest) + pr_bedgraph = bedgraph.rename( + columns={"chrom": "Chromosome", "start": "Start", "end": "End"} + ).pipe(pr.PyRanges) + pr_bedgraph = pr_bedgraph.join(pr_roi, strandedness="same") + + bedgraph = pr_bedgraph.df.rename( + columns={"Chromosome": "chrom", "Start": "start", "End": "end"} + )[["chrom", "start", "end", "score"]] + + return bedgraph diff --git a/capcruncher/tools/plotting.py b/capcruncher/api/plotting.py similarity index 72% rename from capcruncher/tools/plotting.py rename to capcruncher/api/plotting.py index 5cd0ab0c..e3e0bb19 100644 --- a/capcruncher/tools/plotting.py +++ b/capcruncher/api/plotting.py @@ -1,26 +1,32 @@ -import math import os -from typing import Union - -import coolbox.api as cb -import cooler -import iced -import matplotlib -import matplotlib.colors as colors +import math +import pathlib import numpy as np import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns +import matplotlib as mpl + +import pyranges as pr +import coolbox.api as cb +from pybedtools import BedTool + from coolbox.api import GenomeRange +import capcruncher.api as cc +import cooler.api as cooler +import matplotlib.colors as colors +import iced from coolbox.core.track import Track from coolbox.utilities import get_coverage_stack, get_feature_stack from matplotlib import cm, transforms from matplotlib.colors import LinearSegmentedColormap from matplotlib.patches import Polygon -from pybedtools import BedTool -from scipy.ndimage import interpolation +from typing import List, Tuple, Union, Optional, Dict, Any, Callable, Literal +from loguru import logger -class CCMatrix(cb.Cool): +class CCMatrix(cb.Cool): def __init__( self, file: os.PathLike, @@ -29,7 +35,6 @@ def __init__( remove_viewpoint=False, **kwargs, ): - self.binsize = binsize self.viewpoint = viewpoint self.remove_viewpoint = remove_viewpoint @@ -38,10 +43,10 @@ def __init__( self.properties["name"] = f"CCMatrix.{self.properties.get('title')}" super(CCMatrix, self).__init__(file, **kwargs) # Need to override the coolbox default if we need a cmap to be set - self.properties['color'] = kwargs.get('color', self.properties['color']) + self.properties["color"] = kwargs.get("color", self.properties["color"]) # Override the defaults - self.properties['balance'] = 'no' + self.properties["balance"] = "no" if not self._cooler_store_has_binsize: raise ValueError( @@ -51,8 +56,6 @@ def __init__( self.cooler = cooler.Cooler(f"{file}::{viewpoint}/resolutions/{binsize}") self.capture_bins = self.cooler.info["metadata"]["viewpoint_bins"] - - def _cooler_store_has_binsize(self): clrs = cooler.fileops.list_coolers(self.file) expected_path = f"{self.viewpoint}/resolutions/{self.binsize}" @@ -75,7 +78,6 @@ def get_matrix(self, coordinates, field="count"): def get_matrix_normalised( self, coordinates, normalization_method=None, **normalisation_kwargs ): - methods_stored = { "n_interactions": "count_n_interactions_norm", "n_rf_n_interactions": 
"count_n_rf_n_interactions_norm", @@ -92,7 +94,7 @@ def get_matrix_normalised( elif normalization_method == "ice": matrix = self.get_matrix(coordinates) matrix = np.nan_to_num(matrix) - #matrix = iced.filter.filter_low_counts(matrix, percentage=0.04) + # matrix = iced.filter.filter_low_counts(matrix, percentage=0.04) matrix_normalised = iced.normalization.ICE_normalization( matrix, **normalisation_kwargs ) # Get iced matrix @@ -103,10 +105,12 @@ def get_matrix_normalised( matrix_ice = iced.normalization.ICE_normalization( matrix, **normalisation_kwargs ) # Get iced matrix - matrix_normalised = matrix_ice / int( - self.cooler.info["metadata"]["n_cis_interactions"] - ) * 1e6 # Correct for number of interactions * 1e6 - + matrix_normalised = ( + matrix_ice + / int(self.cooler.info["metadata"]["n_cis_interactions"]) + * 1e6 + ) # Correct for number of interactions * 1e6 + elif normalization_method == "icen_scale": matrix = self.get_matrix(coordinates) matrix = np.nan_to_num(matrix) @@ -122,22 +126,21 @@ def get_matrix_normalised( return matrix_normalised - def fetch_data(self, gr: cb.GenomeRange, gr2: cb.GenomeRange = None, **kwargs) -> np.ndarray: - + def fetch_data( + self, gr: cb.GenomeRange, gr2: cb.GenomeRange = None, **kwargs + ) -> np.ndarray: norm = self.properties.get("normalization", "raw") - matrix = self.get_matrix_normalised( + matrix = self.get_matrix_normalised( f"{gr.chrom}:{gr.start}-{gr.end}", normalization_method=norm, **kwargs ) return self.fill_zero_nan(matrix) def plot_matrix(self, gr: GenomeRange, gr2: GenomeRange = None): - # Code taken and adapted from coolbox gr = GenomeRange(gr) - - if 'JuiceBox' in self.properties["color"]: - cmap = CCMatrix.get_juicebox_cmaps()[self.properties['color']] + if "JuiceBox" in self.properties["color"]: + cmap = CCMatrix.get_juicebox_cmaps()[self.properties["color"]] else: cmap = cm.get_cmap(self.properties["color"]) @@ -149,7 +152,7 @@ def plot_matrix(self, gr: GenomeRange, gr2: GenomeRange = None): arr = self.matrix c_min, c_max = self.matrix_val_range - if self.properties['max_value'] == 'auto': + if self.properties["max_value"] == "auto": matrix_triu = np.triu(self.matrix) c_max = np.percentile(matrix_triu, 98) @@ -174,7 +177,7 @@ def plot_matrix(self, gr: GenomeRange, gr2: GenomeRange = None): transform=tr + ax.transData, extent=(gr.start, gr.end, gr.start, gr.end), aspect="auto", - interpolation='none' + interpolation="none", ) elif gr2 is None and self.style == self.STYLE_WINDOW: @@ -199,7 +202,6 @@ def plot_matrix(self, gr: GenomeRange, gr2: GenomeRange = None): transform=tr + ax.transData, extent=(gr.start, gr.end, gr.start, gr.end), aspect="auto", - ) else: if gr2 is None: @@ -218,33 +220,34 @@ def plot_matrix(self, gr: GenomeRange, gr2: GenomeRange = None): img.set_norm(colors.Normalize(vmin=c_min, vmax=c_max)) return img - + @staticmethod def get_juicebox_cmaps(): JuiceBoxLikeColor = LinearSegmentedColormap.from_list( - 'interaction', ['#FFFFFF', '#FFDFDF', '#FF7575', '#FF2626', '#F70000']) + "interaction", ["#FFFFFF", "#FFDFDF", "#FF7575", "#FF2626", "#F70000"] + ) JuiceBoxLikeColor.set_bad("white") JuiceBoxLikeColor.set_under("white") JuiceBoxLikeColor2 = LinearSegmentedColormap.from_list( - 'interaction', ['#FFFFFF', '#FFDFAF', '#FF7555', '#FF2600', '#F70000']) + "interaction", ["#FFFFFF", "#FFDFAF", "#FF7555", "#FF2600", "#F70000"] + ) JuiceBoxLikeColor2.set_bad("white") JuiceBoxLikeColor2.set_under("white") return { "JuiceBoxLike": JuiceBoxLikeColor, - "JuiceBoxLike2": JuiceBoxLikeColor2,} + "JuiceBoxLike2": 
JuiceBoxLikeColor2, + } class CCBigWig(cb.BigWig): def __init__(self, file, **kwargs): - self.file = file self.coverages = [] super(CCBigWig, self).__init__(file, **kwargs) def fetch_data(self, gr, **kwargs): - if not self.properties["style"] == "fragment": data = super(CCBigWig, self).fetch_data(gr, **kwargs) else: @@ -254,9 +257,9 @@ def fetch_data(self, gr, **kwargs): def plot_fragments(self, ax, gr, **kwargs): data = self.fetch_data(gr, **kwargs) - alpha = self.properties.get("alpha", 1.0) - threshold = self.properties.get("threshold", 0) - offset = gr.start + _alpha = self.properties.get("alpha", 1.0) + _threshold = self.properties.get("threshold", 0) + _offset = gr.start bp_proportion = 1 / (data["end"].max() - data["start"].min()) for row in data.itertuples(): @@ -277,7 +280,6 @@ def plot_fragments(self, ax, gr, **kwargs): self.plot_label() def plot(self, ax, gr, **kwargs): - if not self.properties["style"] == "fragment": super(CCBigWig, self).plot(ax, gr, **kwargs) else: @@ -285,7 +287,6 @@ def plot(self, ax, gr, **kwargs): class CCBigWigCollection(Track): - DEFAULT_PROPERTIES = { "style": "line", "fmt": "-", @@ -302,10 +303,9 @@ class CCBigWigCollection(Track): } def __init__(self, file: list, exclusions: str = None, **kwargs): - self.file_names = file self.exclusions = exclusions - self.bws = [cb.BigWig(fn) for fn in file] + self.bws = [cb.BigWig(str(fn)) for fn in file] self.properties = {"files": self.file_names} self.properties.update(CCBigWigCollection.DEFAULT_PROPERTIES.copy()) self.properties.update(kwargs) @@ -325,7 +325,6 @@ def __init__(self, file: list, exclusions: str = None, **kwargs): self.coverages.append(coverage) def fetch_data(self, gr, **kwargs): - datasets = [ bw.bw.fetch_intervals(gr.chrom, gr.start, gr.end) .set_index(["chrom", "start", "end"]) @@ -368,7 +367,6 @@ def fetch_data(self, gr, **kwargs): return df_intervals def fetch_exluded_regions(self, gr): - excluded_tabix = BedTool(self.exclusions).tabix(force=True) df_excluded = excluded_tabix.tabix_intervals( f"{gr.chrom}:{gr.start}-{gr.end}" @@ -389,7 +387,6 @@ def fetch_exluded_regions(self, gr): return df_intervals def plot(self, ax, gr, **kwargs): - data = self.fetch_data(gr, **kwargs) line_width = self.properties.get("line_width", 1) @@ -422,16 +419,14 @@ def plot(self, ax, gr, **kwargs): zorder=1, ) - min_val = self.properties.get("min_value") max_val = self.properties.get("max_value") - ymin = round(scores.min()) if min_val == "auto" else min_val - ymax = round(scores.max() + data["sem"].max()) if max_val == "auto" else max_val + ymin = round(scores.min()) if min_val == "auto" else min_val + ymax = round(scores.max() + data["sem"].max()) if max_val == "auto" else max_val ax.set_xlim(gr.start, gr.end) ax.set_ylim(ymin, ymax) - self.plot_data_range(ax, ymin, ymax, self.properties["data_range_style"], gr) self.plot_label() @@ -443,7 +438,7 @@ def plot_data_range(self, ax, ymin, ymax, data_range_style, gr: cb.GenomeRange): try: y_ax = self.y_ax self.plot_yaxis_range(ax, y_ax) - except AttributeError as e: + except AttributeError: self.plot_data_range(ax, ymin, ymax, "text", gr) def plot_yaxis_range(self, plot_axis, y_ax): @@ -503,7 +498,9 @@ def plot_text_range(self, ax, ymin, ymax, gr: cb.GenomeRange): ydelta = ymax - ymin # set min max - format_lim = lambda lim: int(lim) if float(lim) % 1 == 0 else f"{lim:.2f}" + def format_lim(lim): + return int(lim) if float(lim) % 1 == 0 else f"{lim:.2f}" + ymax_print = format_lim(ymax) ymin_print = format_lim(ymin) small_x = 0.01 * gr.length @@ -528,7 +525,6 @@ def 
fetch_data(self, **kwargs): pass def get_appropriate_scale(self, length): - if length <= 1e3: scale = 1e2 elif 1e3 < length < 1e4: @@ -545,7 +541,6 @@ def get_appropriate_scale(self, length): return scale def plot(self, ax, gr, **kwargs): - position = self.properties.get("position", "left") y_midpoint = 0.5 @@ -587,16 +582,14 @@ def plot(self, ax, gr, **kwargs): ax.set_ylim(0, 1) -class SimpleBed(cb.BED): +class CCSimpleBed(cb.BED): def __init__(self, file: str, **kwargs): - self.file = file self.properties = dict() self.properties["name"] = "BlockBed" self.properties.update(kwargs) def fetch_data(self, gr): - bt = BedTool(self.file) bt_tabix = bt.tabix(force=True) @@ -605,7 +598,6 @@ def fetch_data(self, gr): ).to_dataframe() def plot(self, ax, gr, **kwargs): - data = self.fetch_data(gr) y_midpoint = 0.5 @@ -637,24 +629,186 @@ def plot(self, ax, gr, **kwargs): ax.set_xlim(gr.start, gr.end) ax.set_ylim(0, 1) -class XAxisGenomic(cb.XAxis): + +class CCXAxisGenomic(cb.XAxis): def __init__(self, **kwargs): - super(XAxisGenomic, self).__init__() + super(CCXAxisGenomic, self).__init__() self.properties.update(kwargs) - + def plot(self, ax, gr: GenomeRange, **kwargs): self.ax = ax ax.set_xlim(gr.start, gr.end) ticks = np.linspace(gr.start, gr.end, 10) - labels = [f'{x:,.0f}' for x in ticks] + labels = [f"{x:,.0f}" for x in ticks] ax.axis["x"] = ax.new_floating_axis(0, 0.5) ax.axis["x"].axis.set_ticks(ticks) ax.axis["x"].axis.set_ticklabels(labels) - ax.axis['x'].axis.set_tick_params(which='minor', bottom='on') + ax.axis["x"].axis.set_tick_params(which="minor", bottom="on") ax.axis["x"].major_ticklabels.set(size=10) - if 'where' in self.properties and self.properties['where'] == 'top': + if "where" in self.properties and self.properties["where"] == "top": ax.axis["x"].set_axis_direction("top") + + +class CCTrack: + def __init__( + self, + file, + file_type: Literal[ + "heatmap", + "bigwig", + "bigwig_summary", + "scale", + "bed", + "xaxis", + "genes", + "spacer", + ] = None, + **kwargs, + ): + self.file = file + self.properties = dict() + self.properties.update(kwargs) + + if file_type: + self.properties["type"] = file_type + elif self.properties.get("type"): + pass + else: + raise ValueError( + "Please specify file_type as one of: heatmap, bigwig, bigwig_summary, scale, bed, xaxis, genes, spacer" + ) + + def get_track(self): + match self.properties.get("type"): # noqa + case "heatmap": + assert ( + "binsize" in self.properties + ), "Binsize must be specified for heatmap track (e.g. 
binsize=5000)" + return CCMatrix(self.file, **self.properties) + case "bigwig": + if self.properties.get("overlay"): + return cb.BigWigCoverage(self.file, **self.properties) + else: + return CCBigWig(self.file, **self.properties) + case "bigwig_summary": + return CCBigWigCollection(self.file, **self.properties) + case "scale": + return ScaleBar(**self.properties) + case "bed": + return CCSimpleBed(self.file, **self.properties) + case "xaxis": + return CCXAxisGenomic(**self.properties) + case "genes": + if self.properties.get("title"): + del self.properties["title"] + return cb.BED(self.file, **self.properties) + case "spacer": + return cb.Spacer(**self.properties) + case _: + raise ValueError( + f"Unknown track type {self.properties.get('type')}, select from: heatmap, bigwig, bigwig_summary, scale, bed, xaxis, genes, spacer" + ) + + +class CCFigure: + def __init__( + self, tracks: List[CCTrack] = None, auto_spacing: bool = False, **kwargs + ) -> None: + + self.frame = cb.Frame() + self.auto_spacing = auto_spacing + self.properties = dict() + self.properties.update(kwargs) + + if tracks: + self.tracks = set(tracks) + self.add_tracks(tracks) + else: + self.tracks = set() + + def add_track(self, track: CCTrack) -> None: + self.tracks.add(track) + self.frame.add_track(track.get_track()) + + def add_tracks(self, tracks: List[CCTrack]) -> None: + for track in tracks: + if self.auto_spacing: + spacer = CCTrack(None, file_type="spacer") + self.add_track(spacer.get_track()) + + self.add_track(track) + + def plot( + self, gr: GenomeRange, gr2: GenomeRange = None, show: bool = True, **kwargs + ) -> None: + if gr2: + fig = self.frame.plot(gr, gr2, **kwargs) + else: + fig = self.frame.plot(gr, **kwargs) + if show: + fig.show() + + return fig + + def save( + self, gr: GenomeRange, gr2: GenomeRange = None, output: str = None, **kwargs + ) -> None: + fig = self.plot(gr, gr2, show=False, **kwargs) + if output: + fig.savefig(output, dpi=300) + else: + fig.savefig(f"{gr.chrom}_{gr.start}_{gr.end}.png", dpi=300) + + @classmethod + def from_toml(cls, toml_file: os.PathLike, **kwargs) -> "CCFigure": + import toml + + with open(toml_file) as f: + config = toml.load(f) + + tracks = [] + for track_name, attr in config.items(): + file = attr.pop("file") if attr.get("file") else None + track_name = attr.pop("title") if attr.get("title") else track_name + tracks.append(CCTrack(file, title=track_name, **attr)) + return cls(tracks, **kwargs) + + @classmethod + def from_frame(cls, frame: cb.Frame, **kwargs) -> "CCFigure": + tracks = [] + for track in frame.tracks: + tracks.append(CCTrack(track.properties["file"], **track.properties)) + + return cls(tracks, **kwargs) + + def to_toml(self, output: str = None) -> Union[None, Dict[str, Any]]: + import toml + + config = dict() + for track in self.tracks: + if track.properties.get("type") in ["spacer", "scale", "xaxis"]: + track_type = track.properties.get("type") + n = config.get(track_type, 0) + config[f"{track_type} {n}"] = track.properties + config[f"{track_type} {n}"]["file"] = None + elif track.properties.get("type") == "genes": + track_type = track.properties.get("type") + n = config.get(track_type, 0) + config[f"{track_type} {n}"] = track.properties + config[f"{track_type} {n}"]["file"] = track.file + else: + config[track.properties["title"]] = track.properties + config[track.properties["title"]]["file"] = track.file + + if output: + with open(output, "w") as f: + toml.dump(config, f) + else: + with open("config.toml", "w") as f: + toml.dump(config, f) + + return 
config diff --git a/capcruncher/api/statistics.py b/capcruncher/api/statistics.py new file mode 100644 index 00000000..a1d525c2 --- /dev/null +++ b/capcruncher/api/statistics.py @@ -0,0 +1,114 @@ +import pandas as pd +import re +from collections import namedtuple +from capcruncher.utils import read_dataframes + + +class DeduplicationStatistics: + def __init__( + self, + sample: str, + read_type: str = "pe", + reads_total: int = 0, + reads_unique: int = 0, + ): + + self.sample = sample + self.read_type = read_type + self.reads_total = reads_total + self.reads_unique = reads_unique + self.reads_removed = reads_total - reads_unique + + @property + def df(self): + df = pd.DataFrame() + df["stat"] = [self.reads_total, self.reads_unique, self.reads_removed] + df["stat_type"] = ["reads_total", "reads_unique", "reads_removed"] + df["read_type"] = self.read_type + df["read_number"] = 0 + df["stage"] = "deduplication" + df["sample"] = self.sample + return df + + +DigestionStats = namedtuple( + "DigestionStats", ["read_type", "read_number", "unfiltered", "filtered"] +) + + +def collate_histogram_data(fnames): + return ( + pd.concat(read_dataframes(fnames)) + .groupby(["sample", "read_type", "read_number", "filtered", "n_slices"]) + .sum() + .reset_index() + .sort_values(["sample", "read_type", "n_slices"]) + ) + + +def collate_read_data(fnames): + + df = ( + pd.concat(read_dataframes(fnames)) + .query("(read_number == 0) or (read_number == 1)") + .groupby(["sample", "stage", "read_type", "read_number", "stat_type"])["stat"] + .sum() + .reset_index() + ) + + return df.sort_values(["sample", "stat"], ascending=[True, False]) + + +def collate_slice_data(fnames): + + df = pd.concat(read_dataframes(fnames)) + aggregations = { + col: "sum" if "unique" not in col else "max" + for col in df.columns + if col not in ["sample", "stage", "read_type"] + } + + return ( + df.groupby(["sample", "stage", "read_type"]) + .agg(aggregations) + .reset_index() + .sort_values(["sample", "unique_slices"], ascending=[True, False]) + ) + + +def collate_cis_trans_data(fnames): + return ( + pd.concat(read_dataframes(fnames)) + .groupby(["sample", "viewpoint", "read_type", "cis/trans"]) + .sum() + .reset_index() + .sort_values(["sample", "read_type", "count"], ascending=[True, True, False]) + ) + + +def extract_trimming_stats(fn): + stat_regexes = { + "reads_total": re.compile(r"^Total reads processed:\s+([0-9,]+)$"), + "adapters_removed": re.compile(r"Reads with adapters:\s+([0-9,]+).*"), + "reads_after_filtering": re.compile( + r"Reads written \(passing filters\):\s+([0-9,]+).*" + ), + } + + sample_re_match = re.match(r".*/(.*)_part\d+_(1|2).*", fn) + + stats = {} + stats["sample"] = sample_re_match.group(1) + stats["read_number"] = sample_re_match.group(2) + stats["read_type"] = "pe" + + with open(fn) as r: + for line in r: + for stat_name, pattern in stat_regexes.items(): + regex_match = pattern.match(line) + if regex_match: + stats[stat_name] = int(regex_match.group(1).replace(",", "")) + + stats["reads_filtered"] = stats["reads_total"] - stats["reads_after_filtering"] + + return stats diff --git a/capcruncher/api/storage.py b/capcruncher/api/storage.py new file mode 100644 index 00000000..67df4a15 --- /dev/null +++ b/capcruncher/api/storage.py @@ -0,0 +1,550 @@ +import os +import tempfile +import pandas as pd +import numpy as np +from pybedtools import BedTool +import cooler +import h5py +import functools +import itertools +from loguru import logger +import ujson +from typing import Iterable, Tuple, Union, List, Dict, 
Literal +import pyranges as pr +import re + + +def get_viewpoint_coords(viewpoint_file: str, viewpoint_name: str): + df_viewpoints = BedTool(viewpoint_file).to_dataframe() + df_viewpoints = df_viewpoints.query(f'name == "{viewpoint_name}"') + + try: + viewpoints = [row for index, row in df_viewpoints.iterrows()] + except IndexError: + logger.error("Oligo name cannot be found within viewpoints") + viewpoints = None + + return viewpoints + + +def get_viewpoint_bins(bins, viewpoint_chrom, viewpoint_start, viewpoint_end): + + return [ + int(b) + for b in bins.query( + f'chrom == "{viewpoint_chrom}" and start >= {viewpoint_start} and end <= {viewpoint_end}' + )["name"] + ] + + +def create_cooler_cc( + output_prefix: str, + bins: pd.DataFrame, + pixels: pd.DataFrame, + viewpoint_name: str, + viewpoint_path: os.PathLike, + viewpoint_bins: Union[int, list] = None, + assay: Literal["capture", "tri", "tiled"] = "capture", + suffix=None, + **cooler_kwargs, +) -> os.PathLike: + """ + Creates a cooler hdf5 file or cooler formatted group within a hdf5 file. + + Args: + output_prefix (str): Output path for hdf5 file. If this already exists, will append a new group to the file. + bins (pd.DataFrame): DataFrame containing the genomic coordinates of all bins in the pixels table. + pixels (pd.DataFrame): DataFrame with columns: bin1_id, bin2_id, count. + viewpoint_name (str): Name of viewpoint to store. + viewpoint_path (os.PathLike): Path to viewpoints used for the analysis. + viewpoint_bins (Union[int, list], optional): Bins containing viewpoint. Can be determined from viewpoint_path. Defaults to None. + suffix (str, optional): Suffix to append before the .hdf5 file extension. Defaults to None. + + Raises: + ValueError: Viewpoint name must exactly match the a supplied viewpoint. + + Returns: + os.PathLike: Path of cooler hdf5 file. + """ + + # Gets viewpoint coordinates + viewpoint_coords = get_viewpoint_coords(viewpoint_path, viewpoint_name) + + # Make sure viewpoint coordinates are returned correctly, if not, error. + if viewpoint_coords is None: + raise ValueError(f"Incorrect viewpoint name specified: {viewpoint_name}.") + + # If viewpoint bins not provided get them using the coordinates. + if not viewpoint_bins: + viewpoint_bins = list( + itertools.chain.from_iterable( + [ + get_viewpoint_bins(bins, c["chrom"], c["start"], c["end"]) + for c in viewpoint_coords + ] + ) + ) + + # Need to store bins as a list so make sure its not just a single int. + elif isinstance(viewpoint_bins, int): + viewpoint_bins = [ + int(viewpoint_bins), + ] + + # The cooler.create_cooler function will not accept np.arrays so must convert to python list + elif isinstance(viewpoint_bins, (np.array, pd.Series)): + viewpoint_bins = [int(x) for x in viewpoint_bins] + + # Get the number of cis interactions, required for normalisation. + bins_cis = bins.loc[ + lambda df: df["chrom"].isin([c["chrom"] for c in viewpoint_coords]) + ]["name"] + + if assay in [ + "capture", + "tri", + ]: # If capture or tri, remove viewpoint bins from cis bins + bins_cis = bins_cis.loc[lambda ser: ~ser.isin(viewpoint_bins)] + + pixels_cis = pixels.loc[ + lambda df: (df["bin1_id"].isin(bins_cis)) | (df["bin2_id"].isin(bins_cis)) + ] + n_cis_interactions = pixels_cis["count"].sum() + + # Metadata for cooler file. 
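The docstring above fully specifies the inputs to the new create_cooler_cc helper, so a minimal call looks roughly like the sketch below. Everything in it is invented for illustration: the toy bins/pixels tables, the output name, and the viewpoint BED (assumed to contain an entry named "Slc25A37" that overlaps the bins). It is a sketch against the signature shown in this diff, not a tested recipe.

import pandas as pd
from capcruncher.api.storage import create_cooler_cc

# Toy cooler-style tables (hypothetical values)
bins = pd.DataFrame(
    {"chrom": ["chr14", "chr14"], "start": [0, 1000], "end": [1000, 2000], "name": [0, 1]}
)
pixels = pd.DataFrame({"bin1_id": [0], "bin2_id": [1], "count": [5]})

clr_uri = create_cooler_cc(
    output_prefix="example.hdf5",     # appended to if the file already exists
    bins=bins,
    pixels=pixels,
    viewpoint_name="Slc25A37",        # must match a name in the viewpoint BED
    viewpoint_path="viewpoints.bed",  # hypothetical path
    assay="capture",
)
print(clr_uri)  # expected form: "example.hdf5::/Slc25A37"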
+ metadata = { + "viewpoint_bins": viewpoint_bins, + "viewpoint_name": viewpoint_name, + "viewpoint_chrom": [c["chrom"] for c in viewpoint_coords], + "viewpoint_coords": [ + f'{c["chrom"]}:{c["start"]}-{c["end"]}' for c in viewpoint_coords + ], + "n_cis_interactions": int(n_cis_interactions), + "n_total_interactions": int(pixels["count"].sum()), + } + + if os.path.exists( + output_prefix + ): # Will append to a prexisting file if one is supplied + append_to_file = True + cooler_fn = f"{output_prefix}::/{viewpoint_name}" + else: + append_to_file = False + cooler_fn = f"{output_prefix.replace('.hdf5', '')}{'.' + suffix if suffix else ''}.hdf5::/{viewpoint_name}" + + cooler.create_cooler( + cooler_fn, + bins=bins, + pixels=pixels, + metadata=metadata, + mode="w" if not append_to_file else "a", + **cooler_kwargs, + ) + + return cooler_fn + + +class CoolerBinner: + def __init__( + self, + cooler_group: os.PathLike, + binsize: int = None, + method: Union[Literal["overlap"], Literal["midpoint"]] = "midpoint", + minimum_overlap: float = 0.51, + n_cis_interaction_correction: bool = True, + n_rf_per_bin_correction: bool = True, + scale_factor: int = 1_000_000, + ) -> None: + self.cooler_group = cooler_group + self.binsize = binsize + self.method = method + self.minimum_overlap = minimum_overlap + + if isinstance(cooler_group, str): + self.cooler = cooler.Cooler(cooler_group) + elif isinstance(cooler_group, cooler.Cooler): + self.cooler = cooler_group + else: + raise ValueError( + "cooler_group must be a path to a cooler file or a cooler object" + ) + + self.n_cis_interactions = self.cooler.info["metadata"]["n_cis_interactions"] + self.n_cis_interaction_correction = n_cis_interaction_correction + self.n_restriction_fragment_correction = n_rf_per_bin_correction + self.scale_factor = scale_factor + + @functools.cached_property + def genomic_bins(self) -> pr.PyRanges: + return ( + cooler.binnify(binsize=self.binsize, chromsizes=self.cooler.chromsizes) + .sort_values(by=["chrom", "start", "end"]) + .assign( + genomic_bin_id=lambda df: df.reset_index(drop=True) + .index.to_series() + .values + ) + .rename(columns={"chrom": "Chromosome", "start": "Start", "end": "End"}) + .pipe(pr.PyRanges) + ) + + @functools.cached_property + def fragment_bins(self): + return ( + self.cooler.bins()[:] + .rename( + columns={ + "chrom": "Chromosome", + "start": "Start", + "end": "End", + "name": "fragment_id", + } + ) + .pipe(pr.PyRanges) + ) + + @functools.cached_property + def fragment_to_genomic_table(self) -> pr.PyRanges: + """ + Translate genomic bins to fragment bins + """ + + fragment_bins = self.fragment_bins + + if self.method == "midpoint": + fragment_bins = ( + fragment_bins.as_df() + .assign( + Start=lambda df: df["Start"] + (df["End"] - df["Start"]) / 2, + End=lambda df: df["Start"] + 1, + ) + .pipe(pr.PyRanges) + ) + + pr_fragment_to_bins = self.genomic_bins.join( + fragment_bins, strandedness=0, how=None, report_overlap=True + ) + + if self.method == "overlap": + pr_fragment_to_bins = pr_fragment_to_bins[ + pr_fragment_to_bins["Overlap"] >= self.minimum_overlap + ] + + # Add number of fragments per bin + pr_fragment_to_bins = pr_fragment_to_bins.assign( + "n_fragments_per_bin", + lambda df: df.groupby("genomic_bin_id")["fragment_id"].transform("nunique"), + ) + + return pr_fragment_to_bins + + @functools.cached_property + def fragment_to_genomic_mapping(self) -> Dict[int, int]: + """ + Translate genomic bins to fragment bins + """ + fragment_to_bins_mapping = ( + self.fragment_to_genomic_table.as_df() + 
.set_index("fragment_id")["genomic_bin_id"] + .to_dict() + ) + return fragment_to_bins_mapping + + @functools.cached_property + def pixels(self) -> pd.DataFrame: + """ + Translate fragment pixels to genomic pixels + """ + + fragment_to_bins_mapping = self.fragment_to_genomic_mapping + + pixels = self.cooler.pixels()[:].assign( + genomic_bin1_id=lambda df: df["bin1_id"].map(fragment_to_bins_mapping), + genomic_bin2_id=lambda df: df["bin2_id"].map(fragment_to_bins_mapping), + ) + + # Sum the counts of pixels that map to the same genomic bins + pixels = ( + pixels.groupby(["genomic_bin1_id", "genomic_bin2_id"]) + .agg( + count=("count", "sum"), + ) + .reset_index() + ) + + # Normalize pixels if specified + if self.n_restriction_fragment_correction: + n_fragments_per_bin = ( + self.fragment_to_genomic_table.as_df() + .set_index("genomic_bin_id")["n_fragments_per_bin"] + .to_dict() + ) + pixels = pixels.assign( + n_fragments_per_bin1=lambda df: df["genomic_bin1_id"].map( + n_fragments_per_bin + ), + n_fragments_per_bin2=lambda df: df["genomic_bin2_id"].map( + n_fragments_per_bin + ), + n_fragments_per_bin_correction=lambda df: ( + df["n_fragments_per_bin1"] + df["n_fragments_per_bin2"] + ), + count_n_rf_norm=lambda df: df["count"] + / df["n_fragments_per_bin_correction"], + ) + + if self.n_cis_interaction_correction: + pixels = pixels.assign( + count_n_cis_norm=lambda df: (df["count"] / self.n_cis_interactions) + * self.scale_factor, + ) + + if self.n_cis_interaction_correction and self.n_restriction_fragment_correction: + pixels = pixels.assign( + count_n_cis_rf_norm=lambda df: ( + pixels["count_n_rf_norm"] / self.n_cis_interactions + ) + * self.scale_factor + ) + + return pixels + + @functools.cached_property + def viewpoint_bins(self) -> List[int]: + """ + Return list of viewpoint bins + """ + + pr_viewpoint = pr.from_dict( + dict( + zip( + ["Chromosome", "Start", "End"], + [ + [ + x, + ] + for x in re.split( + ":|-", self.cooler.info["metadata"]["viewpoint_coords"][0] + ) + ], + ) + ) + ) + + return pr_viewpoint.join(self.genomic_bins).df["genomic_bin_id"].to_list() + + def to_cooler(self, store: os.PathLike): + metadata = {**self.cooler.info["metadata"]} + metadata["viewpoint_bins"] = [int(x) for x in self.viewpoint_bins] + metadata["n_interactions_total"] = int(self.cooler.pixels()[:]["count"].sum()) + cooler_fn = f"{store}::/{metadata['viewpoint_name']}/resolutions/{self.binsize}" + + pixels = ( + self.pixels.drop( + columns=[ + "bin1_id", + "bin2_id", + "n_fragments_per_bin1", + "n_fragments_per_bin2", + "n_fragments_per_bin_correction", + ], + errors="ignore", + ) + .rename( + columns={"genomic_bin1_id": "bin1_id", "genomic_bin2_id": "bin2_id"} + ) + .loc[:, lambda df: ["bin1_id", "bin2_id", "count", *df.columns[3:]]] + .sort_values(by=["bin1_id", "bin2_id"]) + ) + + bins = ( + self.genomic_bins.df.rename( + columns={"Chromosome": "chrom", "Start": "start", "End": "end"} + ) + .sort_values("genomic_bin_id") + .assign(bin_id=lambda df: df["genomic_bin_id"]) + .set_index("genomic_bin_id") + ) + + cooler.create_cooler( + cooler_fn, + bins=bins, + pixels=pixels, + metadata=metadata, + mode="w" if not os.path.exists(store) else "a", + columns=pixels.columns[2:], + dtypes=dict(zip(pixels.columns[2:], ["float32"] * len(pixels.columns[2:]))), + ensure_sorted=True, + ordered=True, + ) + + return cooler_fn + + +def link_common_cooler_tables(clr: os.PathLike): + """Reduces cooler storage space by linking "bins" table. 
+ + All of the cooler "bins" tables containing the genomic coordinates of each bin + are identical for all cooler files of the same resolution. As cooler.create_cooler + generates a new bins table for each cooler, this leads to a high degree of duplication. + + This function hard links the bins tables for a given resolution to reduce the degree of duplication. + + Args: + clr (os.PathLike): Path to cooler hdf5 produced by the merge command. + """ + + logger.info("Making links to common cooler tables to conserve disk space") + + with h5py.File(clr, "a") as f: + + # Get all viewpoints stored + viewpoints = sorted(list(f.keys())) + + # Get all resolutions stored + try: + resolutions = [res for res in f[viewpoints[0]]["resolutions"]] + except (KeyError, IndexError): + resolutions = None + + for viewpoint in viewpoints[1:]: + + try: + # Delete currently stored bins group and replace with link to first viewpoint "bins" group + del f[viewpoint]["bins"] + f[viewpoint]["bins"] = f[viewpoints[0]]["bins"] + + # Delete chroms table and replace with link to the first "chroms" group + del f[viewpoint]["chroms"] + f[viewpoint]["chroms"] = f[viewpoints[0]]["chroms"] + except KeyError: + pass + + # Repeat for resolutions i.e. binned coolers + if resolutions: + for resolution in resolutions: + del f[viewpoint]["resolutions"][resolution]["bins"] + f[viewpoint]["resolutions"][resolution]["bins"] = f[viewpoints[0]][ + "resolutions" + ][resolution]["bins"] + + del f[viewpoint]["resolutions"][resolution]["chroms"] + f[viewpoint]["resolutions"][resolution]["chroms"] = f[ + viewpoints[0] + ]["resolutions"][resolution]["chroms"] + + +def get_merged_cooler_metadata(coolers: Iterable[os.PathLike]): + """ + Merges metadata from multiple coolers. + """ + # Get metadata from all coolers and copy to the merged file + metadata = {} + for cooler_uri in coolers: + filepath, group = cooler_uri.split("::") + + with h5py.File(filepath, mode="r") as src: + metadata_src = ujson.decode(src[group].attrs["metadata"]) + + for metadata_key, metadata_value in metadata_src.items(): + + if isinstance(metadata_value, str): + metadata[metadata_key] = metadata_value + + elif isinstance(metadata_value, Iterable): + if metadata_key not in metadata: + metadata[metadata_key] = [] + metadata[metadata_key].extend(metadata_value) + else: + metadata[metadata_key].extend( + [ + v + for v in metadata_value + if v not in metadata[metadata_key] + ] + ) + + elif isinstance(metadata_value, (int, float)): + if metadata_key not in metadata: + metadata[metadata_key] = metadata_value + else: + metadata[metadata_key] += metadata_value + + return metadata + + +def merge_coolers(coolers: Tuple, output: os.PathLike): + """ + Merges capcruncher cooler files together. + + Produces a unified cooler with both restriction fragment and genomic bins whilst + reducing the storage space required by hard linking the "bins" tables to prevent duplication. + + Args: + coolers (Tuple): Cooler files produced by either the fragments or bins subcommands. + output (os.PathLike): Path for the merged cooler file. + """ + from collections import defaultdict + import cooler + + logger.info("Merging cooler files") + + coolers_to_merge = defaultdict(list) + + # Remove output file as we need to append to it.
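For orientation, merge_coolers (whose body continues below) is the public entry point of this module: it takes the per-sample cooler stores and a single output path. A minimal usage sketch with invented file names:

from capcruncher.api.storage import merge_coolers

# Hypothetical per-sample stores produced earlier in the pipeline
merge_coolers(
    ("sample_rep1.hdf5", "sample_rep2.hdf5"),
    output="samples_merged.hdf5",
)
# Viewpoints found in only one input are copied across; shared viewpoints are merged
# via cooler.merge_coolers, and the common "bins"/"chroms" tables are then hard-linked.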
+ if os.path.exists(output): + os.unlink(output) + + # Extract a list of coolers to merge, grouped by viewpoint name + for clr in coolers: + with h5py.File(clr, mode="r") as src: + viewpoints = list(src.keys()) + + for viewpoint in viewpoints: + if "resolutions" not in list(src[viewpoint].keys()): + coolers_to_merge[viewpoint].append(f"{clr}::/{viewpoint}") + else: + for resolution in src[viewpoint]["resolutions"].keys(): + coolers_to_merge[f"{viewpoint}::{resolution}"].append( + f"{clr}::/{viewpoint}/resolutions/{resolution}" + ) + + # Initial pass to perform copying for all coolers without a matching group + need_merging = list() + with h5py.File(output, mode="w") as dest: + for ii, (viewpoint, cooler_uris) in enumerate(coolers_to_merge.items()): + + if len(cooler_uris) < 2: # Only merge if two or more, else just copy + (file_path, group_path) = cooler_uris[0].split("::") + + with h5py.File(file_path, mode="r") as src: + src.copy(src[group_path], dest, group_path) + + else: + need_merging.append(viewpoint) + + # Actually merge the coolers left over that do have duplicates + for viewpoint in need_merging: + tmp = tempfile.NamedTemporaryFile().name + cooler_uris = coolers_to_merge[viewpoint] + cooler.merge_coolers( + f"{tmp}::/{viewpoint.replace('::', '/resolutions/')}", + cooler_uris, + mergebuf=int(1e6), + ) + + with h5py.File(tmp, mode="r") as src: + with h5py.File(output, mode="a") as dest: + dest.copy( + src[viewpoint.replace("::", "/resolutions/")], dest, viewpoint + ) + + metadata = get_merged_cooler_metadata(cooler_uris) + + with h5py.File(output, mode="a") as dest: + dest[viewpoint.replace("::", "/resolutions/")].attrs[ + "metadata" + ] = ujson.encode(metadata) + + # Reduce space by linking common tables (bins, chroms) + link_common_cooler_tables(output) diff --git a/capcruncher/cli/__init__.py b/capcruncher/cli/__init__.py index 62ac7188..8ca11d69 100644 --- a/capcruncher/cli/__init__.py +++ b/capcruncher/cli/__init__.py @@ -1,19 +1,7 @@ import click -import os from functools import cached_property from importlib import import_module, metadata -import subprocess -import warnings -import logging -import sys - - -# create logger -logger = logging.getLogger("capcruncher") -logger.setLevel(logging.INFO) -logging.basicConfig( - format="%(levelname)s:%(asctime)s %(module)-20s %(message)s", level=logging.INFO -) +from loguru import logger CONTEXT_SETTINGS = {"help_option_names": ["-h", "--help"]} @@ -63,16 +51,6 @@ def cli(): An end to end solution for processing: Capture-C, Tri-C and Tiled-C data. """ -from capcruncher.cli.cli_pipeline import pipeline - -# @cli.command() -# def pipeline(**kwargs): -# """ -# CapCruncher automated data processing pipeline methods. 
-# """ -# from capcruncher.cli.cli_pipeline import pipeline -# pipeline(**kwargs) - @cli.group(cls=LazyGroup, import_name="capcruncher.cli.cli_fastq:cli") def fastq(): @@ -93,8 +71,8 @@ def alignments(): """Alignment annotation, identification and deduplication.""" -@cli.group(cls=LazyGroup, import_name="capcruncher.cli.cli_reporters:cli") -def reporters(): +@cli.group(cls=LazyGroup, import_name="capcruncher.cli.cli_interactions:cli") +def interactions(): """Reporter counting, storing, comparison and pileups""" @@ -110,6 +88,10 @@ def utilities(): """Contains miscellaneous functions""" +# Finally, import the pipeline command from the pipeline module +import capcruncher.cli.cli_pipeline + + __all__ = [ "alignments_annotate", "alignments_deduplicate", diff --git a/capcruncher/cli/alignments_annotate.py b/capcruncher/cli/alignments_annotate.py index df72b423..b00285f5 100644 --- a/capcruncher/cli/alignments_annotate.py +++ b/capcruncher/cli/alignments_annotate.py @@ -1,19 +1,28 @@ import itertools -import logging +from loguru import logger +import os import sys import warnings from typing import Tuple, Union import numpy as np -from joblib.parallel import Parallel, delayed +import pandas as pd +import pysam +import ray +from pybedtools import BedTool, MalformedBedLineError + +from capcruncher.api.annotate import BedFileIntersection +from capcruncher.utils import ( + bed_has_name, + convert_bed_to_dataframe, + convert_bed_to_pr, + is_valid_bed, +) warnings.simplefilter("ignore") -import os -import pandas as pd -from capcruncher.tools.annotate import BedIntersection -from capcruncher.utils import bed_has_name, convert_bed_to_dataframe, is_valid_bed -from pybedtools import BedTool, MalformedBedLineError + +pysam.set_verbosity(0) def cycle_argument(arg): @@ -90,24 +99,19 @@ def remove_duplicates_from_bed( ) -def get_intersection(intersector: BedIntersection): - return intersector.get_intersection() - - def annotate( slices: os.PathLike, actions: Tuple = None, bed_files: Tuple = None, names: Tuple = None, - dtypes: Tuple[bool] = None, overlap_fractions: Tuple = None, output: os.PathLike = None, duplicates: str = "remove", n_cores: int = 1, - invalid_bed_action: str = "error", blacklist: str = "", prioritize_cis_slices: bool = False, priority_chroms: str = "", + **kwargs, ): """ Annotates a bed file with other bed files using bedtools intersect. @@ -137,7 +141,7 @@ def annotate( NotImplementedError: Only supported option for duplicate bed names is remove. 
""" - logging.info("Validating commandline arguments") + logger.info("Validating commandline arguments") len_bed_files = len(bed_files) if not all([len(arg) == len_bed_files for arg in [actions, names]]): raise ValueError( @@ -145,19 +149,19 @@ def annotate( ) if slices == "-": - logging.info("Reading slices from stdin") + logger.info("Reading slices from stdin") slices = pd.read_csv(sys.stdin, sep="\t", header=None).pipe( BedTool.from_dataframe ) elif slices.endswith(".bam"): - logging.info("Converting bam to bed") + logger.info("Converting bam to bed") slices = BedTool(slices).bam_to_bed() else: slices = BedTool(slices) - logging.info("Validating input bed file before annotation") + logger.info("Validating input bed file before annotation") if not is_valid_bed(slices): raise ValueError(f"bed - {slices} is invalid") @@ -165,83 +169,70 @@ def annotate( raise ValueError(f"bed - {slices} does not have a name column") if blacklist: - logging.info("Removing blacklisted regions from the bed file") + logger.info("Removing blacklisted regions from the bed file") try: slices = slices - BedTool(blacklist) except (MalformedBedLineError, FileNotFoundError, IndexError) as e: - logging.error(f"Blacklist {blacklist} bedfile raised {e}. Ensure it is correctly formatted") - + logger.error( + f"Blacklist {blacklist} bedfile raised {e}. Ensure it is correctly formatted" + ) - logging.info("Dealing with duplicates in the bed file") + logger.info("Dealing with duplicates in the bed file") # Deal with multimapping reads. if duplicates == "remove": - slices = remove_duplicates_from_bed(slices, - prioritize_cis_slices=prioritize_cis_slices, - chroms_to_prioritize=priority_chroms.split(",") if priority_chroms else None) + slices = remove_duplicates_from_bed( + slices, + prioritize_cis_slices=prioritize_cis_slices, + chroms_to_prioritize=priority_chroms.split(",") + if priority_chroms + else None, + ) else: raise NotImplementedError( "Only supported option at present is to remove duplicates" ) - - logging.info("Ensuring bed file is sorted") - slices = slices.sort() - logging.info("Performing intersection") - intersections_to_perform = [] - for bed, name, action, fraction, dtype in zip( + logger.info("Setting-up intersection(s)") + ray.init(num_cpus=n_cores, ignore_reinit_error=True, include_dashboard=False) + pr_slices = convert_bed_to_pr(slices) + pr_slices_ref = ray.put(pr_slices) + + futures = [] + for bed, name, action, fraction in zip( bed_files, names, actions, cycle_argument(overlap_fractions), - cycle_argument(dtypes), ): - intersections_to_perform.append( - BedIntersection( - bed1=slices, - bed2=bed, - intersection_name=name, - intersection_method=action, - intersection_min_frac=fraction, - invalid_bed_action=invalid_bed_action, - dtype=dtype, - ) + logger.info(f"Setting-up intersection for {bed}") + bfi = BedFileIntersection.remote( + bed_a=pr_slices, + bed_b=bed, + name=name, + action=action, + fraction=fraction, ) + futures.append(bfi.intersection.remote()) - logging.debug(intersections_to_perform) + # Collate results + df_annotation = pr_slices.df.set_index("Name") - intersections_results = Parallel(n_jobs=n_cores)( - delayed(get_intersection)(intersection) - for intersection in intersections_to_perform - ) + while len(futures): + done_id, futures = ray.wait(futures) - logging.info("Merging annotations") + for ref in done_id: + ser_new = ray.get(ref) + df_annotation = df_annotation.join(ser_new, how="left") + del ser_new - # Merge intersections with slices + # Format dataframe for next stage 
df_annotation = ( - convert_bed_to_dataframe(slices) - .rename(columns={"name": "slice_name"}) + df_annotation.reset_index() + .rename(columns={"Name": "slice_name"}) .set_index("slice_name") - .sort_index() - .join(intersections_results, how="left") .reset_index() - .rename(columns={"index": "slice_name"}) ) - del intersections_results - logging.info("Writing annotations to file.") - - df_annotation.loc[ - :, lambda df: df.select_dtypes("number").columns - ] = df_annotation.select_dtypes("number").astype("float") - - # Export to tsv - if output.endswith(".tsv"): - df_annotation.to_csv(output, sep="\t", index=False) - elif output.endswith(".hdf5"): - # Need to convert dtypes to ones that are supported - df_annotation.to_hdf( - output, key="/annotation", format="table", complib="blosc", complevel=2 - ) - elif output.endswith(".parquet"): - df_annotation.to_parquet(output, compression="snappy") + logger.info("Writing annotations to file.") + df_annotation.to_parquet(output, compression="snappy") diff --git a/capcruncher/cli/alignments_deduplicate.py b/capcruncher/cli/alignments_deduplicate.py deleted file mode 100644 index 7b0fabd6..00000000 --- a/capcruncher/cli/alignments_deduplicate.py +++ /dev/null @@ -1,208 +0,0 @@ -import logging -from typing import Iterable, List, Literal, Tuple -import dask -import pandas as pd -import xopen -import ujson -import os -import numpy as np -import dask.distributed - -from capcruncher.tools.deduplicate import ( - identify_coordinate_duplicates_from_tsv, - identify_coordinate_duplicates_from_hdf5, - identify_coordinate_duplicates_from_parquet, - remove_duplicates_from_parquet, - remove_duplicates_from_tsv, - remove_duplicates_from_hdf5, - read_duplicated_ids, -) -from capcruncher.utils import get_file_type - - -def identify( - fragments: tuple, - file_type: str = "auto", - output: os.PathLike = "duplicated_ids.json", - viewpoint: str = "", - buffer: int = 1e6, - read_type: str = "flashed", - n_cores: int = 1, - memory_limit: str = "1G", -): - """ - Identifies aligned fragments with duplicate coordinates. - - Parses a tsv file containing filtered aligned fragments and generates a dictionary containing - the hashed parental read id and hashed genomic coordinates of all slices. Duplicated fragments - are implicitly removed if they share the same genomic coordinate hash. - - For non-combined reads (pe) a genomic coordinate hash is generated from the start of the first slice and - the end of the last slice. This is due to the decreased confidence in the quality of the centre of the fragment. - The coordinate hash for combined reads (flashed) is generated directly from the fragment coordinates. Only - fragments with the exact coordinates and slice order will be considered to be duplicates. - - Identified duplicate fragments are output in json format to be used by the "remove" subcommand. - - \f - Args: - fragments_fn (os.PathLike): Input fragments.tsv file to process. - output (os.PathLike, optional): Output path to output duplicated parental read ids. Defaults to "duplicated_ids.json". - buffer (int, optional): Number of fragments to process in memory. Defaults to 1e6. - read_type (str, optional): Process combined(flashed) or non-combined reads (pe). - Due to the low confidence in the quaility of pe reads, duplicates are identified by - removing any fragments with matching start and end coordinates. - Defaults to "flashed". 
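The docstring being deleted here is still the clearest statement of how coordinate-based duplicate detection works: combined (flashed) reads are keyed on their full, ordered slice coordinates, whereas non-combined (pe) reads are keyed only on the outer coordinates of the fragment. A toy sketch of that keying scheme, using plain (chrom, start, end) tuples; this illustrates the idea only and is not the package's implementation.

def coordinate_key(slices: list[tuple[str, int, int]], flashed: bool) -> str:
    """Toy duplicate key as described in the docstring above (illustration only)."""
    if flashed:
        # Combined reads: the whole ordered coordinate string must match
        return "|".join(f"{chrom}:{start}-{end}" for chrom, start, end in slices)
    # Non-combined (pe) reads: only the outer coordinates are trusted
    (chrom_first, start_first, _), (chrom_last, _, end_last) = slices[0], slices[-1]
    return f"{chrom_first}:{start_first}|{chrom_last}:{end_last}"

frag_a = [("chr14", 100, 200), ("chr14", 950, 1100)]
frag_b = [("chr14", 100, 250), ("chr14", 900, 1100)]
print(coordinate_key(frag_a, flashed=True) == coordinate_key(frag_b, flashed=True))    # False
print(coordinate_key(frag_a, flashed=False) == coordinate_key(frag_b, flashed=False))  # True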
- """ - - input_file_type = get_file_type(fragments[0]) if file_type == "auto" else file_type - output_file_type = get_file_type(output) - - if input_file_type in ["hdf5", "parquet"]: - # Will use dask for these - cluster = dask.distributed.LocalCluster( - n_workers=n_cores, - threads_per_worker=1, - dashboard_address=None, - processes=True, - scheduler_port=0, - local_directory=os.environ.get("TMPDIR", "/tmp/"), - memory_limit=memory_limit, - ) - - with dask.distributed.Client(cluster) as client: - - # Extract duplicated fragment ids - if input_file_type == "tsv": - if len(fragments) > 1: - raise NotImplementedError("Currently just supports a single tsv input") - else: - duplicated_fragments = identify_coordinate_duplicates_from_tsv( - fragments[0], read_type=read_type, buffer=buffer - ) - elif input_file_type == "hdf5": - outfile = f"{output.replace('.hdf5', '')}.hdf5" - if os.path.exists(outfile): - os.remove(outfile) - - duplicated_fragments = identify_coordinate_duplicates_from_hdf5( - fragments, read_type=read_type - ) - - elif input_file_type == "parquet": - duplicated_fragments = identify_coordinate_duplicates_from_parquet( - fragments, read_type=read_type - ) - - else: - raise ValueError(f"Input file type {file_type} not supported") - - # Output - if output_file_type == "json": - with xopen.xopen(output, "w") as w: - ujson.dump(dict.fromkeys(duplicated_fragments), w) - - elif output_file_type == "hdf5": - - duplicated_fragments.to_hdf( - outfile, f"/duplicated_ids", min_itemsize={"id": 25}, index=False - ) - - elif output_file_type == "pickle": - duplicated_fragments = duplicated_fragments.compute() - duplicated_fragments.to_pickle(output) - - -def remove( - slices: os.PathLike, - duplicated_ids: os.PathLike, - output: os.PathLike = "dedup.slices.tsv.gz", - buffer: int = 5e6, - sample_name: str = "", - read_type: str = "", - stats_prefix: os.PathLike = "", - file_type: str = "hdf5", - n_cores: int = 1, - memory_limit: str = "1G", -): - """ - Removes duplicated aligned fragments. - - Parses a tsv file containing aligned read slices and outputs only slices from unique fragments. - Duplicated parental read id determined by the "identify" subcommand are located within the - slices tsv file and removed. - - Outputs statistics for the number of unique slices and the number of duplicate slices identified. - - \f - Args: - slices_fn (os.PathLike): Input slices.tsv file. - duplicated_ids (os.PathLike): Duplicated parental read ids in json format. - output (os.PathLike, optional): Output file path for deduplicated slices. Defaults to "dedup.slices.tsv.gz". - buffer (int, optional): Number of slices to process in memory. Defaults to 1e6. - sample_name (str, optional): Name of sample being processed e.g. DOX-treated_1 used for statistics. Defaults to "". - read_type (str, optional): Process combined(flashed) or non-combined reads (pe) used for statistics. Defaults to "". - stats_prefix (os.PathLike, optional): Output path for deduplication statistics. Defaults to "". 
- """ - - input_file_type = ( - get_file_type(slices if isinstance(slices, str) else slices[0]) - if file_type == "auto" - else file_type - ) - output_file_type = get_file_type(output) - - duplicates = read_duplicated_ids(duplicated_ids) - - if input_file_type in ["hdf5", "parquet"]: - # Will use dask for these - cluster = dask.distributed.LocalCluster( - n_workers=n_cores, - threads_per_worker=1, - dashboard_address=None, - processes=True, - scheduler_port=0, - local_directory=os.environ.get("TMPDIR", "/tmp/"), - memory_limit=memory_limit, - ) - with dask.distributed.Client(cluster) as client: - - if input_file_type == "tsv": - - if not output_file_type == "tsv": - raise NotImplementedError("Currently only tsv -> tsv output supported") - - n_reads_total, n_reads_unique = remove_duplicates_from_tsv( - slices, output, duplicates, buffer=buffer - ) - - elif input_file_type == "hdf5": - - if not output_file_type == "hdf5": - raise NotImplementedError("Currently only hdf5 -> hdf5 output supported") - - n_reads_total, n_reads_unique = remove_duplicates_from_hdf5( - slices, duplicates, output - ) - - elif input_file_type == "parquet": - if not output_file_type == "parquet": - raise NotImplementedError( - "Currently only parquet -> parquet output supported" - ) - - n_reads_total, n_reads_unique = remove_duplicates_from_parquet( - slices, duplicates, output - ) - - # Prepare stats - df_stats = pd.DataFrame() - df_stats["stat_type"] = ["not-deduplicated", "deduplicated"] - df_stats["stat"] = [n_reads_total, n_reads_unique] - df_stats["sample"] = sample_name - df_stats["read_type"] = read_type - df_stats["read_number"] = 0 - df_stats["stage"] = "deduplicate_slices" - df_stats.to_csv(f"{stats_prefix}.read.stats.csv", index=False) - - return df_stats diff --git a/capcruncher/cli/alignments_filter.py b/capcruncher/cli/alignments_filter.py index 781d5170..c5a192ee 100644 --- a/capcruncher/cli/alignments_filter.py +++ b/capcruncher/cli/alignments_filter.py @@ -1,11 +1,10 @@ import os -from typing import Literal import pandas as pd -import logging +from loguru import logger -from capcruncher.tools.io import parse_bam -from capcruncher.tools.filter import CCSliceFilter, TriCSliceFilter, TiledCSliceFilter +from capcruncher.api.io import parse_bam +from capcruncher.api.filter import CCSliceFilter, TriCSliceFilter, TiledCSliceFilter SLICE_FILTERS = { "capture": CCSliceFilter, @@ -14,8 +13,7 @@ } -# @get_timing(task_name="merging annotations with BAM input") -def merge_annotations(df: pd.DataFrame, annotations: os.PathLike) -> pd.DataFrame: +def merge_annotations(slices: pd.DataFrame, annotations: os.PathLike) -> pd.DataFrame: """Combines annotations with the parsed bam file output. 
Uses pandas outer join on the indexes to merge annotations @@ -26,45 +24,29 @@ def merge_annotations(df: pd.DataFrame, annotations: os.PathLike) -> pd.DataFram Args: - df (pd.DataFrame): Dataframe to merge with annotations - annotations (os.PathLike): Filename of .tsv to read and merge with df + slices (pd.DataFrame): Dataframe to merge with annotations + annotations (os.PathLike): Filename of to read and merge with df Returns: pd.DataFrame: Merged dataframe """ - if annotations.endswith(".tsv"): - df_ann = pd.read_csv( - annotations, - sep="\t", - header=0, - index_col=["slice_name", "chrom", "start"], - low_memory=False, - ) - - elif annotations.endswith(".hdf5"): - df_ann = pd.read_hdf(annotations, "annotation").set_index( - ["slice_name", "chrom", "start"] - ) - elif annotations.endswith(".parquet"): - df_ann = pd.read_parquet(annotations).set_index( - ["slice_name", "chrom", "start"] - ) - - df_ann = df_ann.drop(columns="end", errors="ignore") - - df = df.join(df_ann, how="inner").reset_index() + df_ann = ( + pd.read_parquet(annotations) + .rename(columns={"Chromosome": "chrom", "Start": "start", "End": "end"}) + .set_index(["slice_name", "chrom", "start"]) + .drop(columns="end", errors="ignore") + ) + slices = slices.join(df_ann, how="inner").reset_index() - return df + return slices -# @get_timing(task_name="analysis of bam file") def filter( bam: os.PathLike, annotations: os.PathLike, custom_filtering: os.PathLike = None, output_prefix: os.PathLike = "reporters", - output_format: Literal["tsv", "hdf5", "parquet"] = "parquet", stats_prefix: os.PathLike = "", method: str = "capture", sample_name: str = "", @@ -127,103 +109,86 @@ def filter( cis_and_trans_stats (bool, optional): Enables cis/trans statistics to be output. Defaults to True. """ - # Read bam file and merege annotations - logging.info("Loading bam file") - df_alignment = parse_bam(bam) - logging.info("Merging bam file with annotations") - df_alignment = merge_annotations(df_alignment, annotations) - - # Initialise SliceFilter - # If no custom filtering, will use the class default. - slice_filter_class = SLICE_FILTERS[method] - slice_filter = slice_filter_class( - slices=df_alignment, - sample_name=sample_name, - read_type=read_type, - filter_stages=custom_filtering, - ) - - # Filter slices using the slice_filter - logging.info(f"Filtering slices with method: {method}") - slice_filter.filter_slices() - - if slice_stats: - slice_filter.filter_stats.to_csv(f"{stats_prefix}.slice.stats.csv", index=False) - - if read_stats: - slice_filter.read_stats.to_csv(f"{stats_prefix}.read.stats.csv", index=False) - - # Save reporter stats - if cis_and_trans_stats: - logging.info(f"Writing reporter statistics") - slice_filter.cis_or_trans_stats.to_csv( - f"{stats_prefix}.reporter.stats.csv", index=False + with logger.catch(): + # Read bam file and merege annotations + logger.info("Loading bam file") + df_alignment = parse_bam(bam) + logger.info("Merging bam file with annotations") + df_alignment = merge_annotations(df_alignment, annotations) + + if "blacklist" not in df_alignment.columns: + df_alignment["blacklist"] = 0 + + # Initialise SliceFilter + # If no custom filtering, will use the class default. 
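merge_annotations above now expects a single parquet file, renames the PyRanges-style columns back to chrom/start/end, and inner-joins on the (slice_name, chrom, start) index, so any slice without an annotation row is dropped. A minimal pandas illustration of that join behaviour with made-up values:

import pandas as pd

slices = pd.DataFrame(
    {"slice_name": ["r1|0", "r1|1"], "chrom": ["chr14", "chr14"], "start": [100, 500], "mapq": [42, 42]}
).set_index(["slice_name", "chrom", "start"])

annotations = pd.DataFrame(
    {"slice_name": ["r1|0"], "chrom": ["chr14"], "start": [100], "capture": ["Slc25A37"]}
).set_index(["slice_name", "chrom", "start"])

# Inner join on the shared index: only the annotated slice survives
print(slices.join(annotations, how="inner").reset_index())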
+ slice_filter_class = SLICE_FILTERS[method] + slice_filter = slice_filter_class( + slices=df_alignment, + sample_name=sample_name, + read_type=read_type, + filter_stages=custom_filtering, ) - # Output slices filtered by viewpoint + # Filter slices using the slice_filter + logger.info(f"Filtering slices with method: {method}") + slice_filter.filter_slices() + + if slice_stats: + slice_stats_path = f"{stats_prefix}.slice.stats.csv" + logger.info(f"Writing slice statistics to {slice_stats_path}") + slice_filter.filter_stats.to_csv(slice_stats_path, index=False) + + if read_stats: + read_stats_path = f"{stats_prefix}.read.stats.csv" + logger.info(f"Writing read statistics to {read_stats_path}") + slice_filter.read_stats.to_csv(read_stats_path, index=False) + + # Save reporter stats + if cis_and_trans_stats: + logger.info("Writing reporter statistics") + slice_filter.cis_or_trans_stats.to_csv( + f"{stats_prefix}.reporter.stats.csv", index=False + ) - df_slices = slice_filter.slices - df_slices_with_viewpoint = slice_filter.slices_with_viewpoint - df_capture = slice_filter.captures + # Output slices filtered by viewpoint - if fragments: - logging.info(f"Writing reporters at the fragment level") - df_fragments = ( - slice_filter_class(df_slices) - .fragments.join( - df_capture["capture"], lsuffix="_slices", rsuffix="_capture" - ) - .rename( - columns={"capture_slices": "capture", "capture_capture": "viewpoint"} + df_slices = slice_filter.slices + df_slices_with_viewpoint = slice_filter.slices_with_viewpoint + df_capture = slice_filter.captures + + if fragments: + logger.info("Writing reporters at the fragment level") + df_fragments = ( + slice_filter_class(df_slices) + .fragments.join( + df_capture["capture"], lsuffix="_slices", rsuffix="_capture" + ) + .rename( + columns={ + "capture_slices": "capture", + "capture_capture": "viewpoint", + } + ) + .assign(id=lambda df: df["id"].astype("int64")) # Enforce type ) - .assign(id=lambda df: df["id"].astype("int64")) # Enforce type - ) - if output_format == "tsv": - df_fragments.to_csv(f"{output_prefix}.fragments.tsv", sep="\t", index=False) - elif output_format == "hdf5": - df_fragments.to_hdf( - f"{output_prefix}.slices.hdf5", - key="fragments", - data_columns=["viewpoint"], - format="table", - complib="blosc", - complevel=2, + df_fragments.to_parquet( + f"{output_prefix}.fragments.parquet", + compression="snappy", + engine="pyarrow", ) - elif output_format == "parquet": - if not df_fragments.empty: - df_fragments.to_parquet( - f"{output_prefix}.fragments.parquet", - compression="snappy", - engine="pyarrow", - ) - logging.info(f"Writing reporters slices") - # Enforce dtype for parent_id - df_slices_with_viewpoint = df_slices_with_viewpoint.assign( - parent_id=lambda df: df["parent_id"].astype("int64") - ) + logger.info("Writing reporters slices") - if output_format == "tsv": - df_slices_with_viewpoint.to_csv( - f"{output_prefix}.slices.tsv", sep="\t", index=False - ) - elif output_format == "hdf5": - df_slices_with_viewpoint.to_hdf( - f"{output_prefix}.slices.hdf5", - key="slices", - data_columns=["viewpoint"], - format="table", - complib="blosc", - complevel=2, + # Enforce dtype for parent_id + df_slices_with_viewpoint = df_slices_with_viewpoint.assign( + parent_id=lambda df: df["parent_id"].astype("int64") ) - elif output_format == "parquet": - if not df_slices_with_viewpoint.empty: - df_slices_with_viewpoint.to_parquet( - f"{output_prefix}.slices.parquet", - compression="snappy", - engine="pyarrow", - ) + df_slices_with_viewpoint.to_parquet( + 
f"{output_prefix}.slices.parquet", + compression="snappy", + engine="pyarrow", + ) - logging.info(f"Completed analysis of bam file") + logger.info("Completed analysis of BAM file") diff --git a/capcruncher/cli/cli_alignments.py b/capcruncher/cli/cli_alignments.py index 86547fd5..15b9b7c8 100644 --- a/capcruncher/cli/cli_alignments.py +++ b/capcruncher/cli/cli_alignments.py @@ -49,7 +49,7 @@ def cli(): "-o", "--output", help="Path for the annotated slices to be output.", - default="annotated.slices.tsv.gz", + default="annotated.slices.parquet", ) @click.option( "--duplicates", @@ -160,12 +160,6 @@ def annotate(*args, **kwargs): help="Determines cis/trans statistics are output", default=True, ) -@click.option( - "--output-format", - help="Determines file output format", - default="parquet", - type=click.Choice(["tsv", "hdf5", "parquet"]), -) def filter(*args, **kwargs): """ Removes unwanted aligned slices and identifies reporters. @@ -179,129 +173,114 @@ def filter(*args, **kwargs): filter(*args, **kwargs) -@cli.group() -def deduplicate(): - """ - Identifies and removes duplicated aligned fragments. - - PCR duplicates are very commonly present in Capture-C/Tri-C/Tiled-C data and must be removed - for accurate analysis. Unlike fastq deduplicate, this command removes fragments with identical - genomic coordinates. - - Non-combined (pe) and combined (flashed) reads are treated slightly differently due to the increased - confidence that the ligation junction has been captured for the flashed reads. - - """ - - -@deduplicate.command() -@click.argument("fragments", nargs=-1, required=True) -@click.option( - "-v", - "--viewpoint", - help="Viewpoint to process, leave blank for all viewpoints", - default="", -) -@click.option( - "-t", - "--file-type", - help="File format for input", - default="auto", - type=click.Choice(["auto", "tsv", "hdf5", "parquet"], case_sensitive=False), -) -@click.option( - "-o", - "--output", - help="Path for outputting fragments with duplicated coordinates in json format.", - default="duplicated_ids.pickle", -) -@click.option( - "--buffer", - help="Number of fragments to process at one time in order to preserve memory.", - default=1e6, - type=click.INT, -) -@click.option( - "--read-type", - help="Indicates if the fragments have been combined (flashed) or not (pe).", - default="flashed", - type=click.Choice(["flashed", "pe"], case_sensitive=False), -) -@click.option( - "-p", - "--n_cores", - help="Number of parallel processes to use for deduplication", - default=1, -) -@click.option( - "--memory-limit", - help="Maximum amount of memory to use.", - default="1G", -) -def identify(*args, **kwargs): - from capcruncher.cli.alignments_deduplicate import identify +# @deduplicate.command() +# @click.argument("fragments", nargs=-1, required=True) +# @click.option( +# "-v", +# "--viewpoint", +# help="Viewpoint to process, leave blank for all viewpoints", +# default="", +# ) +# @click.option( +# "-t", +# "--file-type", +# help="File format for input", +# default="auto", +# type=click.Choice(["auto", "tsv", "hdf5", "parquet"], case_sensitive=False), +# ) +# @click.option( +# "-o", +# "--output", +# help="Path for outputting fragments with duplicated coordinates in json format.", +# default="duplicated_ids.pickle", +# ) +# @click.option( +# "--buffer", +# help="Number of fragments to process at one time in order to preserve memory.", +# default=1e6, +# type=click.INT, +# ) +# @click.option( +# "--read-type", +# help="Indicates if the fragments have been combined (flashed) or not (pe).", 
+# default="flashed", +# type=click.Choice(["flashed", "pe"], case_sensitive=False), +# ) +# @click.option( +# "-p", +# "--n_cores", +# help="Number of parallel processes to use for deduplication", +# default=1, +# ) +# @click.option( +# "--memory-limit", +# help="Maximum amount of memory to use.", +# default="1G", +# ) +# def identify(*args, **kwargs): +# from capcruncher.cli.alignments_deduplicate import identify - identify(*args, **kwargs) +# identify(*args, **kwargs) -@deduplicate.command() -@click.argument("slices", nargs=-1, required=True) -@click.option( - "-d", - "--duplicated_ids", - help="Path to duplicated fragment ids determined by the 'identify' subcommand.", -) -@click.option( - "-o", - "--output", - help="Path for outputting deduplicated slices in tsv format.", - default="slices_dedup.hdf5", -) -@click.option( - "-t", - "--file-type", - help="File format for input", - default="auto", - type=click.Choice(["auto", "tsv", "hdf5", "parquet"], case_sensitive=False), -) -@click.option( - "--buffer", - help="Number of fragments to process at one time, in order to preserve memory.", - default=1e6, - type=click.INT, -) -@click.option("--stats-prefix", help="Output prefix for deduplication statistics") -@click.option( - "--sample-name", - help="Name of sample being analysed e.g. DOX_treated_1. Required for correct statistics.", -) -@click.option( - "--read-type", - help="Indicates if the fragments have been combined (flashed) or not (pe). Required for correct statistics.", - default="flashed", - type=click.Choice(["flashed", "pe"], case_sensitive=False), -) -@click.option( - "-p", - "--n_cores", - help="Number of parallel processes to use for deduplication", - default=1, -) -@click.option( - "--memory-limit", - help="Maximum amount of memory to use.", - default="1G", -) -def remove(*args, **kwargs): - """ - Removes duplicated aligned fragments. +# @deduplicate.command() +# @click.argument("slices", nargs=-1, required=True) +# @click.option( +# "-d", +# "--duplicated_ids", +# help="Path to duplicated fragment ids determined by the 'identify' subcommand.", +# ) +# @click.option( +# "-o", +# "--output", +# help="Path for outputting deduplicated slices in tsv format.", +# default="slices_dedup.hdf5", +# ) +# @click.option( +# "-t", +# "--file-type", +# help="File format for input", +# default="auto", +# type=click.Choice(["auto", "tsv", "hdf5", "parquet"], case_sensitive=False), +# ) +# @click.option( +# "--buffer", +# help="Number of fragments to process at one time, in order to preserve memory.", +# default=1e6, +# type=click.INT, +# ) +# @click.option("--stats-prefix", help="Output prefix for deduplication statistics") +# @click.option( +# "--sample-name", +# help="Name of sample being analysed e.g. DOX_treated_1. Required for correct statistics.", +# ) +# @click.option( +# "--read-type", +# help="Indicates if the fragments have been combined (flashed) or not (pe). Required for correct statistics.", +# default="flashed", +# type=click.Choice(["flashed", "pe"], case_sensitive=False), +# ) +# @click.option( +# "-p", +# "--n_cores", +# help="Number of parallel processes to use for deduplication", +# default=1, +# ) +# @click.option( +# "--memory-limit", +# help="Maximum amount of memory to use.", +# default="1G", +# ) +# def remove(*args, **kwargs): +# """ +# Removes duplicated aligned fragments. - Parses a tsv file containing aligned read slices and outputs only slices from unique fragments. 
- Duplicated parental read id determined by the "identify" subcommand are located within the - slices tsv file and removed. +# Parses a tsv file containing aligned read slices and outputs only slices from unique fragments. +# Duplicated parental read id determined by the "identify" subcommand are located within the +# slices tsv file and removed. - Outputs statistics for the number of unique slices and the number of duplicate slices identified. - """ - from capcruncher.cli.alignments_deduplicate import remove +# Outputs statistics for the number of unique slices and the number of duplicate slices identified. +# """ +# from capcruncher.cli.alignments_deduplicate import remove - remove(*args, **kwargs) +# remove(*args, **kwargs) diff --git a/capcruncher/cli/cli_fastq.py b/capcruncher/cli/cli_fastq.py index 6b6a4beb..e27811ec 100644 --- a/capcruncher/cli/cli_fastq.py +++ b/capcruncher/cli/cli_fastq.py @@ -1,7 +1,7 @@ -from os import name import click from capcruncher.cli import UnsortedGroup + @click.group() def cli(): """Contains methods for fastq splitting, deduplicating and digestion.""" @@ -39,6 +39,12 @@ def cli(): "--gzip/--no-gzip", help="Determines if files are gziped or not", default=False ) @click.option("-p", "--n_cores", default=1, type=click.INT) +@click.option( + "-s", + "--suffix", + help="Suffix to add to output files (ignore {read_number}.fastq as this is added automatically)", + default="", +) def split(*args, **kwargs): """ Splits fastq file(s) into equal chunks of n reads. @@ -46,11 +52,18 @@ def split(*args, **kwargs): """ from capcruncher.cli.fastq_split import split + split(*args, **kwargs) + @cli.command() @click.argument("input_fastq", nargs=-1, required=True) -@click.option("-r", "--restriction_enzyme", help="Restriction enzyme name or sequence to use for in silico digestion.", required=True) +@click.option( + "-r", + "--restriction_enzyme", + help="Restriction enzyme name or sequence to use for in silico digestion.", + required=True, +) @click.option( "-m", "--mode", @@ -76,13 +89,16 @@ def split(*args, **kwargs): ) @click.option("--stats-prefix", help="Output prefix for stats file", default="stats") @click.option( - "--sample-name", help="Name of sample e.g. DOX_treated_1. Required for correct statistics.", default="sampleX" + "--sample-name", + help="Name of sample e.g. DOX_treated_1. Required for correct statistics.", + default="sampleX", ) def digest(*args, **kwargs): """ Performs in silico digestion of one or a pair of fastq files. """ from capcruncher.cli.fastq_digest import digest + digest(*args, **kwargs) @@ -97,7 +113,8 @@ def deduplicate(): """ -@deduplicate.command(name='parse') + +@deduplicate.command(name="parse") @click.argument("input_files", nargs=-1, required=True) @click.option( "-o", @@ -118,15 +135,16 @@ def deduplicate_parse(*args, **kwargs): This command parses one or more fastq files and generates a dictionary containing hashed read identifiers together with hashed concatenated sequences. The hash dictionary - is output in json format and the identify subcommand can be used to determine which read identifiers + is output in json format and the identify subcommand can be used to determine which read identifiers have duplicate sequences. 
""" from capcruncher.cli.fastq_deduplicate import parse + parse(*args, **kwargs) -@deduplicate.command(name='identify') +@deduplicate.command(name="identify") @click.argument( "input_files", nargs=-1, @@ -139,7 +157,7 @@ def deduplicate_identify(*args, **kwargs): """ Identifies fragments with duplicated sequences. - Merges the hashed dictionaries (in json format) generated by the "parse" subcommand and + Merges the hashed dictionaries (in json format) generated by the "parse" subcommand and identifies read with exactly the same sequence (share an identical hash). Duplicated read identifiers (hashed) are output in json format. The "remove" subcommand uses this dictionary to remove duplicates from fastq files. @@ -147,10 +165,11 @@ def deduplicate_identify(*args, **kwargs): """ from capcruncher.cli.fastq_deduplicate import identify + identify(*args, **kwargs) - -@deduplicate.command(name='remove') + +@deduplicate.command(name="remove") @click.argument("input_files", nargs=-1) @click.option( "-o", @@ -170,9 +189,7 @@ def deduplicate_identify(*args, **kwargs): type=click.INT, ) @click.option( - "--gzip/--no-gzip", - help="Determines if files are gziped or not", - default=False + "--gzip/--no-gzip", help="Determines if files are gziped or not", default=False ) @click.option( "--compression_level", @@ -180,21 +197,26 @@ def deduplicate_identify(*args, **kwargs): default=5, type=click.INT, ) -@click.option("--sample-name", help="Name of sample e.g. DOX_treated_1", default='sampleX') -@click.option("--stats-prefix", help="Output prefix for stats file", default='stats') -@click.option("--hash-read-name/--no-hash-read-name", help="Hashes the read id to save memory", default=False) +@click.option( + "--sample-name", help="Name of sample e.g. DOX_treated_1", default="sampleX" +) +@click.option("--stats-prefix", help="Output prefix for stats file", default="stats") +@click.option( + "--hash-read-name/--no-hash-read-name", + help="Hashes the read id to save memory", + default=False, +) @click.option("-p", "--n_cores", default=1, type=click.INT) def deduplicate_remove(*args, **kwargs): """ Removes fragments with duplicated sequences from fastq files. - + Parses input fastq files and removes any duplicates from the fastq file(s) that are - present in the json file supplied. This json dictionary should be produced by the - "identify" subcommand. + present in the json file supplied. This json dictionary should be produced by the + "identify" subcommand. Statistics for the number of duplicated and unique reads are also provided. 
""" from capcruncher.cli.fastq_deduplicate import remove - remove(*args, **kwargs) - + remove(*args, **kwargs) diff --git a/capcruncher/cli/cli_genome.py b/capcruncher/cli/cli_genome.py index 88ba761a..91ef621c 100644 --- a/capcruncher/cli/cli_genome.py +++ b/capcruncher/cli/cli_genome.py @@ -42,4 +42,4 @@ def digest(*args, **kwargs): """ from capcruncher.cli.genome_digest import digest - digest(*args, **kwargs) \ No newline at end of file + digest(*args, **kwargs) diff --git a/capcruncher/cli/cli_reporters.py b/capcruncher/cli/cli_interactions.py similarity index 71% rename from capcruncher/cli/cli_reporters.py rename to capcruncher/cli/cli_interactions.py index 919b7045..4b5a5e30 100644 --- a/capcruncher/cli/cli_reporters.py +++ b/capcruncher/cli/cli_interactions.py @@ -1,5 +1,4 @@ import click -from numpy import multiply @click.group() @@ -8,76 +7,43 @@ def cli(): @cli.command() -@click.argument("union_bedgraph") -@click.option( - "-n", - "--capture_name", - help="Name of capture probe, must be present in viewpoint file.", - required=True, -) -@click.option( - "-c", - "--capture_viewpoints", - help="Path to capture viewpoints bed file", - required=True, -) +@click.argument("slices", required=True) @click.option( "-o", - "--output_prefix", - help="Output prefix for pairwise statistical comparisons", - default="out", -) -@click.option( - "--design_matrix", - help="Path tsv file containing sample annotations (N_SAMPLES * N_INFO_COLUMNS)", - default=None, -) -@click.option( - "--grouping_col", help="Column to use for grouping replicates", default="condition" + "--output", + help="Output prefix for directory of deduplicated slices", + default="deduplicated_slices/", ) @click.option( - "--threshold_count", - help="Minimum count required to be considered for analysis", - default=20, - type=click.FLOAT, + "--stats-prefix", + help="Output prefix for stats file(s)", + default="", ) @click.option( - "--threshold_q", - help="Upper threshold of q-value required for output.", - default=0.05, - type=click.FLOAT, + "--sample-name", help="Name of sample e.g. DOX_treated_1", default="sample" ) @click.option( - "--threshold_mean", - help="Minimum mean count required for output.", - default=0, - type=click.FLOAT, + "--read-type", + help="Type of read", + default="flashed", + type=click.Choice(["flashed", "pe"], case_sensitive=False), ) -def differential(*args, **kwargs): +def deduplicate(*args, **kwargs): """ - Identifies differential interactions between conditions. + Identifies and removes duplicated aligned fragments. - Parses a union bedgraph containg reporter counts from at least two conditions with - two or more replicates for a single capture probe and outputs differential interaction - results. Following filtering to ensure that the number of interactions is above the required - threshold (--threshold_count), diffxpy is used to run a wald test after - fitting a negative binomial model to the interaction counts.The options to filter - results can be filtered by a minimum mean value (threshold_mean) and/or - maximum q-value (threshold-q) are also provided. + PCR duplicates are very commonly present in Capture-C/Tri-C/Tiled-C data and must be removed + for accurate analysis. Unlike fastq deduplicate, this command removes fragments with identical + genomic coordinates. - Notes: + Non-combined (pe) and combined (flashed) reads are treated slightly differently due to the increased + confidence that the ligation junction has been captured for the flashed reads. 
- Currently both the capture viewpoints and the name of the probe being analysed must - be provided in order to correctly extract cis interactions. - - If a N_SAMPLE * METADATA design matrix has not been supplied, the script - assumes that the standard replicate naming structure has been followed - i.e. SAMPLE_CONDITION_REPLICATE_(1|2).fastq.gz. """ - from capcruncher.cli.reporters_differential import differential + from capcruncher.cli.interactions_deduplicate import deduplicate - differential(*args, **kwargs) + deduplicate(*args, **kwargs) @cli.command() @@ -108,7 +74,7 @@ def differential(*args, **kwargs): ) @click.option("--gzip", help="Compress output using gzip", default=False, is_flag=True) @click.option( - "--scale_factor", + "--scale-factor", help="Scale factor to use for bedgraph normalisation", default=1e6, type=click.INT, @@ -136,7 +102,7 @@ def pileup(*args, **kwargs): inter experiment comparisons and/or extract pilups binned into even genomic windows. """ - from capcruncher.cli.reporters_pileup import pileup + from capcruncher.cli.interactions_pileup import pileup pileup(*args, **kwargs) @@ -213,33 +179,34 @@ def count(*args, **kwargs): Options to ignore unwanted counts e.g. excluded regions or capture fragments are provided. In addition the number of reporter fragments can be subsampled if required. """ - + if kwargs.get("output_as_cooler"): if not kwargs.get("fragment_map"): - raise ValueError("Restriction fragment map must be provided for cooler output") + raise ValueError( + "Restriction fragment map must be provided for cooler output" + ) elif not kwargs.get("viewpoint_path"): raise ValueError("Viewpoint path must be provided for cooler output") - - from capcruncher.cli.reporters_count import count + from capcruncher.cli.interactions_count import count count(*args, **kwargs) -@cli.group() -def store(): - """ - Store reporter counts. +# @cli.group() +# def store(): +# """ +# Store reporter counts. - These commands store and manipulate reporter restriction fragment interaction - counts as cooler formated groups in HDF5 files. +# These commands store and manipulate reporter restriction fragment interaction +# counts as cooler formated groups in HDF5 files. - See subcommands for details. +# See subcommands for details. - """ +# """ -@store.command(name="fragments") +@cli.command(name="counts-to-cooler") @click.argument("counts", required=True) @click.option( "-f", @@ -282,12 +249,12 @@ def store_fragments(*args, **kwargs): "capcruncher reporters count" and gerates a cooler formatted group in an HDF5 File. See `https://cooler.readthedocs.io/en/latest/` for further details. """ - from capcruncher.cli.reporters_store import fragments + from capcruncher.cli.interactions_store import fragments fragments(*args, **kwargs) -@store.command(name="bins") +@cli.command(name="fragments-to-bins") @click.argument("cooler_path", required=True) @click.option( "-b", @@ -315,7 +282,7 @@ def store_fragments(*args, **kwargs): type=click.INT, ) @click.option( - "--scale_factor", + "--scale-factor", help="Scaling factor used for normalisation", default=1e6, type=click.INT, @@ -339,34 +306,38 @@ def store_bins(*args, **kwargs): genomic bins of a specified size. 
If the normalise option is selected, columns containing normalised counts are added to the pixels table of the output """ - from capcruncher.cli.reporters_store import bins + from capcruncher.cli.interactions_store import bins bins(*args, **kwargs) -@store.command(name="merge") +@cli.command(name="merge") @click.argument("coolers", required=True, nargs=-1) @click.option("-o", "--output", help="Output file name") def store_merge(*args, **kwargs): """ - Merges capcruncher cooler files together. + Merges capcruncher HDF5 files together. Produces a unified cooler with both restriction fragment and genomic bins whilst reducing the storage space required by hard linking the "bins" tables to prevent duplication. """ - from capcruncher.cli.reporters_store import merge + from capcruncher.api.storage import merge_coolers - merge(*args, **kwargs) + merge_coolers(*args, **kwargs) @cli.group() def compare(): - """ - Compare bedgraphs and CapCruncher cooler files. + r"""Compare bedgraphs and CapCruncher cooler files. + + These commands allow for specific viewpoints to be extracted from CapCruncher HDF5 files and perform: + + 1. User defined groupby aggregations. - These commands allow for specific viewpoints to be extracted from cooler files - and perform user defined groupby aggregations. + 2. Comparisons between conditions. + + 3. Identification of differential interactions between conditions. See subcommands for details. @@ -411,7 +382,7 @@ def compare(): ) def bedgraphs_concat(*args, **kwargs): - from capcruncher.cli.reporters_compare import concat + from capcruncher.cli.interactions_compare import concat concat(*args, **kwargs) @@ -441,6 +412,49 @@ def bedgraphs_concat(*args, **kwargs): @click.option("--suffix", help="Add a suffix before the file extension") def bedgraphs_summarise(*args, **kwargs): - from capcruncher.cli.reporters_compare import summarise + from capcruncher.cli.interactions_compare import summarise summarise(*args, **kwargs) + + +@compare.command(name="differential") +@click.argument("interaction_files", required=True, nargs=-1) +@click.option( + "-o", "--output-prefix", help="Output file prefix", default="differential" +) +@click.option("-v", "--viewpoint", help="Viewpoint to extract", required=True) +@click.option("-d", "--design-matrix", help="Design matrix file", required=True) +@click.option("-c", "--contrast", help="Contrast to test", default="condition") +@click.option( + "-r", + "--regions-of-interest", + help="Regions of interest to test for differential interactions", + default=None, +) +@click.option( + "--viewpoint-distance", + help="Distance from viewpoint to test for differential interactions", + default=None, + type=click.INT, +) +@click.option( + "--threshold-count", + help="Minimum number of interactions to test for differential interactions", + default=20, +) +@click.option( + "--threshold-q", + help="Minimum q-value to test for differential interactions", + default=0.05, +) +def bedgraphs_differential(*args, **kwargs): + """Perform differential testing on CapCruncher HDF5 files. + + This command performs differential testing on CapCruncher HDF5 files. It requires a design matrix + and a contrast to test. The design matrix should be a tab separated file with the first column + containing the sample names and the remaining columns containing the conditions. The contrast + should specify the name of the column in the design matrix to test. The output is a tab separated bedgraph. 
+ """ + from capcruncher.cli.interactions_differential import differential + + differential(*args, **kwargs) diff --git a/capcruncher/cli/cli_pipeline.py b/capcruncher/cli/cli_pipeline.py index 28f812be..a226c1e5 100644 --- a/capcruncher/cli/cli_pipeline.py +++ b/capcruncher/cli/cli_pipeline.py @@ -1,66 +1,98 @@ import os from capcruncher.cli import cli import click -from importlib import import_module, metadata +from importlib import metadata import subprocess import sys -import logging +import pathlib -@cli.command(context_settings=dict(ignore_unknown_options=True)) + +@cli.command(context_settings=dict(ignore_unknown_options=True), name="pipeline") @click.option("-h", "--help", "show_help", is_flag=True) @click.option("--version", "show_version", is_flag=True) -@click.version_option(metadata.version(distribution_name="capcruncher")) -@click.argument( - "mode", - type=click.Choice(["make", "run", "plot", "show", "clone", "touch", "report"]), +@click.option( + "--logo/--no-logo", + default=True, + help="Show the capcruncher logo", + show_default=True, ) +@click.version_option(metadata.version(distribution_name="capcruncher")) @click.argument("pipeline_options", nargs=-1, type=click.UNPROCESSED) -@click.option("-s", "--pipeline-statistics-path", help="Path to capcruncher_statistics directory", default='capcruncher_statistics') -@click.option("-o", "--pipeline-report-path", help="Output path for CapCruncher report", default='capcruncher_report.html') -def pipeline(mode, pipeline_options, show_help=False, show_version=False, **report_options): - +def pipeline(pipeline_options, show_help=False, show_version=False, logo=True): """Runs the data processing pipeline""" - fn = os.path.abspath(__file__) - dir_cli = os.path.dirname(fn) - dir_package = os.path.dirname(dir_cli) + fn = pathlib.Path(__file__).resolve() + dir_cli = fn.parent + dir_package = dir_cli.parent - if mode in ["make", "run", "plot", "show", "clone", "touch"]: + cmd = [ + "snakemake", + "-s", + str(dir_package / "pipeline/workflow/Snakefile"), + ] - cmd = [ - "python", - f"{dir_package}/pipeline/pipeline.py", - ] + if show_help: + # Run snakemake with --help + # Capture the output and replace usage: snakemake with usage: capcruncher pipeline + # Print the output + cmd.append("--help") + _completed = subprocess.run(cmd, capture_output=True, shell=False) + output = _completed.stdout.decode("utf-8") + output = output.replace("usage: snakemake", "usage: capcruncher pipeline") + click.echo(f"\n{output}") + sys.exit(0) - if show_help: - cmd.append("--help") - subprocess.run(cmd) - sys.exit() + if pipeline_options: + excluded_options = ["--version", "make", "run", "show"] - cmd.append(mode.replace("run", "make")) + cmd.extend( + [option for option in pipeline_options if option not in excluded_options] + ) - if pipeline_options: - cmd.extend(pipeline_options) + # Implicitly deal with a missing --cores option + if "--cores" not in pipeline_options and "-c" not in pipeline_options: + cmd.append("--cores 1") - # Implicitly deal with the missing --local option - if ( - not os.path.exists(os.environ.get("DRMAA_LIBRARY_PATH", "")) - and not "--local" in pipeline_options - ): - logging.warning( - "DRMAA_LIBRARY_PATH is incorrect. Implicitly using --local with 4 cores" - ) - cmd.append("--local") - cmd.append("-p 4") + if logo: + with open(dir_package / "data" / "logo.txt", "r") as f: + click.echo(f.read()) - completed = subprocess.run(cmd) - - if not completed.returncode == 0: - raise RuntimeError( - "CapCruncher pipeline failed. 
Check pipeline.log for details" - ) + _completed = subprocess.run(cmd) + if _completed.returncode != 0: + sys.exit(_completed.returncode) else: - from capcruncher.pipeline.make_report import generate_report - generate_report(**report_options) - \ No newline at end of file + # Touch all files to correct timestamps + subprocess.run( + [ + "snakemake", + "-s", + str(dir_package / "pipeline/workflow/Snakefile"), + "--touch", + "--cores", + "1", + ], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + + +@cli.command(name="pipeline-config") +@click.option("-h", "--help", "show_help", is_flag=True) +@click.option("--version", "show_version", is_flag=True) +@click.version_option(metadata.version(distribution_name="capcruncher")) +@click.option( + "-i", "--input", "input_files", type=click.Path(exists=True), multiple=True +) +@click.option("--generate-design", is_flag=True) +def pipeline_config(*args, **kwargs): + """Configures the data processing pipeline""" + + from cookiecutter.main import cookiecutter + import pathlib + + fn = pathlib.Path(__file__).resolve() + dir_cli = fn.parent + dir_package = dir_cli.parent + + cookiecutter(str(dir_package / "pipeline" / "config")) diff --git a/capcruncher/cli/cli_plot.py b/capcruncher/cli/cli_plot.py index 6695ecbd..742fc615 100644 --- a/capcruncher/cli/cli_plot.py +++ b/capcruncher/cli/cli_plot.py @@ -1,4 +1,5 @@ -import click +import click + @click.group() def cli(): @@ -8,29 +9,42 @@ def cli(): See subcommands for details """ + @cli.command() @click.argument("files", nargs=-1) -@click.option("-o", "--output_prefix", default="template", help='Output prefix for template file') -@click.option("-d", "--design_matrix", help='TSV file with the columns: sample condition') -@click.option("-v", "--viewpoint", help='Sets the template viewpoint') -@click.option("-b", "--binsize", help='Sets the template binsize') +@click.option( + "-o", "--output_prefix", default="template", help="Output prefix for template file" +) +@click.option( + "-d", "--design_matrix", help="TSV file with the columns: sample condition" +) +@click.option("-v", "--viewpoint", help="Sets the template viewpoint") +@click.option("-b", "--binsize", help="Sets the template binsize") def make_template(*args, **kwargs): """ Generates a template for the supplied files. This can be edited to customise the plot. """ from capcruncher.cli.plot import make_template + make_template(*args, **kwargs) + @cli.command() -@click.option('-r', '--region', required=True, help='Genomic coordinates of the region to plot') -@click.option("-c", "--config", required=True, help='Configuration file generated by the make_template command') -@click.option("-o", "--output", default='capcruncher_plot.png') -@click.option("--x-axis", is_flag=True) -@click.option("--no-scale-bar", is_flag=True) +@click.option( + "-r", "--region", required=True, help="Genomic coordinates of the region to plot" +) +@click.option( + "-t", + "--template", + required=True, + help="TOML file containing the template for the plot", +) +@click.option("-o", "--output", default="capcruncher_plot.png") def make_plot(*args, **kwargs): """ Generates a plot for the genomic region specified using the suplied configuration file. 
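A usage sketch for the two plotting commands above. The keyword names simply mirror the Click parameters shown in the decorators and are an assumption about the underlying make_template and plot functions; paths, viewpoint and binsize are placeholders.

from capcruncher.cli.plot import make_template, plot

# Build a plotting template from processed files, then render a region.
make_template(
    files=("sample_1.hdf5", "genes.bed"),
    output_prefix="template",
    design_matrix="design.tsv",
    viewpoint="viewpoint_1",
    binsize=5000,
)

plot(
    region="chr14:60000000-61000000",
    template="template.toml",
    output="capcruncher_plot.png",
)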
""" - from capcruncher.cli.plot import plot_reporters - plot_reporters(*args, **kwargs) + from capcruncher.cli.plot import plot + + plot(*args, **kwargs) diff --git a/capcruncher/cli/cli_utilities.py b/capcruncher/cli/cli_utilities.py index c4597776..32f7c9cd 100644 --- a/capcruncher/cli/cli_utilities.py +++ b/capcruncher/cli/cli_utilities.py @@ -1,34 +1,13 @@ -from email.policy import default import subprocess from tempfile import NamedTemporaryFile from typing import Iterable, List, Literal import click -from capcruncher.cli import UnsortedGroup -import ast import pandas as pd -from cgatcore.iotools import touch_file import os -import logging -import glob +from loguru import logger from capcruncher.utils import get_file_type - - -def strip_cmdline_args(args): - - formatted_args = dict() - for arg in args: - key, value = arg.split("=") - - if "\\t" in value: - formatted_args[key] = "\t" - else: - - try: - formatted_args[key] = ast.literal_eval(value) - except SyntaxError: - formatted_args[key] = value - - return formatted_args +import ibis +from ibis import _ @click.group() @@ -56,138 +35,103 @@ def gtf_to_bed12(gtf: str, output: str): w.write(gtf_line_to_bed12_line(df) + "\n") -@cli.command() -@click.argument("infiles", nargs=-1, required=True) -@click.option("-s", "--partition-size", default="2GB") -@click.option("-o", "--out-glob", required=True) -@click.option("-r", "--read-args", multiple=True) -@click.option("-w", "--write-args", multiple=True) -def repartition_csvs( - infiles: Iterable, - out_glob: str, - partition_size: str, - read_args: tuple = None, - write_args: tuple = None, -): - - import dask.dataframe as dd - - read_args = strip_cmdline_args(read_args) - write_args = strip_cmdline_args(write_args) - - ( - dd.read_csv(infiles, **read_args) - .repartition(partition_size=partition_size) - .to_csv(out_glob, **write_args) - ) - - @cli.command() @click.argument("slices") @click.option("-o", "--output", help="Output file name") -@click.option("-m", "--method", type=click.Choice(["capture", "tri", "tiled"])) -@click.option("--file-type", type=click.Choice(["parquet", "hdf5", "tsv"])) @click.option("--sample-name", help="Name of sample e.g. 
DOX_treated_1") @click.option( - "--read-type", - help="Type of read", - default="flashed", - type=click.Choice(["flashed", "pe"], case_sensitive=False), -) -@click.option( - "-p", - "--n_cores", - help="Number of parallel processes to use", - default=1, -) -@click.option( - "--memory-limit", - help="Maximum amount of memory to use.", - default="1G", + "--assay", + help="Assay used to generate slices", + type=click.Choice(["capture", "tri", "tiled"]), ) def cis_and_trans_stats( slices: str, output: str, - method: Literal["capture", "tri", "tiled"], - file_type: str = "hdf5", - sample_name: str = "", - read_type: str = "", - n_cores: int = 1, - memory_limit: str = "1G", + sample_name: str, + assay: Literal["capture", "tri", "tiled"] = "capture", ): + con = ibis.duckdb.connect() + + if not os.path.isdir(slices): + tbl = con.register(f"parquet://{slices}", table_name="slices_tbl") + else: + tbl = con.register(f"parquet://{slices}/*.parquet", table_name="slices_tbl") - from capcruncher.tools.filter import ( - CCSliceFilter, - TriCSliceFilter, - TiledCSliceFilter, + tbl = tbl.mutate(capture=tbl["capture"].fillna("reporter")).select( + ["capture", "parent_id", "chrom", "viewpoint", "pe"] ) - filters = { - "capture": CCSliceFilter, - "tri": TriCSliceFilter, - "tiled": TiledCSliceFilter, - } - slice_filterer = filters.get(method) - - if file_type == "tsv": - df_slices = pd.read_csv(slices, sep="\t") - - try: - slice_filterer( - df_slices, sample_name=sample_name, read_type=read_type - ).cis_or_trans_stats.to_csv(output, index=False) - except Exception as e: - logging.info(f"Exception: {e} occured with {slices}") - touch_file(output) - - elif file_type == "hdf5": - - slices_iterator = pd.read_hdf(slices, "slices", chunksize=1e6, iterator=True) - slice_stats = pd.DataFrame() - - for df in slices_iterator: - sf = slice_filterer(df, sample_name=sample_name, read_type=read_type) - stats = sf.cis_or_trans_stats - - slice_stats = ( - pd.concat([slice_stats, stats]) - .groupby(["viewpoint", "cis/trans", "sample", "read_type"]) - .sum() - .reset_index() - ) + if assay in ["capture", "tri"]: + tbl_reporter = tbl[(tbl["capture"] == "reporter")].drop( + "viewpoint", "pe", "capture" + ) - slice_stats.to_csv(output, index=False) + tbl_capture = tbl[~(tbl["capture"] == "reporter")] - elif file_type == "parquet": + tbl_merge = tbl_capture.join( + tbl_reporter, + predicates=[ + "parent_id", + ], + lname="{name}_capture", + rname="{name}_reporter", + how="left", + ) - import dask.dataframe as dd - import dask.distributed + tbl_merge = tbl_merge.mutate( + is_cis=(tbl_merge["chrom_capture"] == tbl_merge["chrom_reporter"]) + ) - with dask.distributed.Client( - n_workers=n_cores, - dashboard_address=None, - processes=True, - scheduler_port=0, - local_directory=os.environ.get("TMPDIR", "/tmp/"), - memory_limit=memory_limit, - ) as client: + df_cis_and_trans = ( + tbl_merge.group_by(["viewpoint", "is_cis", "pe"]) + .aggregate( + count=_.count(), + ) + .execute(limit=None) + ) - ddf = dd.read_parquet(slices, engine="pyarrow") + else: + viewpoint_chroms = ( + tbl.filter(tbl["capture"] != "reporter") + .group_by(["viewpoint", "chrom"]) + .aggregate(chrom_count=_.count()) + .order_by(ibis.desc("chrom_count")) + .to_pandas() + .drop_duplicates("viewpoint") + .set_index("viewpoint") + .to_dict()["chrom"] + ) - ddf_cis_trans_stats = ddf.map_partitions( - lambda df: slice_filterer( - df, sample_name=sample_name, read_type=read_type - ).cis_or_trans_stats - ) + chrom_mapping_exp = ibis.case() + for k, v in 
viewpoint_chroms.items(): + chrom_mapping_exp = chrom_mapping_exp.when(tbl.viewpoint == k, v) + chrom_mapping_exp = chrom_mapping_exp.end() + + df_cis_and_trans = ( + tbl.mutate(cis_chrom=chrom_mapping_exp) + .mutate(is_cis=_.chrom == _.cis_chrom) + .group_by(["viewpoint", "parent_id", "is_cis", "pe"]) + .aggregate(count=_.count()) + .group_by(["viewpoint", "is_cis", "pe"]) + .aggregate(count=_["count"].sum()) + .execute(limit=None) + ) - ddf_cis_trans_stats_summary = ( - ddf_cis_trans_stats.groupby( - ["viewpoint", "cis/trans", "sample", "read_type"] + df_cis_and_trans = ( + df_cis_and_trans.rename(columns={"pe": "read_type", "is_cis": "cis/trans"}) + .assign( + sample=sample_name, + **{ + "cis/trans": lambda df: df["cis/trans"].map( + {True: "cis", False: "trans"} ) - .sum() - .reset_index() - ) - ddf_cis_trans_stats_summary.to_csv(output, index=False, single_file=True) + }, + ) + .loc[lambda df: ~df["viewpoint"].isna()] + .sort_values(["viewpoint", "read_type", "cis/trans"]) + ) + + df_cis_and_trans.to_csv(output, index=False) @cli.command() @@ -215,7 +159,6 @@ def merge_capcruncher_slices( category_cols: List[str] = None, n_cores: int = 1, ): - import dask.dataframe as dd import dask.distributed @@ -225,8 +168,7 @@ def merge_capcruncher_slices( processes=True, scheduler_port=0, local_directory=os.environ.get("TMPDIR", "/tmp/"), - ) as client: - + ) as _client: storage_kwargs = {} output_format = get_file_type(outfile) @@ -273,8 +215,6 @@ def merge_capcruncher_slices( ) elif output_format == "parquet": - - import pyarrow as pa import pyarrow.dataset as ds datasets = ds.dataset([ds.dataset(fn) for fn in infiles]) @@ -313,7 +253,6 @@ def viewpoint_coordinates( recognition_site: str = "dpnii", output: os.PathLike = "viewpoint_coordinates.bed", ): - # import dask.distributed import concurrent.futures from capcruncher.cli import genome_digest @@ -328,7 +267,6 @@ def viewpoint_coordinates( viewpoints_aligned_bam = NamedTemporaryFile("r+") with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor: - # Digest genome to find restriction fragments digestion = executor.submit( genome_digest.digest, diff --git a/capcruncher/cli/fastq_deduplicate.py b/capcruncher/cli/fastq_deduplicate.py index 976f3bbb..5eca3bcc 100644 --- a/capcruncher/cli/fastq_deduplicate.py +++ b/capcruncher/cli/fastq_deduplicate.py @@ -6,21 +6,16 @@ """ import multiprocessing import os -import sys -from collections import Counter -from multiprocessing import SimpleQueue -from typing import List, Tuple, Union - -import click +from typing import Tuple import numpy as np import pandas as pd import tqdm -from capcruncher.tools.deduplicate import (ReadDeduplicationParserProcess, - ReadDuplicateRemovalProcess) -from capcruncher.tools.io import FastqReaderProcess, FastqWriterProcess -from capcruncher.tools.statistics import DeduplicationStatistics +from capcruncher.api.deduplicate import ( + ReadDeduplicationParserProcess, + ReadDuplicateRemovalProcess, +) +from capcruncher.api.io import FastqReaderProcess, FastqWriterProcess from capcruncher.utils import get_file_type, load_dict, save_dict -from xopen import xopen def parse(input_files: Tuple, output: os.PathLike = "out.json", read_buffer: int = 1e5): @@ -88,7 +83,7 @@ def identify(input_files: Tuple, output: os.PathLike = "duplicates.json"): for name_hash, sequence_hash in fastq_hashed_dict.items(): - if not sequence_hash in sequences_dedup: + if sequence_hash not in sequences_dedup: sequences_dedup.add(sequence_hash) else: reads_duplicated.add(name_hash) @@ -184,7 
+179,6 @@ def remove( writer.start() reader.join() - for _ in range(n_cores): readq.put_nowait(None) diff --git a/capcruncher/cli/fastq_digest.py b/capcruncher/cli/fastq_digest.py index 3fe3abf4..c03ae616 100644 --- a/capcruncher/cli/fastq_digest.py +++ b/capcruncher/cli/fastq_digest.py @@ -1,13 +1,13 @@ -from builtins import breakpoint import multiprocessing import os from typing import Tuple from multiprocessing import Queue -from capcruncher.tools.digest import ReadDigestionProcess -from capcruncher.tools.io import FastqReaderProcess, FastqWriterProcess -from capcruncher.utils import get_re_site +from capcruncher.api.digest import ReadDigestionProcess +from capcruncher.api.io import FastqReaderProcess, FastqWriterProcess +from capcruncher.api.digest import get_re_site import pandas as pd + def digest( input_fastq: Tuple, restriction_enzyme: str, @@ -77,7 +77,7 @@ def digest( allow_undigested=True, stats_pipe=stats_pipe[0], ) - + else: raise ValueError("Wrong number of files specified. Flashed == 1, PE == 2") @@ -148,12 +148,11 @@ def digest( ["n_slices", "count"] ) - #breakpoint() + # breakpoint() # Read summary - reads that pass the filter df_stats = ( - df_hist - .query("(n_slices >= 1) or (filtered == False) or (read_type == 'pe')") + df_hist.query("(n_slices >= 1) or (filtered == False) or (read_type == 'pe')") .query("read_number < 2") .groupby(["sample", "read_type", "read_number", "filtered"])["count"] .sum() diff --git a/capcruncher/cli/fastq_split.py b/capcruncher/cli/fastq_split.py index 0799e440..0655050d 100644 --- a/capcruncher/cli/fastq_split.py +++ b/capcruncher/cli/fastq_split.py @@ -8,8 +8,7 @@ Script splits a fastq into specified chunks """ -import itertools -import logging +from loguru import logger from multiprocessing import SimpleQueue from typing import Tuple import subprocess @@ -17,44 +16,56 @@ import os import re from joblib import Parallel, delayed +from typing import Literal + + +def run_unix_split( + fn: os.PathLike, + n_reads: int, + read_number: int, + output_prefix: os.PathLike = "", + gzip: bool = False, + n_cores=1, + suffix: str = "", + **kwargs, +): + statement = [] -def run_unix_split(fn: os.PathLike, - n_reads: int, - read_number: int, - output_prefix: os.PathLike = '', - gzip: bool = False, - compression_level: int = 5, - n_cores=1): + if suffix: + split_suffix = f"{suffix}_{read_number}.fastq" + else: + split_suffix = f"_{read_number}.fastq" - statement = [] + cmd = f"""zcat {fn} | split FILTER -l {n_reads * 4} -d --additional-suffix={split_suffix} - {output_prefix}_part;""" - cmd = f"""zcat {fn} | split FILTER -l {n_reads * 4} -d --additional-suffix=_{read_number}.fastq - {output_prefix}_part;""" - - if not ".gz" in fn: + if ".gz" not in fn: cmd = cmd.replace("zcat", "cat") - + if gzip: cmd = cmd.replace("FILTER", f"--filter='pigz -p {n_cores} > $FILE.gz'") else: cmd = cmd.replace("FILTER", "") - + statement.append(cmd) - logging.info(f"Running: {cmd}") - subprocess.run(' '.join(statement), shell=True) + logger.info(f"Running: {cmd}") + subprocess.run(" ".join(statement), shell=True) def split( input_files: Tuple, - method: str = "unix", + method: Literal["python", "unix", "seqkit"] = "unix", + split_type: Literal["n-reads", "n-parts"] = "n-reads", output_prefix: os.PathLike = "split", compression_level: int = 5, n_reads: int = 1000000, + n_parts: int = 1, + suffix: str = "", gzip: bool = True, n_cores: int = 1, ): - """ + """ Splits fastq file(s) into equal chunks of n reads. Will now need "," between files of the same read. 
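To make the string templating in run_unix_split above concrete, a call such as the following (placeholder paths) builds roughly this shell pipeline:

from capcruncher.cli.fastq_split import run_unix_split

# Split read 1 of a gzipped fastq into chunks of 1e6 reads, recompressing with pigz.
run_unix_split(
    "sample_R1.fastq.gz",
    n_reads=1_000_000,
    read_number=1,
    output_prefix="sample",
    gzip=True,
    n_cores=4,
)
# Approximate command executed:
#   zcat sample_R1.fastq.gz | split --filter='pigz -p 4 > $FILE.gz' \
#       -l 4000000 -d --additional-suffix=_1.fastq - sample_part;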
@@ -67,16 +78,16 @@ def split( compression_level (int, optional): Compression level for gzipped output. Defaults to 5. n_reads (int, optional): Number of reads to split the input fastq files into. Defaults to 1000000. gzip (bool, optional): Gzip compress output files if True. Defaults to True. - - """ - from capcruncher.tools.io import ( - FastqReaderProcess, - FastqWriterSplitterProcess, - FastqReadFormatterProcess,) + """ + from capcruncher.api.io import ( + FastqReaderProcess, + FastqWriterSplitterProcess, + FastqReadFormatterProcess, + ) - if method == "python": + if split_type == "n-reads" and method == "python": readq = SimpleQueue() writeq = SimpleQueue() @@ -111,32 +122,42 @@ def split( proc.join() proc.terminate() - elif method == "unix": # Using unix split to perform the splitting + elif ( + split_type == "n-reads" and method == "unix" + ): # Using unix split to perform the splitting tasks = [] n_cores_per_task = (n_cores // 2) if (n_cores // 2) > 1 else 1 - if "," in input_files[0]: # Allows for specifying multiple files + if "," in input_files[0]: # Allows for specifying multiple files input_files = [fnames.replace(",", " ") for fnames in input_files] for ii, fn in enumerate(input_files): - t = delayed(run_unix_split)(fn, - n_reads=n_reads, - read_number=ii+1, - gzip=gzip, - compression_level=compression_level, - output_prefix=output_prefix, - n_cores=n_cores_per_task) - + t = delayed(run_unix_split)( + fn, + n_reads=n_reads, + read_number=ii + 1, + gzip=gzip, + compression_level=compression_level, + output_prefix=output_prefix, + n_cores=n_cores_per_task, + suffix=suffix, + ) + tasks.append(t) - + # Run splitting Parallel(n_jobs=2 if n_cores > 1 else 1)(tasks) - # The suffixes are in the format 00, 01, 02 etc need to replace with int for fn in glob.glob(f"{output_prefix}_part*"): src = fn - part_no = int(re.match(r"(?:.*)_part(\d+)_[1|2].*", fn).group(1)) + part_no = int( + re.match(r"(?:.*)_part(\d+)_.*([1|2])?.fastq(.gz)?", fn).group(1) + ) dest = re.sub(r"_part\d+_", f"_part{part_no}_", src) os.rename(src, dest) + + # elif split_type == "n-reads" and method == "seqkit": + + # cmd = ["seqkit", "split2", "-1", input] diff --git a/capcruncher/cli/genome_digest.py b/capcruncher/cli/genome_digest.py index b0f15767..185183f8 100644 --- a/capcruncher/cli/genome_digest.py +++ b/capcruncher/cli/genome_digest.py @@ -9,11 +9,10 @@ """ import pysam import xopen -from capcruncher.tools.digest import DigestedChrom -from capcruncher.utils import get_re_site +from capcruncher.api.digest import DigestedChrom, get_re_site from typing import Iterator import os -import logging +from loguru import logger import pandas as pd @@ -67,7 +66,7 @@ def digest( for chrom in parse_chromosomes(input_fasta): - logging.info(f"Processing chrom {chrom.name}") + logger.info(f"Processing chrom {chrom.name}") digested_chrom = DigestedChrom( chrom, @@ -78,7 +77,7 @@ def digest( for n_fragments, fragment in enumerate(digested_chrom.fragments): if n_fragments % 10000 == 0: - logging.info(f"Written {n_fragments} fragments") + logger.info(f"Written {n_fragments} fragments") output.write(fragment) @@ -86,7 +85,7 @@ def digest( fragment_number += n_fragments + 1 if sort: - logging.info("Sorting output") + logger.info("Sorting output") df = pd.read_csv(output_file, sep="\t", names=["chrom", "start", "end", "name"]) # If changing the order, also need to change the fragment number diff --git a/capcruncher/cli/reporters_compare.py b/capcruncher/cli/interactions_compare.py similarity index 84% rename from 
capcruncher/cli/reporters_compare.py rename to capcruncher/cli/interactions_compare.py index 5c7b9c6a..bc37ba8b 100644 --- a/capcruncher/cli/reporters_compare.py +++ b/capcruncher/cli/interactions_compare.py @@ -1,18 +1,15 @@ import itertools -import logging +from loguru import logger import os import re -import sys -from typing import Literal, Tuple +from typing import Literal, Tuple, List, Union, Dict import cooler -import numpy as np import pandas as pd -from capcruncher.tools.pileup import CoolerBedGraph +from capcruncher.api.pileup import CoolerBedGraph from capcruncher.utils import get_cooler_uri from joblib import Parallel, delayed from pybedtools import BedTool -from collections import defaultdict def get_bedgraph_name_from_cooler(cooler_filename): @@ -41,9 +38,10 @@ def concat( if not viewpoint: viewpoints = [vp.strip("/") for vp in cooler.fileops.list_coolers(infiles[0])] else: - viewpoints = [viewpoint, ] + viewpoints = [ + viewpoint, + ] - union_by_viewpoint = dict() for viewpoint in viewpoints: @@ -56,8 +54,12 @@ def concat( delayed( lambda uri: ( get_bedgraph_name_from_cooler(uri), - CoolerBedGraph(uri, region_to_limit=region if region else None) - .extract_bedgraph(normalisation=normalisation, **norm_kwargs) + CoolerBedGraph( + uri, region_to_limit=region if region else None + ) + .extract_bedgraph( + normalisation=normalisation, **norm_kwargs + ) .pipe(BedTool.from_dataframe), ) )(uri) @@ -85,7 +87,7 @@ def concat( union.to_csv(output, sep="\t", index=False) union_by_viewpoint[viewpoint] = union - + return union_by_viewpoint @@ -109,7 +111,12 @@ def get_summary_functions(methods): return summary_functions -def get_groups(columns, group_names, group_columns): +def get_groups( + columns: Union[pd.Index, list], + group_names: List[str], + group_columns: List[Union[str, int]], +) -> Dict[str, str]: + """Extracts groups from group_columns and returns a dictionary of column names to group names.""" groups = dict() @@ -119,7 +126,7 @@ def get_groups(columns, group_names, group_columns): try: col = int(col) col_name = columns[col] - except Exception as e: + except Exception: col_name = col groups[col_name] = group_name @@ -127,12 +134,6 @@ def get_groups(columns, group_names, group_columns): return groups -# def get_grouped_dataframe( -# df: pd.DataFrame, group_name: str, groups: dict, agg_func: function -# ): -# return df.loc[:, groups[group_name]].pipe(agg_func, axis=1) - - def summarise( infile: os.PathLike, output_prefix: os.PathLike = None, @@ -144,23 +145,23 @@ def summarise( subtraction: bool = False, ): - logging.info(f"Reading {infile}") + logger.info(f"Reading {infile}") df_union = pd.read_csv(infile, sep="\t") df_counts = df_union.iloc[:, 3:] summary_functions = get_summary_functions(summary_methods) - logging.info("Identifying groups") + logger.info("Identifying groups") groups = ( get_groups(df_counts.columns, group_names, group_columns) if group_names else {col: "summary" for col in df_counts.columns} ) # Use all columns if no groups provided - logging.info(f"Extracted groups: {groups}") + logger.info(f"Extracted groups: {groups}") # Perform all groupby aggregations. 
- logging.info(f"Performing all aggregations: {summary_methods}") + logger.info(f"Performing all aggregations: {summary_methods}") df_agg = ( df_union.iloc[:, 3:] .transpose() # Transpose to enable groupby funcs @@ -175,14 +176,15 @@ def summarise( ) # Write out groupby aggregations - logging.info("Writing aggregations") + logger.info("Writing aggregations") + for group in group_names: if output_format == "bedgraph": df_output = df_agg[["chrom", "start", "end", group, "aggregation"]] - + for aggregation, df in df_output.groupby("aggregation"): - logging.info(f"Writing {group} {aggregation}") + logger.info(f"Writing {group} {aggregation}") df.drop(columns="aggregation").to_csv( f"{output_prefix}{group}.{aggregation}-summary{suffix}.bedgraph", sep="\t", @@ -194,7 +196,7 @@ def summarise( df_output.to_csv(f"{output_prefix}{group}{suffix}.tsv") # Perform permutations - logging.info("Performing subtractions") + logger.info("Performing subtractions") if subtraction: subtractions_performed = list() for group_a, group_b in itertools.permutations(group_names, 2): @@ -205,7 +207,7 @@ def summarise( if output_format == "bedgraph": for sub in subtractions_performed: - logging.info(f"Writing {output_prefix} {sub} {aggregation}") + logger.info(f"Writing {output_prefix} {sub} {aggregation}") df_output = df_agg[["chrom", "start", "end", sub, "aggregation"]] for aggregation, df in df_output.groupby("aggregation"): df.drop(columns="aggregation").to_csv( diff --git a/capcruncher/cli/interactions_count.py b/capcruncher/cli/interactions_count.py new file mode 100644 index 00000000..916241b0 --- /dev/null +++ b/capcruncher/cli/interactions_count.py @@ -0,0 +1,165 @@ +import os +from loguru import logger +import tempfile +import glob +import more_itertools +import multiprocessing + +from capcruncher.api.io import ( + CCParquetReaderProcess, + FragmentCountingProcess, + CCCountsWriterProcess, +) +from capcruncher.api.storage import merge_coolers + + +def get_number_of_reader_threads(n_cores): + threads = n_cores // 2 + + if 1 < threads < 4: + threads = threads + elif threads < 1: + threads = 1 + + return threads + + +def count( + reporters: os.PathLike, + output: os.PathLike = "counts.tsv", + file_type: str = "auto", + remove_exclusions: bool = False, + remove_capture: bool = False, + subsample: int = 0, + low_memory: bool = False, + chunksize: int = 2e6, + output_as_cooler: bool = False, + fragment_map: os.PathLike = None, + viewpoint_path: os.PathLike = None, + n_cores: int = 1, +): + """ + Determines the number of captured restriction fragment interactions genome wide. + + Parses a reporter slices tsv and counts the number of unique restriction fragment + interaction combinations that occur within each fragment. + + Options to ignore unwanted counts e.g. excluded regions or capture fragments are provided. + In addition the number of reporter fragments can be subsampled if required. + + \f + Args: + reporters (os.PathLike): Reporter tsv file path. + output (os.PathLike, optional): Output file path for interaction counts tsv. Defaults to 'counts.tsv'. + remove_exclusions (bool, optional): Removes regions marked as capture proximity exclusions before counting. Defaults to False. + remove_capture (bool, optional): Removes all capture fragments before counting. Defaults to False. + subsample (int, optional): Subsamples the fragments by the specified fraction. Defaults to 0 i.e. No subsampling. 
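As a usage sketch for the count function defined in this new file (placeholder file paths), counting reporters from a deduplicated parquet dataset into a cooler-formatted HDF5 file looks roughly like:

from capcruncher.cli.interactions_count import count

count(
    reporters="deduplicated_slices/",      # partitioned parquet dataset of reporter slices
    output="sample_1.hdf5",
    output_as_cooler=True,
    fragment_map="genome.digest.bed.gz",   # restriction fragment map (placeholder path)
    viewpoint_path="viewpoints.bed",       # capture viewpoints (placeholder path)
    n_cores=4,
)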
+ """ + + import pyarrow + import pandas as pd + + logger.info(f"Examining viewpoints from parquet file: {reporters}") + # Unsure of the best way to do this. Will just load the first partion vp column and extract + + df = pd.read_parquet(reporters, engine="pyarrow", columns=["viewpoint"]) + viewpoints = df["viewpoint"].cat.categories.to_list() + viewpoint_sizes = df["viewpoint"].value_counts() + + logger.info(f"Number of viewpoints: {len(viewpoints)}") + logger.info(f"Number of slices per viewpoint: {viewpoint_sizes.to_dict()}") + + MAX_SLICE_NUMBER = 5e6 + + if ( + viewpoint_sizes[viewpoint_sizes > MAX_SLICE_NUMBER].shape[0] == 0 + ): # Less than MAX_SLICE_NUMBER slices, read in batch + mode = "batch" + else: + # More than MAX_SLICE_NUMBER slices, read by partition + mode = "partition" + + # Multiprocessing queues + viewpoints_queue = multiprocessing.Queue() + slices_queue = multiprocessing.Queue() + counts_queue = multiprocessing.Queue() + + reader = CCParquetReaderProcess( + path=reporters, + inq=viewpoints_queue, + outq=slices_queue, + selection_mode=mode, + ) + + # Multiprocessing set-up + n_reading_threads = get_number_of_reader_threads(n_cores=n_cores) + pyarrow.set_cpu_count(n_reading_threads) + + n_worker_processes = (n_cores - n_reading_threads) // 2 + n_counting_processes = n_worker_processes if n_worker_processes > 1 else 1 + n_writing_processes = n_counting_processes + + logger.info(f"Starting {n_reading_threads} reader threads") + + counters = [ + FragmentCountingProcess( + inq=slices_queue, + outq=counts_queue, + remove_capture=remove_capture, + remove_exclusions=remove_exclusions, + subsample=subsample, + ) + for _ in range(n_counting_processes) + ] + + tmpdir = tempfile.TemporaryDirectory() + writers = [ + CCCountsWriterProcess( + inq=counts_queue, + output_format="cooler" if output_as_cooler else file_type, + restriction_fragment_map=fragment_map, + viewpoint_path=viewpoint_path, + tmpdir=tmpdir.name, + ) + for i in range(n_writing_processes) + ] + + # Start all processes + processes = [*writers, *counters, reader] + for process in processes: + process.start() + + for vp_chunk in more_itertools.chunked(viewpoints, 10): + viewpoints_queue.put(vp_chunk) + + logger.info("Finished loading viewpoints") + viewpoints_queue.put(None) # Sentinel value to signal end of viewpoints + reader.join() + + # End the counting inqueue + for _ in range(n_cores): + slices_queue.put((None, None)) + + # Join the counters + for counter in counters: + counter.join() + + # End the counting queue + for _ in range(n_counting_processes): + counts_queue.put((None, None)) + + logger.info("Finished counting") + + # Join the writers + for writer in writers: + writer.join() + + logger.info("Finished writing") + + # Merge the output files together + # TODO: Allow other than cooler outputs + logger.info(f"Making final cooler at {output}") + output_files = glob.glob(os.path.join(tmpdir.name, "*.hdf5")) + merge_coolers(output_files, output=output) + + tmpdir.cleanup() diff --git a/capcruncher/cli/interactions_deduplicate.py b/capcruncher/cli/interactions_deduplicate.py new file mode 100644 index 00000000..016842fe --- /dev/null +++ b/capcruncher/cli/interactions_deduplicate.py @@ -0,0 +1,314 @@ +import pandas as pd +import os +import ibis +from ibis import _ +import pyarrow.dataset as ds +import shutil +from loguru import logger + +ibis.options.interactive = False + + +def deduplicate( + slices: os.PathLike, + output: os.PathLike, + read_type: str = "flashed", + sample_name: str = "", + stats_prefix: 
os.PathLike = "", +): + logger.info("Connecting to DuckDB") + con = ibis.duckdb.connect() + + if not os.path.isdir(slices): + slices_tbl_raw = con.register(f"parquet://{slices}", table_name="slices_tbl") + else: + slices_tbl_raw = con.register( + f"parquet://{slices}/*.parquet", table_name="slices_tbl" + ) + + if read_type == "pe": + logger.info("Read type is PE") + logger.info("Identifying unique fragment IDs") + query = ( + slices_tbl_raw[["chrom", "start", "end", "parent_id"]] + .order_by(["chrom", "start", "end", "parent_id"]) + .group_by(by="parent_id", order_by=["chrom", "start", "end"]) + .order_by(["chrom", "start", "end", "parent_id"]) + .mutate( + slice_f_chrom=_.chrom.first(), + slice_f_start=_.start.first(), + slice_l_end=_.end.last(), + ) + .group_by(["slice_f_chrom", "slice_f_start", "slice_l_end"]) + .mutate(pid=_.parent_id.first())[["pid"]] + .distinct()["pid"] + ) + elif read_type == "flashed": + logger.info("Read type is Flashed") + logger.info("Identifying unique fragment IDs") + + query = ( + slices_tbl_raw[["coordinates", "parent_id"]] + .group_by(by="parent_id", order_by=["coordinates"]) + .aggregate(coordinates=lambda t: t.coordinates.group_concat(",")) + .group_by("coordinates") + .mutate(parent_id_unique=_.parent_id.first())[["parent_id_unique"]] + .distinct()["parent_id_unique"] + ) + + parent_ids_unique = query.execute(limit=None) + + logger.info("Writing deduplicated slices to disk") + slices_unfiltered_ds = ds.dataset(slices, format="parquet") + scanner = slices_unfiltered_ds.scanner( + filter=ds.field("parent_id").isin(parent_ids_unique) + ) + + if os.path.exists(output): + shutil.rmtree(output) + + ds.write_dataset( + scanner, + output, + format="parquet", + partitioning_flavor="hive", + min_rows_per_group=0, + ) + + # If the output directory is empty, create a dummy file to prevent downstream errors + if not os.path.exists(output): + os.makedirs(output) + df_dummy = scanner.to_table().to_pandas() + df_dummy.to_parquet(f"{output}/dummy.parquet") + + logger.info("Calculating deduplication stats") + # Calculate the number of slices in the input + + n_reads_total = ( + slices_tbl_raw.group_by("parent_id") + .agg([_.count().name("count")])["count"] + .sum() + .execute(limit=None) + ) + + # Calculate the number of slices in the output + n_reads_unique = parent_ids_unique.shape[0] + + # Prepare stats + df_stats = pd.DataFrame() + df_stats["stat_type"] = ["not-deduplicated", "deduplicated"] + df_stats["stat"] = [n_reads_total, n_reads_unique] + df_stats["sample"] = sample_name + df_stats["read_type"] = read_type + df_stats["read_number"] = 0 + df_stats["stage"] = "deduplicate_slices" + df_stats["stat"] = df_stats["stat"].fillna(0) + + logger.info("Deduplication stats:") + logger.info(f"\n{df_stats.to_string()}") + + logger.info("Writing deduplication stats to disk") + df_stats.to_csv(f"{stats_prefix}.read.stats.csv", index=False) + + return df_stats + + +# def identify( +# fragments: tuple, +# output: os.PathLike = "duplicated_ids.json", +# viewpoint: str = "", +# buffer: int = 1e6, +# read_type: str = "flashed", +# n_cores: int = 1, +# memory_limit: str = "1G", +# ): +# """ +# Identifies aligned fragments with duplicate coordinates. + +# Parses a tsv file containing filtered aligned fragments and generates a dictionary containing +# the hashed parental read id and hashed genomic coordinates of all slices. Duplicated fragments +# are implicitly removed if they share the same genomic coordinate hash. 
+ +# For non-combined reads (pe) a genomic coordinate hash is generated from the start of the first slice and +# the end of the last slice. This is due to the decreased confidence in the quality of the centre of the fragment. +# The coordinate hash for combined reads (flashed) is generated directly from the fragment coordinates. Only +# fragments with the exact coordinates and slice order will be considered to be duplicates. + +# Identified duplicate fragments are output in json format to be used by the "remove" subcommand. + +# \f +# Args: +# fragments (os.PathLike): Input fragments parquet dataset to identify duplicates. +# output (os.PathLike, optional): Output path to output duplicated parental read ids. Defaults to "duplicated_ids.json". +# buffer (int, optional): Number of fragments to process in memory. Defaults to 1e6. +# read_type (str, optional): Process combined(flashed) or non-combined reads (pe). +# Due to the low confidence in the quaility of pe reads, duplicates are identified by +# removing any fragments with matching start and end coordinates. +# Defaults to "flashed". + +# """ + +# con = ibis.duckdb.connect() +# data = con.register(f"parquet://{fragments}/*", table_name="fragments_tbl") +# tbl = con.table("fragments_tbl") +# t = (tbl +# [["chrom", "start", "end", "parent_id"]] +# .order_by(["chrom", "start", "end", "parent_id"]) +# .groupby(by="parent_id", order_by=["chrom", "start", "end"]) +# .mutate(slice_f_chrom=_.chrom.first(), +# slice_f_start=_.start.first(), +# slice_l_end=_.end.last()) +# .groupby(["slice_f_chrom", "slice_f_start", "slice_l_end"]) +# .mutate(pid=_.parent_id.first()) +# ["pid"] +# ) + + +# # if input_file_type in ["hdf5", "parquet"]: +# # # Will use dask for these +# # cluster = dask.distributed.LocalCluster( +# # n_workers=n_cores, +# # threads_per_worker=1, +# # dashboard_address=None, +# # processes=True, +# # scheduler_port=0, +# # local_directory=os.environ.get("TMPDIR", "/tmp/"), +# # memory_limit=memory_limit, +# # ) + +# # with dask.distributed.Client(cluster) as client: + +# # # Extract duplicated fragment ids +# # if input_file_type == "tsv": +# # if len(fragments) > 1: +# # raise NotImplementedError("Currently just supports a single tsv input") +# # else: +# # duplicated_fragments = identify_coordinate_duplicates_from_tsv( +# # fragments[0], read_type=read_type, buffer=buffer +# # ) +# # elif input_file_type == "hdf5": +# # outfile = f"{output.replace('.hdf5', '')}.hdf5" +# # if os.path.exists(outfile): +# # os.remove(outfile) + +# # duplicated_fragments = identify_coordinate_duplicates_from_hdf5( +# # fragments, read_type=read_type +# # ) + +# # elif input_file_type == "parquet": +# # duplicated_fragments = identify_coordinate_duplicates_from_parquet( +# # fragments, read_type=read_type +# # ) + +# # else: +# # raise ValueError(f"Input file type {file_type} not supported") + +# # # Output +# # if output_file_type == "json": +# # with xopen.xopen(output, "w") as w: +# # ujson.dump(dict.fromkeys(duplicated_fragments), w) + +# # elif output_file_type == "hdf5": + +# # duplicated_fragments.to_hdf( +# # outfile, f"/duplicated_ids", min_itemsize={"id": 25}, index=False +# # ) + +# # elif output_file_type == "pickle": +# # duplicated_fragments = duplicated_fragments.compute() +# # duplicated_fragments.to_pickle(output) + + +# def remove( +# slices: os.PathLike, +# duplicated_ids: os.PathLike, +# output: os.PathLike = "dedup.slices.tsv.gz", +# buffer: int = 5e6, +# sample_name: str = "", +# read_type: str = "", +# stats_prefix: os.PathLike = 
"", +# file_type: str = "hdf5", +# n_cores: int = 1, +# memory_limit: str = "1G", +# ): +# """ +# Removes duplicated aligned fragments. + +# Parses a tsv file containing aligned read slices and outputs only slices from unique fragments. +# Duplicated parental read id determined by the "identify" subcommand are located within the +# slices tsv file and removed. + +# Outputs statistics for the number of unique slices and the number of duplicate slices identified. + +# \f +# Args: +# slices_fn (os.PathLike): Input slices.tsv file. +# duplicated_ids (os.PathLike): Duplicated parental read ids in json format. +# output (os.PathLike, optional): Output file path for deduplicated slices. Defaults to "dedup.slices.tsv.gz". +# buffer (int, optional): Number of slices to process in memory. Defaults to 1e6. +# sample_name (str, optional): Name of sample being processed e.g. DOX-treated_1 used for statistics. Defaults to "". +# read_type (str, optional): Process combined(flashed) or non-combined reads (pe) used for statistics. Defaults to "". +# stats_prefix (os.PathLike, optional): Output path for deduplication statistics. Defaults to "". +# """ + +# input_file_type = ( +# get_file_type(slices if isinstance(slices, str) else slices[0]) +# if file_type == "auto" +# else file_type +# ) +# output_file_type = get_file_type(output) + +# duplicates = read_duplicated_ids(duplicated_ids) + +# if input_file_type in ["hdf5", "parquet"]: +# # Will use dask for these +# cluster = dask.distributed.LocalCluster( +# n_workers=n_cores, +# threads_per_worker=1, +# dashboard_address=None, +# processes=True, +# scheduler_port=0, +# local_directory=os.environ.get("TMPDIR", "/tmp/"), +# memory_limit=memory_limit, +# ) +# with dask.distributed.Client(cluster) as client: + +# if input_file_type == "tsv": + +# if not output_file_type == "tsv": +# raise NotImplementedError("Currently only tsv -> tsv output supported") + +# n_reads_total, n_reads_unique = remove_duplicates_from_tsv( +# slices, output, duplicates, buffer=buffer +# ) + +# elif input_file_type == "hdf5": + +# if not output_file_type == "hdf5": +# raise NotImplementedError("Currently only hdf5 -> hdf5 output supported") + +# n_reads_total, n_reads_unique = remove_duplicates_from_hdf5( +# slices, duplicates, output +# ) + +# elif input_file_type == "parquet": +# if not output_file_type == "parquet": +# raise NotImplementedError( +# "Currently only parquet -> parquet output supported" +# ) + +# n_reads_total, n_reads_unique = remove_duplicates_from_parquet( +# slices, duplicates, output +# ) + +# # Prepare stats +# df_stats = pd.DataFrame() +# df_stats["stat_type"] = ["not-deduplicated", "deduplicated"] +# df_stats["stat"] = [n_reads_total, n_reads_unique] +# df_stats["sample"] = sample_name +# df_stats["read_type"] = read_type +# df_stats["read_number"] = 0 +# df_stats["stage"] = "deduplicate_slices" +# df_stats.to_csv(f"{stats_prefix}.read.stats.csv", index=False) + +# return df_stats diff --git a/capcruncher/cli/interactions_differential.py b/capcruncher/cli/interactions_differential.py new file mode 100644 index 00000000..4b36fae5 --- /dev/null +++ b/capcruncher/cli/interactions_differential.py @@ -0,0 +1,308 @@ +"""Identifies differential interactions between conditions.""" + +import os +import pandas as pd +import itertools +from loguru import logger + + +import ray +from pydeseq2.dds import DeseqDataSet +from pydeseq2.ds import DeseqStats +from capcruncher.api.pileup import cooler_to_bedgraph + + +@ray.remote +def get_differential_interactions( + counts: 
pd.DataFrame, + design: pd.DataFrame, + contrast: str, + threshold_q: float = 0.05, + lfc_shrink: bool = True, +): + """Runs DESeq2 on interaction counts.""" + # Create DeseqDataSet + dds = DeseqDataSet( + counts=counts.T, + clinical=design, + design_factors=contrast, + ) + + # Run DESeq2 + dds.deseq2() + + # Get results + results = DeseqStats(dds) + df_results = results.summary() + + if lfc_shrink: + df_results = results.lfc_shrink() + + # Filter results + df_results = df_results.loc[lambda df: df["padj"] <= threshold_q] + + # Sort results + df_results = ( + df_results.assign(log2FoldChangeAbs=lambda df: df["log2FoldChange"].abs()) + .sort_values(by=["log2FoldChangeAbs", "padj"], ascending=[False, True]) + .drop(columns=["log2FoldChangeAbs"]) + ) + + # Add coordinates + df_results = df_results.assign( + chrom=lambda df: df.index.str.split(":").str[0], + start=lambda df: df.index.str.split(":") + .str[1] + .str.split("-") + .str[0] + .astype(int), + end=lambda df: df.index.str.split(":").str[1].str.split("-").str[1].astype(int), + ) + + return df_results + + +def differential( + interaction_files: list, + viewpoint: str, + design_matrix: os.PathLike, + output_prefix: os.PathLike = "differential_interactions", + contrast: str = "condition", + regions_of_interest: os.PathLike = None, + viewpoint_distance: int = None, + threshold_count: float = 20, + threshold_q: float = 0.05, +): + """Identifies differential interactions between conditions. + + Parses a list of cooler files containg reporter counts from at least two conditions with + two or more replicates for a single capture probe and outputs differential interaction + results. Following filtering to ensure that the number of interactions is above the required + threshold, PyDeseq2 is used to run a compatison after + fitting a negative binomial model to the interaction counts.The options to filter + results can be filtered by a minimum mean value (threshold_mean) and/or + maximum q-value (threshold-q) are also provided. + + + Args: + interaction_files (list): List of cooler files. + viewpoint (str): Name of capture probe. MUST match one viewpoint within the HDF5 files. + design_matrix (os.PathLike): Design matrix to use for grouping samples. (N_SAMPLES * METADATA). + output_prefix (os.PathLike, optional): Output prefix for differntial interactions. Defaults to 'differential'. + contrast (str, optional): Column to use for grouping. Defaults to 'condition'. + regions_of_interest (os.PathLike, optional): BED file of regions of interest. Defaults to None. + viewpoint_distance (int, optional): Distance from viewpoint to include. Defaults to 500_000. + threshold_count (float, optional): Minimum number of reported interactions required. Defaults to 20. + threshold_q (float, optional): Maximum q-value for output. Defaults to 0.05. + threshold_mean (float, optional): Minimum mean value for output. Defaults to 0. + """ + # Load design matrix + logger.info("Loading design matrix.") + df_design = pd.read_table( + design_matrix, index_col=0, sep="\s+|,|\t", engine="python" + ) + + # Set-up tasks for bedgraph generation + logger.info("Validating viewpoint distance and regions of interest.") + assert len(interaction_files) >= 2, "No interaction files provided." + assert ( + regions_of_interest or viewpoint_distance + ), "No regions of interest or viewpoint distance provided." 
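For reference, the design matrix loaded a few lines above is just a small table indexed by sample name with at least the contrast column (default "condition"). A minimal sketch, with invented sample names and condition labels, parsed the same way as in the function:

# Hypothetical design matrix; comma/whitespace/tab separated with the
# first column used as the index, mirroring the pd.read_table call above.
import io
import pandas as pd

design_txt = """\
sample,condition
DOX-treated_1,DOX
DOX-treated_2,DOX
DMSO-treated_1,DMSO
DMSO-treated_2,DMSO
"""
df_design = pd.read_table(
    io.StringIO(design_txt), index_col=0, sep=r"\s+|,|\t", engine="python"
)
print(df_design["condition"].unique())  # ['DOX' 'DMSO']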
+ + logger.info("Extracting interaction counts.") + + if regions_of_interest: + logger.info( + f"Using supplied regions of interest file {regions_of_interest} to restrict analysis" + ) + else: + logger.info( + f"Using distance from viewpoint of {viewpoint_distance} to restrict analysis" + ) + + bedgraph_futures = dict() + for interaction_file in interaction_files: + file_name = os.path.basename(interaction_file.replace(".hdf5", "")) + future = cooler_to_bedgraph.remote( + clr=f"{interaction_file}::{viewpoint}", + regions_of_interest=regions_of_interest, + viewpoint_distance=viewpoint_distance, + ) + bedgraph_futures[file_name] = future + + # Execute tasks + bedgraphs = {k: ray.get(v) for k, v in bedgraph_futures.items()} + + logger.info("Concatenating interactions.") + # Concatenate bedgraphs + df_counts = pd.concat( + [ + bg.assign( + coord=lambda df: df["chrom"].astype(str) + + ":" + + df["start"].astype(str) + + "-" + + df["end"].astype(str) + ) + .set_index("coord") + .drop(columns=["chrom", "start", "end"]) + .rename(columns={"count": name}) + for name, bg in bedgraphs.items() + ], + axis=1, + ).fillna(0) + + # Filter out any interacting fragments with less than threshold_counts + logger.info(f"Removing interactions with less than {threshold_count} counts.") + df_counts = df_counts.loc[lambda df: (df >= threshold_count).all(axis=1)] + + # At the time of writing. PyDeseq2 doese not support multiple comparisons. + # Therefore, we need to run a separate DESeq2 analysis for each comparison. + + # Get all comparisons + possible_contrasts = df_design[contrast].unique() + comparisons = list(itertools.combinations(possible_contrasts, 2)) + + # Run comparisons + comparison_futures = dict() + for group_a, group_b in comparisons: + # Filter design matrix + df_design_sub = df_design.loc[lambda df: df[contrast].isin([group_a, group_b])] + + # Filter counts + df_counts_sub = df_counts.loc[:, df_design_sub.index] + + # Get differential interactions + result = get_differential_interactions.remote( + df_counts_sub, df_design_sub, contrast, threshold_q=threshold_q + ) + + comparison_futures[(group_a, group_b)] = result + + # Execute tasks + for (group_a, group_b), future in comparison_futures.items(): + logger.info(f"Running comparison: {group_a} vs {group_b}") + df_results = ray.get(future) + + # Write result + df_results.to_csv( + f"{output_prefix}.{group_a}_vs_{group_b}.csv", + sep=",", + index=True, + header=True, + ) + + +# def differential( +# union_bedgraph: os.PathLike, +# capture_name: str, +# capture_viewpoints: os.PathLike, +# output_prefix: os.PathLike = "differential", +# design_matrix: os.PathLike = None, +# grouping_col: str = "condition", +# threshold_count: float = 20, +# threshold_q: float = 0.05, +# threshold_mean: float = 0, +# ): + +# """ +# Identifies differential interactions between conditions. + +# Parses a union bedgraph containg reporter counts from at least two conditions with +# two or more replicates for a single capture probe and outputs differential interaction +# results. Following filtering to ensure that the number of interactions is above the required +# threshold (--threshold_count), diffxpy is used to run a wald test after +# fitting a negative binomial model to the interaction counts.The options to filter +# results can be filtered by a minimum mean value (threshold_mean) and/or +# maximum q-value (threshold-q) are also provided. 
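Because PyDESeq2 is run once per pair of condition labels, the comparisons enumerated earlier in this function grow combinatorially with the number of conditions. A small sketch of that enumeration (the condition names are invented):

# Pairwise contrasts as produced by itertools.combinations; three
# hypothetical conditions give three separate DESeq2 runs.
import itertools

possible_contrasts = ["DMSO", "DOX", "AUXIN"]
comparisons = list(itertools.combinations(possible_contrasts, 2))
print(comparisons)
# [('DMSO', 'DOX'), ('DMSO', 'AUXIN'), ('DOX', 'AUXIN')]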
+ +# Notes: + +# Currently both the capture oligos and the name of the probe being analysed must +# be provided in order to correctly extract cis interactions. + +# If a N_SAMPLE * METADATA design matrix has not been supplied, the script +# assumes that the standard replicate naming structure has been followed +# i.e. SAMPLE_CONDITION_REPLICATE_(1|2).fastq.gz. + +# \f +# Args: +# union_bedgraph (os.PathLike): Union bedgraph containg all samples to be compared. +# capture_name (str): Name of capture probe. MUST match one probe within the supplied oligos. +# capture_viewpoints (os.PathLike): Capture oligos used for the analysis. +# output_prefix (os.PathLike, optional): Output prefix for differntial interactions. Defaults to 'differential'. +# design_matrix (os.PathLike, optional): Design matrix to use for grouping samples. (N_SAMPLES * METADATA). Defaults to None. +# grouping_col (str, optional): Column to use for grouping. Defaults to 'condition'. +# threshold_count (float, optional): Minimum number of reported interactions required. Defaults to 20. +# threshold_q (float, optional): Maximum q-value for output. Defaults to 0.05. +# threshold_mean (float, optional): Minimum mean value for output. Defaults to 0. +# """ + +# df_bdg = pd.read_csv(union_bedgraph, sep="\t") +# df_viewpoints = pd.read_csv( +# capture_viewpoints, sep="\t", names=["chrom", "start", "end", "name"] +# ) + +# # If design matrix present then use it. Else will assume that the standard format has been followed: +# # i.e. NAME_TREATMENT_REPLICATE +# if design_matrix: +# df_design = pd.read_csv(design_matrix, sep="\t") +# else: +# col_dict = {col: "_".join(col.split("_")[:-1]) for col in df_bdg.columns[3:]} +# df_design = pd.Series(col_dict).to_frame(grouping_col) + +# # Only cis interactions +# capture_chrom = get_chromosome_from_name(df_viewpoints, name=capture_name) +# df_bdg_counts = df_bdg.query(f'chrom == "{capture_chrom}"') + +# # Only counts +# df_bdg_counts = df_bdg_counts.iloc[:, 3:] + +# # Only with number of interactions > threshold per group in at least 2 replicates +# df_bdg_counts = ( +# df_bdg_counts.groupby(df_design[grouping_col], axis=1) +# .apply(lambda df: df[(df >= threshold_count).sum(axis=1) >= 2]) +# .fillna(0.0) +# ) + +# # Run differential testing +# count_data = df_bdg_counts.transpose().values.astype(np.float64) +# fragment_names = df_bdg_counts.index.values + +# tests = de.test.pairwise( +# count_data, +# grouping=grouping_col, +# sample_description=df_design, +# gene_names=fragment_names, +# test="wald", +# lazy=False, +# backend="numpy", +# ) + +# # Go through all of the pairwise tests +# for g1, g2 in itertools.combinations(tests.groups, 2): +# df_comparison = tests.summary_pairs( +# groups0=[ +# g1, +# ], +# groups1=[ +# g2, +# ], +# qval_thres=threshold_q, +# mean_thres=threshold_mean, +# ) + +# # Extract the fragment coords +# df_coords = df_bdg.loc[df_comparison["gene"], ["chrom", "start", "end"]] +# # Merge with test results +# df_comparison = df_comparison.merge(df_coords, left_on="gene", right_index=True) +# # Output to tsv +# ( +# df_comparison.drop(columns="gene")[ +# ["chrom", "start", "end", "mean", "log2fc", "pval", "qval"] +# ].to_csv(f"{output_prefix}_{g1}_vs_{g2}.tsv", sep="\t", index=False) +# ) + + +# # if __name__ == '__main__': +# # interactions_differential() diff --git a/capcruncher/cli/reporters_pileup.py b/capcruncher/cli/interactions_pileup.py similarity index 78% rename from capcruncher/cli/reporters_pileup.py rename to capcruncher/cli/interactions_pileup.py index 
db49ca6c..e1979adc 100644 --- a/capcruncher/cli/reporters_pileup.py +++ b/capcruncher/cli/interactions_pileup.py @@ -1,15 +1,14 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -import logging +from loguru import logger import os import subprocess import tempfile -from typing import Literal, Union +from typing import Literal import cooler -from capcruncher.tools.pileup import CoolerBedGraph, CoolerBedGraphWindowed -from capcruncher.tools.storage import CoolerBinner +from capcruncher.api.pileup import CoolerBedGraph def pileup( @@ -48,10 +47,10 @@ def pileup( """ viewpoint_names = viewpoint_names or [ - v.strip("/") for v in cooler.fileops.list_coolers(uri) if not "resolutions" in v + v.strip("/") for v in cooler.fileops.list_coolers(uri) if "resolutions" not in v ] - - logging.info(f"Performing pileup for {viewpoint_names}") + + logger.info(f"Performing pileup for {viewpoint_names}") bin_bedgraph = True if binsize > 0 else False @@ -65,28 +64,28 @@ def pileup( try: cooler.fileops.is_cooler(cooler_group) except Exception as e: - logging.info(f"Exception {e} occured while looking for: {viewpoint_name}") + logger.info(f"Exception {e} occured while looking for: {viewpoint_name}") raise (f"Cannot find {viewpoint_name} in cooler file") - + bedgraph = CoolerBedGraph(uri=cooler_group, sparse=sparse).extract_bedgraph( normalisation=normalisation, region=normalisation_regions, scale_factor=scale_factor, ) - logging.info(f"Generated bedgraph for {viewpoint_name}") + logger.info(f"Generated bedgraph for {viewpoint_name}") if format == "bedgraph": bedgraph.to_csv( - f'{output_prefix}.{viewpoint_name}.bedgraph{".gz" if gzip else ""}', + f'{output_prefix}_{viewpoint_name}.bedgraph{".gz" if gzip else ""}', sep="\t", header=False, index=False, ) - + elif format == "bigwig": - + clr = cooler.Cooler(cooler_group) with tempfile.NamedTemporaryFile() as chromsizes_tmp: @@ -94,10 +93,11 @@ def pileup( clr.chromsizes.to_csv(chromsizes_tmp, sep="\t", header=False) bedgraph.to_csv(bedgraph_tmp, sep="\t", index=False, header=False) - result = subprocess.run(["bedGraphToBigWig", bedgraph_tmp.name, chromsizes_tmp.name, f"{output_prefix}.{viewpoint_name}.bigWig"]) - - - - - - + _result = subprocess.run( + [ + "bedGraphToBigWig", + bedgraph_tmp.name, + chromsizes_tmp.name, + f"{output_prefix}_{viewpoint_name}.bigWig", + ] + ) diff --git a/capcruncher/cli/interactions_store.py b/capcruncher/cli/interactions_store.py new file mode 100644 index 00000000..ee2a197a --- /dev/null +++ b/capcruncher/cli/interactions_store.py @@ -0,0 +1,165 @@ +from loguru import logger +import os +import tempfile +from typing import Tuple +import pandas as pd +import ray +from capcruncher.api.storage import ( + CoolerBinner, + create_cooler_cc, + merge_coolers, +) +import cooler +import warnings + +# warnings.filterwarnings("ignore", category=DeprecationWarning) +# warnings.filterwarnings("ignore", category=FutureWarning) + + +def fragments( + counts: os.PathLike, + fragment_map: os.PathLike, + output: os.PathLike, + viewpoint_path: os.PathLike, + viewpoint_name: str = "", + genome: str = "", + suffix: str = "", +): + """ + Stores restriction fragment interaction combinations at the restriction fragment level. + + Parses reporter restriction fragment interaction counts produced by + "capcruncher reporters count" and gerates a cooler formatted group in an HDF5 File. + See `https://cooler.readthedocs.io/en/latest/` for further details. + + + \f + Args: + counts (os.PathLike): Path to restriction fragment interactions counts .tsv file. 
+ fragment_map (os.PathLike): Path to restriction fragment .bed file, generated with genome-digest command. + output (os.PathLike): Output file path for cooler hdf5 file. + viewpoint_name (str): Name of viewpoint. + viewpoint_path (os.PathLike): Path to viewpoints bed file. + genome (str, optional): Name of genome used for alignment e.g. hg19. Defaults to "". + suffix (str, optional): Suffix to append to filename. Defaults to "". + """ + # Load restriction fragments + df_restriction_fragment_map = pd.read_csv( + fragment_map, + sep="\t", + header=None, + names=["chrom", "start", "end", "name"], + ) + + # Load counts + if counts.endswith(".hdf5"): + + with pd.HDFStore(counts) as store: + + if not viewpoint_name: + viewpoints = {k.split("/")[1] for k in store.keys()} + else: + viewpoints = { + viewpoint_name, + } + + for viewpoint in viewpoints: + df_counts = store[viewpoint] + + create_cooler_cc( + output, + bins=df_restriction_fragment_map, + pixels=df_counts, + viewpoint_name=viewpoint, + viewpoint_path=viewpoint_path, + assembly=genome, + suffix=suffix, + ) + + else: + df_counts = pd.read_csv(counts, sep="\t") + # Create cooler file at restriction fragment resolution + create_cooler_cc( + output, + bins=df_restriction_fragment_map, + pixels=df_counts, + viewpoint_name=viewpoint_name, + viewpoint_path=viewpoint_path, + assembly=genome, + suffix=suffix, + ) + + +@ray.remote(num_cpus=1) +def _bin_cooler(clr_in: os.PathLike, clr_out: os.PathLike, binsize: int, **kwargs): + + clr_binner = CoolerBinner( + cooler_group=clr_in, + binsize=binsize, + **kwargs, + ) + clr_binner.to_cooler(clr_out) + return clr_out + + +def bins( + cooler_path: os.PathLike, + output: os.PathLike, + binsizes: Tuple = None, + normalise: bool = True, + scale_factor: int = 1e6, + overlap_fraction: float = 1e-9, + conversion_tables: os.PathLike = None, + n_cores: int = 1, + **kwargs, +): + """ + Convert a cooler group containing restriction fragments to constant genomic windows + + Parses a cooler group and aggregates restriction fragment interaction counts into + genomic bins of a specified size. If the normalise option is selected, + columns containing normalised counts are added to the pixels table of the output + + \f + Args: + cooler_path (os.PathLike): Path to cooler file. + output (os.PathLike): Path to output cooler file. + binsizes (Tuple, optional): Binsizes to bin cooler file to. Defaults to None. + normalise (bool, optional): Whether to normalise counts. Defaults to True. + scale_factor (int, optional): Scale factor for normalisation. Defaults to 1e6. + overlap_fraction (float, optional): Minimum overlap fraction for binning. Defaults to 1e-9. + conversion_tables (os.PathLike, optional): Path to conversion tables. Defaults to None. + n_cores (int, optional): Number of cores to use. Defaults to 1. 
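Both commands in this module address per-viewpoint groups inside a single HDF5 file via cooler's "path::group" URI syntax. A minimal sketch of opening one such group; the file name and viewpoint here are invented:

# Open one viewpoint group from a CapCruncher cooler file; the path and
# group name are hypothetical.
import cooler

clr = cooler.Cooler("sample1.hdf5::/Slc25A37")
print(clr.binsize)            # None while bins are variable-size restriction fragments
print(clr.chromnames[:3])     # first few chromosome names
pixels_head = clr.pixels()[:10]  # first ten pixels as a pandas DataFrame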
+ + """ + clr_groups = cooler.api.list_coolers(cooler_path) + + assert clr_groups, "No cooler groups found in file" + assert binsizes, "No binsizes provided" + + ray.init(num_cpus=n_cores, ignore_reinit_error=True) + clr_tempfiles = [] + + for binsize in binsizes: + for clr_group in clr_groups: + + logger.info(f"Processing {clr_group}") + clr_in = cooler.Cooler(f"{cooler_path}::{clr_group}") + clr_out = tempfile.NamedTemporaryFile().name + + # TODO: Integrate these ino the CLI + default_kwargs = dict( + method="midpoint", + minimum_overlap=0.51, + n_cis_interaction_correction=True, + n_rf_per_bin_correction=True, + scale_factor=1_000_000, + ) + + clr_tempfiles.append( + _bin_cooler.remote(clr_in, clr_out, binsize, **default_kwargs) + ) + + # Final cooler output + clr_tempfiles = ray.get(clr_tempfiles) + merge_coolers(clr_tempfiles, output) diff --git a/capcruncher/cli/plot.py b/capcruncher/cli/plot.py index 2c729538..1ee0ecf6 100644 --- a/capcruncher/cli/plot.py +++ b/capcruncher/cli/plot.py @@ -1,205 +1,112 @@ import os +import pathlib import sys -from collections import namedtuple, OrderedDict -import matplotlib +import numpy as np import pandas as pd -import yaml -import seaborn as sns - -import coolbox.api as cb -from capcruncher.tools.plotting import ( - ScaleBar, - SimpleBed, - CCBigWig, - CCBigWigCollection, - CCMatrix, - XAxisGenomic, -) - - -def make_template( - files: list, - output_prefix: str, - design_matrix=None, - analysis_method="tiled", - viewpoint=None, - binsize=None, -): - - bed = namedtuple( - "Bed", field_names=["file", "color", "type"], defaults=["black", "bed"] - ) - bw = namedtuple( - "BigWig", - field_names=["file", "color", "type", "style", "min_value", "max_value"], - defaults=["blue", "bigWig", "fragment", "auto", "auto"], - ) - bwc = namedtuple( - "BigWigCollection", - field_names=[ - "file", - "color", - "type", - "smooth_window", - "min_value", - "max_value", - ], - defaults=["blue", "bigWigCollection", 2001, "auto", "auto"], - ) - heatmap = namedtuple( - "Heatmap", - field_names=[ - "file", - "viewpoint", - "binsize", - "assay", - "normalization", - "remove_viewpoint", - "type", - "color", - "style", - "min_value", - "max_value", - "transform", - "orientation", - ], - defaults=[ - viewpoint if viewpoint else "VIEWPOINT", - int(binsize) if binsize else "BINSIZE", - analysis_method, - "icen", - False, - "heatmap", - "jet", - "triangular", - "auto", - "auto", - "no", - "normal", - ], - ) - genes = namedtuple( - "genes", - field_names=["file", "type", "style", "gene_style", "color", "display", "fontsize"], - defaults=["genes", "gene", "normal", "bed_rgb", "stacked", 7.5], - ) - - extensions_to_track_mapping = { - "genes.bed": genes, - ".bed": bed, - ".bigWig": bw, - ".hdf5": heatmap, - } - - tracks = OrderedDict() - processed_files = set() - - # Find any gene specifiers as these need to go first - for fn in files: - if "genes.bed" in fn: - tracks[os.path.basename(fn)] = genes(file=fn) - processed_files.add(fn) - - # Deal with any files that need to be grouped together to form a summary dataframe - if design_matrix: - - # Assuming design matrix has columns: sample condition - df_design = pd.read_csv(design_matrix, sep=r",|\s+|\t", index_col="sample", engine="python") - df_fnames = ( - pd.Series(files).loc[lambda ser: ser.str.contains(".bigWig")].to_frame("fn") - ) - - # Extract sample name, normalisation and viewpoint from each file name - df_fnames["basename"] = df_fnames["fn"].apply(os.path.basename) - df_fnames = df_fnames.join( - 
df_fnames["basename"].str.extract( - r"^(?P.*?)\.(?P.*?)\.(?P.*?)\.bigWig$" - ) - ) - df_fnames = df_fnames.set_index("samplename") - df_fnames = df_fnames.join(df_design["condition"]) - - # Need to ignore any subtraction bigWig files as these will interfere - df_fnames = df_fnames.loc[ - lambda df: ~df["normalisation"].str.contains("-subtraction") - ] - - # Get random colors for each plot - colors = [ - matplotlib.colors.to_hex(c) for c in sns.palettes.hls_palette(n_colors=12) - ] - - # Make bigwig collection and append to template - for ((condition, viewpoint), df), color in zip( - df_fnames.groupby(["condition", "viewpoint"]), colors - ): - fnames = df["fn"].to_list() - processed_files.update(fnames) - tracks[f"{condition}_{viewpoint}"] = bwc(file=fnames, color=color) - - # Deal with the rest of the files - for fn in files: - - if not fn in processed_files: - - fn_base = os.path.basename(fn).replace(".gz", "") - fn_no_ext, ext = os.path.splitext(fn_base) - track_type = extensions_to_track_mapping.get(ext, None) - - if track_type: - tracks[fn_no_ext] = track_type(**{"file": fn}) - - else: - raise ValueError(f"Track extension {ext} not supported at the moment") - - tracks_for_output = {k: v._asdict() for k, v in tracks.items()} - with open(f"{output_prefix}.yml", "w") as w: - yaml.dump(tracks_for_output, w, sort_keys=False) - - -def plot_reporters( - region: str, config: os.PathLike, output: str, x_axis=False, no_scale_bar=False, -): - - track_type_to_track_class_mapping = { - "bigWig": CCBigWig, - "bigWigCollection": CCBigWigCollection, - "heatmap": CCMatrix, - "bed": SimpleBed, - "genes": cb.BED, - } - - chrom = region.split(":")[0] - start, end = [int(x) for x in region.split(":")[1].replace(",", "").split("-")] - - with open(config, "r") as r: - tracks = yaml.safe_load(r) - - # Perform the plotting - frame = cb.Frame() - - if not no_scale_bar: - frame.add_track(ScaleBar()) - else: - # Just add a spacer if no scale bar - frame.add_track(cb.Spacer()) - - for ii, (track_name, track_details) in enumerate(tracks.items()): - - track_type = track_details["type"] - track_class = track_type_to_track_class_mapping[track_type] - - if track_class is cb.BED: - # Don't give the bed file a title - track = track_class(**track_details, title=" ") - else: - track = track_class(**track_details, title=track_name) - - frame.add_track(track) - frame.add_track(cb.Spacer()) - - if x_axis: - frame.add_track(XAxisGenomic()) - - figure = frame.plot(chrom, start, end) - figure.savefig(output) +import matplotlib.pyplot as plt + +import capcruncher.api.plotting as cp +from loguru import logger + +# def make_template( +# files: list, +# output_prefix: str, +# design_matrix=None, +# analysis_method="tiled", +# viewpoint=None, +# binsize=None, +# genes=None, +# ): + + +# tracks = [] +# tracks.append(cp.CCTrack(None, type="scale")) + +# if analysis_method != "tiled" and genes: +# tracks.append(cp.CCTrack(genes, type="genes")) + + +# ## Design matrix + +# # If a design matrix is provided, we need to group the files together +# if design_matrix: +# df_design = pd.read_csv( +# design_matrix, sep=r",|\s+|\t", index_col="sample", engine="python" +# ) + + +# # Deal with any files that need to be grouped together to form a summary dataframe +# if design_matrix: + +# # Assuming design matrix has columns: sample condition +# df_design = pd.read_csv( +# design_matrix, sep=r",|\s+|\t", index_col="sample", engine="python" +# ) +# df_fnames = ( +# pd.Series(files).loc[lambda ser: ser.str.contains(".bigWig")].to_frame("fn") +# ) + 
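For reference, the bigWig file names in this template code are split into sample name, normalisation and viewpoint. A sketch of a pattern that does this; the named groups are inferred from the columns used by the surrounding code, and the example file name is invented:

# Presumed form of the extraction pattern; the group names match the
# columns referenced afterwards (samplename, normalisation, viewpoint).
import re

pattern = r"^(?P<samplename>.*?)\.(?P<normalisation>.*?)\.(?P<viewpoint>.*?)\.bigWig$"
match = re.match(pattern, "DOX-treated_1.raw.Slc25A37.bigWig")
print(match.groupdict())
# {'samplename': 'DOX-treated_1', 'normalisation': 'raw', 'viewpoint': 'Slc25A37'}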
+# # Extract sample name, normalisation and viewpoint from each file name +# df_fnames["basename"] = df_fnames["fn"].apply(os.path.basename) +# df_fnames = df_fnames.join( +# df_fnames["basename"].str.extract( +# r"^(?P.*?)\.(?P.*?)\.(?P.*?)\.bigWig$" +# ) +# ) +# df_fnames = df_fnames.set_index("samplename") +# df_fnames = df_fnames.join(df_design["condition"]) + +# # Need to ignore any subtraction bigWig files as these will interfere +# df_fnames = df_fnames.loc[ +# lambda df: ~df["normalisation"].str.contains("-subtraction") +# ] + +# # Get random colors for each plot +# colors = [ +# matplotlib.colors.to_hex(c) for c in sns.palettes.hls_palette(n_colors=12) +# ] + +# # Make bigwig collection and append to template +# for ((condition, viewpoint), df), color in zip( +# df_fnames.groupby(["condition", "viewpoint"]), colors +# ): +# fnames = df["fn"].to_list() +# processed_files.update(fnames) +# tracks[f"{condition}_{viewpoint}"] = bwc(file=fnames, color=color) + +# # Deal with the rest of the files +# for fn in files: + +# if fn not in processed_files: + +# fn_base = os.path.basename(fn).replace(".gz", "") +# fn_no_ext, ext = os.path.splitext(fn_base) +# track_type = extensions_to_track_mapping.get(ext, None) + +# if track_type: +# tracks[fn_no_ext] = track_type(**{"file": fn}) + +# else: +# raise ValueError(f"Track extension {ext} not supported at the moment") + +# tracks_for_output = {k: v._asdict() for k, v in tracks.items()} +# with open(f"{output_prefix}.yml", "w") as w: +# yaml.dump(tracks_for_output, w, sort_keys=False) + + +def plot( + region: str, + template: os.PathLike, + output: str, +) -> None: + """Plot a region using a template. + + Args: + region (str): Genomic region to plot. + template (os.PathLike): Path to template file. + output (str): Path to output file. + + """ + + fig = cp.CCFigure.from_toml(template) + fig.save(region, output=output) diff --git a/capcruncher/cli/reporters_count.py b/capcruncher/cli/reporters_count.py deleted file mode 100644 index ede18904..00000000 --- a/capcruncher/cli/reporters_count.py +++ /dev/null @@ -1,231 +0,0 @@ -from concurrent.futures import thread -import xopen -import os -import logging -import tempfile -import glob -import more_itertools - -import pandas as pd -from capcruncher.tools.io import ( - CCHDF5ReaderProcess, - CCParquetReaderProcess, - FragmentCountingProcess, - CCCountsWriterProcess, -) -import capcruncher.cli.reporters_store -import capcruncher.tools.storage -from capcruncher.tools.count import get_counts_from_tsv, get_counts_from_tsv_by_batch -from capcruncher.utils import get_categories_from_hdf5_column, get_file_type - -def get_number_of_reader_threads(n_cores): - threads = (n_cores // 2) - - if 1 < threads < 4: - threads = threads - elif threads < 1: - threads = 1 - else: - threads = 4 - - return threads - - - - -def count( - reporters: os.PathLike, - output: os.PathLike = "counts.tsv", - file_type: str = "auto", - remove_exclusions: bool = False, - remove_capture: bool = False, - subsample: int = 0, - low_memory: bool = False, - chunksize: int = 2e6, - output_as_cooler: bool = False, - fragment_map: os.PathLike = None, - viewpoint_path: os.PathLike = None, - n_cores: int = 1, -): - """ - Determines the number of captured restriction fragment interactions genome wide. - - Parses a reporter slices tsv and counts the number of unique restriction fragment - interaction combinations that occur within each fragment. - - Options to ignore unwanted counts e.g. excluded regions or capture fragments are provided. 
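A toy illustration of what "restriction fragment interaction combinations ... within each fragment" means; given the bin1_id/bin2_id output columns written below, the combinations are presumably counted pairwise, and the fragment ids here are invented:

# Count pairwise restriction-fragment combinations per parental read.
import itertools
from collections import Counter

read_to_restriction_fragments = {"read_1": [10, 42, 97], "read_2": [10, 42]}
counts = Counter()
for rf_ids in read_to_restriction_fragments.values():
    counts.update(itertools.combinations(sorted(rf_ids), 2))
print(counts)
# Counter({(10, 42): 2, (10, 97): 1, (42, 97): 1})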
- In addition the number of reporter fragments can be subsampled if required. - - \f - Args: - reporters (os.PathLike): Reporter tsv file path. - output (os.PathLike, optional): Output file path for interaction counts tsv. Defaults to 'counts.tsv'. - remove_exclusions (bool, optional): Removes regions marked as capture proximity exclusions before counting. Defaults to False. - remove_capture (bool, optional): Removes all capture fragments before counting. Defaults to False. - subsample (int, optional): Subsamples the fragments by the specified fraction. Defaults to 0 i.e. No subsampling. - """ - - input_file_type = get_file_type(reporters) if file_type == "auto" else file_type - output_file_type = get_file_type(output) - - if output_file_type != "hdf5" and not output_as_cooler: - raise NotImplementedError("Currently only cooler output supported") - - - if output_file_type == "tsv": - with xopen.xopen(output, mode="wb", threads=4) as writer: - - # Write output file header. - header = "\t".join(["bin1_id", "bin2_id", "count"]) + "\n" - writer.write(header.encode()) - - if input_file_type == "tsv": - - if low_memory: - counts = get_counts_from_tsv_by_batch( - reporters=reporters, - chunksize=chunksize, - remove_capture=remove_capture, - remove_exclusions=remove_exclusions, - subsample=subsample, - ) - else: - counts = get_counts_from_tsv( - reporters=reporters, - remove_capture=remove_capture, - remove_exclusions=remove_exclusions, - subsample=subsample, - ) - - for (rf1, rf2), count in counts.items(): - line = "\t".join([str(rf1), str(rf2), str(count)]) + "\n" - writer.write(line.encode()) - else: - raise NotImplementedError("Currently only tsv -> tsv supported") - - elif output_file_type == "hdf5": - - import multiprocessing - - viewpoints_queue = multiprocessing.Queue() - slices_queue = multiprocessing.Queue() - counts_queue = multiprocessing.Queue() - - if input_file_type == "hdf5": - viewpoints = get_categories_from_hdf5_column( - reporters, "slices", "viewpoint" - ) - reader = CCHDF5ReaderProcess( - path=reporters, key="slices", inq=viewpoints_queue, outq=slices_queue - ) - - elif input_file_type == "parquet": - import pyarrow - import dask.dataframe as dd - - logging.info(f"Examining viewpoints from parquet file: {reporters}") - # Unsure of the best way to do this. 
Will just load the first partion vp column and extract - ddf = dd.read_parquet( - reporters, - columns=[ - "viewpoint", - ], - engine="pyarrow" - ) - - viewpoints_col = ddf["viewpoint"].cat.as_known() - viewpoint_sizes = viewpoints_col.value_counts().compute() - viewpoints = list(viewpoints_col.cat.categories) - - logging.info(f"Number of viewpoints: {len(viewpoints)}") - logging.info(f"Number of slices per viewpoint: {viewpoint_sizes.to_dict()}") - - MAX_SLICE_NUMBER = 1e6 - - if ( - viewpoint_sizes[viewpoint_sizes > MAX_SLICE_NUMBER].shape[0] == 0 - ): # Less than MAX_SLICE_NUMBER slices, read in batch - mode = "batch" - else: - # More than MAX_SLICE_NUMBER slices, read by partition - mode = "partition" - - reader = CCParquetReaderProcess( - path=reporters, - inq=viewpoints_queue, - outq=slices_queue, - selection_mode=mode, - ) - - # Multiprocessing set-up - n_reading_threads = get_number_of_reader_threads(n_cores=n_cores) - pyarrow.set_cpu_count(n_reading_threads) - - n_worker_processes = ((n_cores - n_reading_threads) // 2) - n_counting_processes = n_worker_processes if n_worker_processes > 1 else 1 - n_writing_processes = n_counting_processes - - logging.info(f"Starting {n_reading_threads} reader threads") - - counters = [ - FragmentCountingProcess( - inq=slices_queue, - outq=counts_queue, - remove_capture=remove_capture, - remove_exclusions=remove_exclusions, - subsample=subsample, - ) - for i in range(n_counting_processes) - ] - - tmpdir = tempfile.TemporaryDirectory() - writers = [ - CCCountsWriterProcess( - inq=counts_queue, - output_format="cooler" if output_as_cooler else file_type, - restriction_fragment_map=fragment_map, - viewpoint_path=viewpoint_path, - tmpdir=tmpdir.name, - ) - for i in range(n_writing_processes) - ] - - # Start all processes - processes = [*writers, *counters, reader] - for process in processes: - process.start() - - # Add all viewpoints to queue - if input_file_type == "hdf5": # Not updated this to cope with batches - for vp in viewpoints: - viewpoints_queue.put(vp) - elif input_file_type == "parquet": - for vp_chunk in more_itertools.chunked(viewpoints, 10): - viewpoints_queue.put(vp_chunk) - - viewpoints_queue.put(None) # Sentinel value to signal end of viewpoints - reader.join() - - # End the counting inqueue - for _ in range(n_cores): - slices_queue.put((None, None)) - - # Join the counters - for counter in counters: - counter.join() - - # End the counting queue - for _ in range(n_cores): - counts_queue.put((None, None)) - - # Join the writers - for writer in writers: - writer.join() - - # Merge the output files together - # TODO: Allow other than cooler outputs - logging.info(f"Making final cooler at {output}") - output_files = glob.glob(os.path.join(tmpdir.name, "*.hdf5")) - capcruncher.cli.reporters_store.merge(output_files, output=output) - - tmpdir.cleanup() diff --git a/capcruncher/cli/reporters_differential.py b/capcruncher/cli/reporters_differential.py deleted file mode 100644 index ef4ee250..00000000 --- a/capcruncher/cli/reporters_differential.py +++ /dev/null @@ -1,120 +0,0 @@ -import os -import pandas as pd -import itertools -import numpy as np - - -def get_chromosome_from_name(df: pd.DataFrame, name: str): - chrom = (df.query(f'name == "{name}"') - ['chrom'] - .iloc[0]) - return chrom - - -def differential(union_bedgraph: os.PathLike, - capture_name: str, - capture_viewpoints: os.PathLike, - output_prefix: os.PathLike = 'differential', - design_matrix: os.PathLike = None, - grouping_col: str = 'condition', - threshold_count: float = 20, - 
threshold_q: float = 0.05, - threshold_mean: float = 0): - - """ - Identifies differential interactions between conditions. - - Parses a union bedgraph containg reporter counts from at least two conditions with - two or more replicates for a single capture probe and outputs differential interaction - results. Following filtering to ensure that the number of interactions is above the required - threshold (--threshold_count), diffxpy is used to run a wald test after - fitting a negative binomial model to the interaction counts.The options to filter - results can be filtered by a minimum mean value (threshold_mean) and/or - maximum q-value (threshold-q) are also provided. - - Notes: - - Currently both the capture oligos and the name of the probe being analysed must - be provided in order to correctly extract cis interactions. - - If a N_SAMPLE * METADATA design matrix has not been supplied, the script - assumes that the standard replicate naming structure has been followed - i.e. SAMPLE_CONDITION_REPLICATE_(1|2).fastq.gz. - - \f - Args: - union_bedgraph (os.PathLike): Union bedgraph containg all samples to be compared. - capture_name (str): Name of capture probe. MUST match one probe within the supplied oligos. - capture_viewpoints (os.PathLike): Capture oligos used for the analysis. - output_prefix (os.PathLike, optional): Output prefix for differntial interactions. Defaults to 'differential'. - design_matrix (os.PathLike, optional): Design matrix to use for grouping samples. (N_SAMPLES * METADATA). Defaults to None. - grouping_col (str, optional): Column to use for grouping. Defaults to 'condition'. - threshold_count (float, optional): Minimum number of reported interactions required. Defaults to 20. - threshold_q (float, optional): Maximum q-value for output. Defaults to 0.05. - threshold_mean (float, optional): Minimum mean value for output. Defaults to 0. - """ - - import diffxpy.api as de - - df_bdg = pd.read_csv(union_bedgraph, sep='\t') - df_viewpoints = pd.read_csv(capture_viewpoints, sep='\t', names=['chrom', 'start', 'end', 'name']) - - # If design matrix present then use it. Else will assume that the standard format has been followed: - # i.e. 
NAME_TREATMENT_REPLICATE - if design_matrix: - df_design = pd.read_csv(design_matrix, sep='\t') - else: - col_dict = {col: '_'.join(col.split('_')[:-1]) for col in df_bdg.columns[3:]} - df_design = pd.Series(col_dict).to_frame(grouping_col) - - - # Only cis interactions - capture_chrom = get_chromosome_from_name(df_viewpoints, name=capture_name) - df_bdg_counts = df_bdg.query(f'chrom == "{capture_chrom}"') - - # Only counts - df_bdg_counts = df_bdg_counts.iloc[:, 3:] - - # Only with number of interactions > threshold per group in at least 2 replicates - df_bdg_counts = (df_bdg_counts.groupby(df_design[grouping_col], axis=1) - .apply(lambda df: df[(df >= threshold_count).sum(axis=1) >= 2]) - .fillna(0.0)) - - # Run differential testing - count_data = df_bdg_counts.transpose().values.astype(np.float64) - fragment_names = df_bdg_counts.index.values - - tests = de.test.pairwise(count_data, - grouping=grouping_col, - sample_description=df_design, - gene_names=fragment_names, - test='wald', - lazy=False, - backend='numpy') - - # Go through all of the pairwise tests - for g1, g2 in itertools.combinations(tests.groups, 2): - df_comparison = tests.summary_pairs(groups0=[g1,], - groups1=[g2,], - qval_thres=threshold_q, - mean_thres=threshold_mean) - - # Extract the fragment coords - df_coords = df_bdg.loc[df_comparison['gene'], ['chrom', 'start', 'end']] - # Merge with test results - df_comparison = df_comparison.merge(df_coords, left_on='gene', right_index=True) - # Output to tsv - (df_comparison.drop(columns='gene') - [['chrom', 'start', 'end', 'mean', 'log2fc', 'pval', 'qval']] - .to_csv(f'{output_prefix}_{g1}_vs_{g2}.tsv', sep='\t', index=False)) - - - -# if __name__ == '__main__': -# interactions_differential() - - - - - - diff --git a/capcruncher/cli/reporters_store.py b/capcruncher/cli/reporters_store.py deleted file mode 100644 index bbf93f9f..00000000 --- a/capcruncher/cli/reporters_store.py +++ /dev/null @@ -1,281 +0,0 @@ -import logging -import os -import pickle -import tempfile -from typing import Iterable, Tuple - -import h5py -import pandas as pd -from capcruncher.tools.storage import ( - CoolerBinner, - GenomicBinner, - create_cooler_cc, - link_common_cooler_tables, -) -import cooler -import ujson - -def get_merged_metadata(coolers: Iterable[os.PathLike]): - """ - Merges metadata from multiple coolers. - """ - # Get metadata from all coolers and copy to the merged file - metadata = {} - for cooler_uri in coolers: - filepath, group = cooler_uri.split("::") - - with h5py.File(filepath, mode="r") as src: - metadata_src = ujson.decode(src[group].attrs["metadata"]) - - for metadata_key, metadata_value in metadata_src.items(): - - if isinstance(metadata_value, str): - metadata[metadata_key] = metadata_value - - elif isinstance(metadata_value, Iterable): - if metadata_key not in metadata: - metadata[metadata_key] = [] - metadata[metadata_key].extend(metadata_value) - else: - metadata[metadata_key].extend( - [ - v - for v in metadata_value - if v not in metadata[metadata_key] - ] - ) - - elif isinstance(metadata_value, (int, float)): - if metadata_key not in metadata: - metadata[metadata_key] = metadata_value - else: - metadata[metadata_key] += metadata_value - - return metadata - - -def fragments( - counts: os.PathLike, - fragment_map: os.PathLike, - output: os.PathLike, - viewpoint_path: os.PathLike, - viewpoint_name: str = "", - genome: str = "", - suffix: str = "", -): - """ - Stores restriction fragment interaction combinations at the restriction fragment level. 
- - Parses reporter restriction fragment interaction counts produced by - "capcruncher reporters count" and gerates a cooler formatted group in an HDF5 File. - See `https://cooler.readthedocs.io/en/latest/` for further details. - - - \f - Args: - counts (os.PathLike): Path to restriction fragment interactions counts .tsv file. - fragment_map (os.PathLike): Path to restriction fragment .bed file, generated with genome-digest command. - output (os.PathLike): Output file path for cooler hdf5 file. - viewpoint_name (str): Name of viewpoint. - viewpoint_path (os.PathLike): Path to viewpoints bed file. - genome (str, optional): Name of genome used for alignment e.g. hg19. Defaults to "". - suffix (str, optional): Suffix to append to filename. Defaults to "". - """ - # Load restriction fragments - df_restriction_fragment_map = pd.read_csv( - fragment_map, - sep="\t", - header=None, - names=["chrom", "start", "end", "name"], - ) - - # Load counts - if counts.endswith(".hdf5"): - - with pd.HDFStore(counts) as store: - - if not viewpoint_name: - viewpoints = {k.split("/")[1] for k in store.keys()} - else: - viewpoints = { - viewpoint_name, - } - - for viewpoint in viewpoints: - df_counts = store[viewpoint] - - create_cooler_cc( - output, - bins=df_restriction_fragment_map, - pixels=df_counts, - viewpoint_name=viewpoint, - viewpoint_path=viewpoint_path, - assembly=genome, - suffix=suffix, - ) - - else: - df_counts = pd.read_csv(counts, sep="\t") - # Create cooler file at restriction fragment resolution - create_cooler_cc( - output, - bins=df_restriction_fragment_map, - pixels=df_counts, - viewpoint_name=viewpoint_name, - viewpoint_path=viewpoint_path, - assembly=genome, - suffix=suffix, - ) - - -def bins( - cooler_path: os.PathLike, - output: os.PathLike, - binsizes: Tuple = None, - normalise: bool = True, - scale_factor: int = 1e6, - overlap_fraction: float = 1e-9, - conversion_tables: os.PathLike = None, - **kwargs, -): - """ - Convert a cooler group containing restriction fragments to constant genomic windows - - Parses a cooler group and aggregates restriction fragment interaction counts into - genomic bins of a specified size. If the normalise option is selected, - columns containing normalised counts are added to the pixels table of the output - - - Notes: - To avoid repeatedly calculating restriction fragment to bin conversions, - bin conversion tables (a .pkl file containing a dictionary of - `:class:capcruncher.tools.storage.GenomicBinner` objects, one per binsize) can be supplied. - - - Args: - cooler_fn (os.PathLike): Path to cooler file. Nested coolers can be specified by STORE_FN.hdf5::/PATH_TO_COOLER - output (os.PathLike): Path for output binned cooler file. - binsizes (Tuple, optional): Genomic window sizes to use for binning. Defaults to None. - normalise (bool, optional): Normalise the number of interactions to total number of cis interactions (True). Defaults to False. - n_cores (int, optional): Number of cores to use for binning. Performed in parallel by chromosome. Defaults to 1. - scale_factor (int, optional): Scaling factor to use for normalising interactions. Defaults to 1e6. - overlap_fraction (float, optional): Minimum fraction to use for defining overlapping bins. Defaults to 1e-9. 
- """ - clr_groups = cooler.api.list_coolers(cooler_path) - - if conversion_tables: - logging.info("Loading conversion tables") - with open(conversion_tables, "rb") as f: - try: - conversion_tables = pickle.load(f) - except: - raise ValueError("Conversion tables are not in a valid format.") - else: - logging.info("Generating conversion tables") - clr_example = cooler.Cooler(f"{cooler_path}::{clr_groups[0]}") - conversion_tables = { - binsize: GenomicBinner( - chromsizes=clr_example.chromsizes, - fragments=clr_example.bins()[:], - binsize=binsize, - ) - for binsize in binsizes - } - - clr_tempfiles = [] - for binsize in binsizes: - for clr_group in clr_groups: - - binning_output = tempfile.NamedTemporaryFile().name - - logging.info(f"Processing {clr_group}") - clr = cooler.Cooler(f"{cooler_path}::{clr_group}") - clr_binner = CoolerBinner( - cooler_group=clr, binner=conversion_tables[binsize] - ) - - if normalise: - clr_binner.normalise(scale_factor=scale_factor) - - clr_binner.to_cooler(binning_output) - clr_tempfiles.append(binning_output) - - # Final cooler output - merge(clr_tempfiles, output) - - -def merge(coolers: Tuple, output: os.PathLike): - """ - Merges capcruncher cooler files together. - - Produces a unified cooler with both restriction fragment and genomic bins whilst - reducing the storage space required by hard linking the "bins" tables to prevent duplication. - - Args: - coolers (Tuple): Cooler files produced by either the fragments or bins subcommands. - output (os.PathLike): Path from merged cooler file. - """ - from collections import defaultdict - import cooler - - logging.info("Merging cooler files") - - coolers_to_merge = defaultdict(list) - - # Remove output file as need to append to it. - if os.path.exists(output): - os.unlink(output) - - # Extract a list of coolers to merge, grouped by viewpoint name - for clr in coolers: - with h5py.File(clr, mode="r") as src: - viewpoints = list(src.keys()) - - for viewpoint in viewpoints: - if not "resolutions" in list(src[viewpoint].keys()): - coolers_to_merge[viewpoint].append(f"{clr}::/{viewpoint}") - else: - for resolution in src[viewpoint]["resolutions"].keys(): - coolers_to_merge[f"{viewpoint}::{resolution}"].append( - f"{clr}::/{viewpoint}/resolutions/{resolution}" - ) - - # Initial pass to perform copying for all coolers without a matching group - need_merging = list() - with h5py.File(output, mode="w") as dest: - for ii, (viewpoint, cooler_uris) in enumerate(coolers_to_merge.items()): - - if len(cooler_uris) < 2: # Only merge if two or more, else just copy - (file_path, group_path) = cooler_uris[0].split("::") - - with h5py.File(file_path, mode="r") as src: - src.copy(src[group_path], dest, group_path) - - else: - need_merging.append(viewpoint) - - # Actually merge the coolers left over that do have duplicates - for viewpoint in need_merging: - tmp = tempfile.NamedTemporaryFile().name - cooler_uris = coolers_to_merge[viewpoint] - cooler.merge_coolers( - f"{tmp}::/{viewpoint.replace('::', '/resolutions/')}", - cooler_uris, - mergebuf=int(1e6), - ) - - with h5py.File(tmp, mode="r") as src: - with h5py.File(output, mode="a") as dest: - dest.copy( - src[viewpoint.replace("::", "/resolutions/")], dest, viewpoint - ) - - metadata = get_merged_metadata(cooler_uris) - - with h5py.File(output, mode="a") as dest: - dest[viewpoint.replace("::", "/resolutions/")].attrs[ - "metadata" - ] = ujson.encode(metadata) - - # Reduce space by linking common tables (bins, chroms) - link_common_cooler_tables(output) diff --git 
a/capcruncher/data/logo.txt b/capcruncher/data/logo.txt new file mode 100644 index 00000000..ba17f781 --- /dev/null +++ b/capcruncher/data/logo.txt @@ -0,0 +1,23 @@ + + /$$$$$$ /$$$$$$ /$$ + /$$__ $$ /$$__ $$ | $$ +| $$ \__/ /$$$$$$ /$$$$$$ | $$ \__/ /$$$$$$ /$$ /$$ /$$$$$$$ /$$$$$$$| $$$$$$$ /$$$$$$ /$$$$$$ +| $$ |____ $$ /$$__ $$| $$ /$$__ $$| $$ | $$| $$__ $$ /$$_____/| $$__ $$ /$$__ $$ /$$__ $$ +| $$ /$$$$$$$| $$ \ $$| $$ | $$ \__/| $$ | $$| $$ \ $$| $$ | $$ \ $$| $$$$$$$$| $$ \__/ +| $$ $$ /$$__ $$| $$ | $$| $$ $$| $$ | $$ | $$| $$ | $$| $$ | $$ | $$| $$_____/| $$ +| $$$$$$/| $$$$$$$| $$$$$$$/| $$$$$$/| $$ | $$$$$$/| $$ | $$| $$$$$$$| $$ | $$| $$$$$$$| $$ + \__/$$_/ \_______/| $$____/ \______/ |__/ /$$$_/$$$_/ |__/ |__/ \_______/|__/ |__/ \_______/|__/ + /$$/ | $$ /$$_/|_ $$ + /$$/ | $$ /$$/ \ $$ + /$$/ /$$$$$$ /$$$$$$_/$$$$$$ /$$$$$$ /$$$$$$| $$ | $$ +| $$|______/|______/|______/|______/|______/| $$ | $$ + \ $$ | $$ /$$/ + \ $$ \ $$$ /$$$/ + \__/ \___/|/$$//$$ + /$$//$$/ + /$$$$ /$$$$ /$$$$ /$$$$ /$$$$ /$$$$ /$$$$ /$$$$ /$$//$$//$$$$ /$$$$ /$$$$ /$$$$ /$$$$ /$$$$ /$$$$ /$$$$ /$$$$ +|____/|____/|____/|____/|____/|____/|____/|____/ /$$//$$/|____/|____/|____/|____/|____/|____/|____/|____/|____/ + /$$$$ /$$$$ /$$$$ /$$$$ /$$$$ /$$$$ /$$$$ /$$$$ /$$//$$/ /$$$$ /$$$$ /$$$$ /$$$$ /$$$$ /$$$$ /$$$$ /$$$$ /$$$$ +|____/|____/|____/|____/|____/|____/|____/|____//$$//$$/ |____/|____/|____/|____/|____/|____/|____/|____/|____/ + /$$//$$/ + |__/|__/ diff --git a/capcruncher/pipeline/__init__.py b/capcruncher/pipeline/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/capcruncher/pipeline/config/cookiecutter.json b/capcruncher/pipeline/config/cookiecutter.json new file mode 100644 index 00000000..c1e9a943 --- /dev/null +++ b/capcruncher/pipeline/config/cookiecutter.json @@ -0,0 +1,33 @@ +{ "__version": 2.0, + "user_name": "Your Name", + "method": ["Capture-C", "Tri-C", "Tiled-C"], + "__assay": "{{cookiecutter.method.lower().split('-')[0]}}", + "date": "{% now 'utc', '%Y-%m-%d' %}", + "project_name": "Project name", + "__project_id": "{{ cookiecutter.project_name.lower().replace(' ', '_') }}", + "design": "PATH TO DESIGN FILE (optional)", + "viewpoints": "PATH TO VIEWPOINTS FILE", + "genome": "hg38", + "is_custom_genome": ["no", "yes"], + "genome_organism": "Species name (optional)", + "genome_fasta": "PATH TO GENOME FASTA ", + "genome_chromosome_sizes": "PATH TO CHROMSOME SIZES", + "genome_indicies": "PATH TO BOWTIE2 INDICIES (include the prefix)", + "genome_two_bit": "PATH TO 2BIT FILE (ignore if not a custom genome)", + "restriction_enzyme": "dpnii", + "remove_blacklist": ["yes", "no"], + "blacklist": "PATH TO BLACKLIST FILE (ignore if not removing blacklist)", + "genomic_bin_size": 0, + "prioritize_cis_slices": ["yes", "no"], + "priority_chromosomes": ["None", "viewpoints"], + "make_ucsc_hub": ["yes", "no"], + "ucsc_hub_directory": "PATH FOR UCSC HUB DIRECTORY", + "ucsc_hub_name": "UCSC hub name", + "ucsc_hub_email": "Email address (UCSC required)", + "ucsc_track_color_by": ["none", "samplename", "method"], + "make_plots": ["yes", "no"], + "plotting_coordinates": "PATH TO PLOTTING COORDINATES", + "plotting_normalisation": ["raw", "n_interactions", "n_rf_n_interactions", "ice", "icen_cis"], + "differential_contrast": "Column name for differential contrast in design matrix (optional)" + + } diff --git a/capcruncher/pipeline/config/{{cookiecutter.date}}_{{cookiecutter.__project_id}}_{{cookiecutter.__assay}}/capcruncher_config.yml 
b/capcruncher/pipeline/config/{{cookiecutter.date}}_{{cookiecutter.__project_id}}_{{cookiecutter.__assay}}/capcruncher_config.yml new file mode 100644 index 00000000..6852e8b1 --- /dev/null +++ b/capcruncher/pipeline/config/{{cookiecutter.date}}_{{cookiecutter.__project_id}}_{{cookiecutter.__assay}}/capcruncher_config.yml @@ -0,0 +1,230 @@ +version: "{{cookiecutter.__version}}" + +################################### +# Essential configuration options # +################################### + +analysis: + + # Method to use for data analysis, choose from capture | tri | tiled. + # (Required) + method: "{{cookiecutter.__assay}}" + + # Path to viewpoints used for analysis. + # This is a bed file containing the coordinates of the captured restricion fragments. + # The file must be in four column bed format with the name in the last column. + # (Required) + viewpoints: "{{cookiecutter.viewpoints}}" + + # Restriction enzyme name or recognition site, *not* case sensitive + # e.g. DpnII | dpnii | GATC + # (Required) + restriction_enzyme: "{{cookiecutter.restriction_enzyme}}" + + # Number of basepairs to exclude around each capture probe to remove re-ligations + # (Required) + reporter_exclusion_zone: 1000 + + # Path to design matrix describing the experimental design. + # This must have two columns: sample condition + # e.g. sample condition + # SAMPLE1 DMSO + # SAMPLE2 DMSO + # If this is not provided, pattern matching will be used to determine the experimental design. + # In this case ensure that your FASTQ file names follow the pattern: SAMPLE-NAME-WITH-CONDITION_REPLICATE_[12].fastq(.gz). + # (Optional) + design: "{{cookiecutter.design}}" + + # Genomic window size(s) (use spaces to separate bin sizes) to use for binning restriction fragment interaction counts + # into even genomic windows. Only bin sizes that are present here will be allowed + # to be used for heatmap generation. + # (Optional) Leave blank to prevent binning + {% set genomic_bin_size = cookiecutter.genomic_bin_size.split(' ') %} + bin_sizes: + {%- for bin in genomic_bin_size %} + - {{bin}} + {%- endfor %} + +genome: + + # Name of genome. UCSC genome names are prefered. Custom names are accepted if chrom_sizes are provided + # (Required) + name: "{{cookiecutter.genome}}" + + # Path to fasta file containing entire genome sequence separated by chromosome. + # (Required) + fasta: "{{cookiecutter.genome_fasta}}" + + # Path to indicies for the specified aligner (default = bowtie2) + # Note: Do not include .Number|rev.bt2 + # e.g. /databank/igenomes/Homo_sapiens/UCSC/hg19/Sequence/Bowtie2Index/genome + # (Required) + aligner_index: "{{cookiecutter.genome_indicies}}" + + # Path to chromosome sizes for genome. + # If blank will be determined automatically from the genome (must be a UCSC genome) + # This should be a two column tsv file with columns: chromosome_name size + # FAI files can also be used. + # (Optional) + chrom_sizes: "{{cookiecutter.genome_chromosome_sizes}}" + + # Determines if this is a custom genome. Only needed for UCSC hub generation. + # (Optional) + custom: {%- if cookiecutter.is_custom_genome == "no" %} False {%- else %} True {%- endif %} + + # Organism name e.g. Mus Musculus. Only needed for UCSC hub using a custom genome. + # (Optional) + organism: "{{cookiecutter.genome_organism}}" + + # Path to twobit file for genome. Only needed for UCSC hub using a custom genome. + # (Optional) + twobit: "{{cookiecutter.genome_two_bit}}" + +hub: + + # Determines if hub is created or not. 
+ # True|False + # (Required) + create: "{{cookiecutter.make_ucsc_hub}}" + + # Url/IP of server to host bigWigs + # e.g. http://userweb.molbiol.ox.ac.uk/ + # (Required for hub) + url: URL_TO_HOST_BIGWIGS + + # Location of publically accessible location on the server + # The directory will be created if it does not exist + # (Required for hub) + dir: "{{cookiecutter.ucsc_hub_directory}}" + + # Name for the hub + # (Required for hub) + name: "{{cookiecutter.ucsc_hub_name}}" + + # Short hub name + # (Optional for hub) + short: "{{cookiecutter.ucsc_hub_name}}" + + # Long hub name + # (Optional for hub) + long: "{{cookiecutter.ucsc_hub_name}}" + + # Email address + # (Required for hub) + email: "{{cookiecutter.ucsc_hub_email}}" + +################################### +# Optional configuration options # +################################### + +plot: + + # Determines if plots are created or not. + create: {%- if cookiecutter.make_plots == "no" %} False {%- else %} True {%- endif %} + + # Path to a bed file containing coordinates for regions to plot . + # Must be named and the interval name must contain the viewpoint to be plotted. + # e.g. chr1 1000 2000 VIEWPOINT-A_PLOT-NAME + # (Required for plotting) + coordinates: "{{cookiecutter.plotting_coordinates}}" + + # Normalisation method to use for the plot. Leave blank for default based on analysis method. + # Choose from: + # * raw - No normalisation + # * n_interactions - Normalised based on the number of cis interactions + # * n_rf_n_interactions - Normalised based on the number of cis interations and the number of restriction fragments per bin + # * ice - Iterative correction and eigenvector decomposition (ICE) normalisation. + # * icen - ICE normalisation followed by correction for the number of cis interactions. + # (Required for plotting) + normalisation: "{{cookiecutter.plotting_normalisation}}" + + # Plot genes using a bed12 formatted file (capcruncher utilities gtf_to_bed12 can be used to generate this). + # (Optional) + genes: PATH_TO_GENES_IN_BED12_FORMAT + + +align: + + # Aligner to use. Both bowtie and bowtie2 are supported but bowtie2 is prefered. + # (Required) + aligner: bowtie2 + + # Flag to specify index for the aligner. Leave blank if this is not present i.e. bowtie + # (Required) + index_flag: -x + + # Aligner specific options. (Take care as this will ignore the number of cores specified earlier). + # (Optional) + options: -p 6 --very-sensitive + +analysis_optional: + + # Path to a YAML file containing the filter order to be applied to slices. + # See https://github.com/sims-lab/CapCruncher/blob/master/capcruncher/data/test/ccslicefilter_test.yml + # for an example and the filter module of the documentation + # https://capcruncher.readthedocs.io/en/latest/capcruncher_module_documentation/capcruncher.tools.html#capcruncher-tools-filter-module + # for further details. + custom_filtering: PATH_TO_CUSTOM_FILTERING_CONFIG + + # Path to blacklisted regions bed file. Must be a four column named bed file. + # Can supply any regions to be removed. + # (Optional) + blacklist: "{{cookiecutter.blacklist}}" + + # Attempts to prevent cis slices from being removed during annotation + # (Optional) + prioritize_cis_slices: {%- if cookiecutter.is_custom_genome == "no" %} False {%- else %} True {%- endif %} + + # Attempts to prevent slices on specified chromosomes being removed during annotation + # Choose from: + # None - No priority + # Comma separated list e.g. 
chr1,chr2 to directly specify chromosomes + # viewpoints - Uses the supplied viewpoints to define priority chromosomes + # (Optional) + priority_chromosomes: None + + +normalisation: + + # Scaling factor for normalisation. + # Current normalisation formula: + # (REPORTER_COUNT / N_CIS_REPORTERS) * SCALE_FACTOR + # (Required) + scale_factor: 1000000 + + # Regions to use for normalising viewpoint pileups. + # To use this method, provide the path to a bed file with the inverval(s) + # to be used for normalising *each* viewpoint. All viewpoints must have at least + # one region specified. + regions: REGIONS_FOR_NORM + +trim: + + # Options passed to trim_galore + # (Optional) + options: --length 21 + +split: + + # Fastq files are split for parallel processing. Defines number of reads per fastq file (lower = more files to process) + # For Tiled-C or highly enriched assays it is advised to reduce this to 1x10^6 readpairs to speed up the analysis. + # The pipeline will cope with a high number of reads but it will run slower. + # (Optional) + n_reads: 1000000 + +compare: + # Method(s) to summarise replicate reporter counts for between sample comparisons. + # These must be provided as a comma or tab separated list of summary functions + # Note: The function name must be a numpy function + # e.g. mean,sum,median + # (Optional) + summary_methods: mean + +differential: + # Column in the design matrix to use for comparisons. + # (Optional) + contrast: {{cookiecutter.differential_contrast}} + + # Distance from the viewpoint to use for differential analysis. + # (Optional) + distance: 100000 diff --git a/capcruncher/pipeline/pipeline.py b/capcruncher/pipeline/pipeline.py deleted file mode 100644 index 581e8e05..00000000 --- a/capcruncher/pipeline/pipeline.py +++ /dev/null @@ -1,2542 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -This pipeline processes data from Capture-C/NG Capture-C/Tri-C and Tiled-C sequencing -protocols designed to identify 3D interactions in the genome from a specified viewpoint. - -It takes Illumina paired-end sequencing reads in fastq format -(gzip compression is prefered) as input and performs the following steps: - -1. Identifies all restriction fragments in the genome -2. Quality control of raw reads (fastqc, multiqc) -3. Splits fastqs into smaller files to enable fast parallel processing. -4. Removal of PCR duplicates based on exact sequence matches from fastq files -5. Trimming of reads to remove adaptor sequence (trim_galore) -6. Combining overlapping read pairs (FLASh) -7. In silico digestion of reads in fastq files -8. Alignment of fastq files with a user specified aligner (i.e. bowtie/bowtie2; BWA is not supported) -9. Analysis of alignment statistics (picard CollectAlignmentSummaryMetrics, multiqc) -10. Annotation of mapped reads with overlaps of capture probes, exclusion regions, blacklist, restriction fragments -11. Removal of non-reporter slices and indentification of reporters -12. Removal of PCR duplicates (exact coordinate matches) -13. Storage of reporters in `cooler format ` -14. Generation of bedgraphs/BigWigs. -15. Collation of run statistics and generation of a run report - - -Optional: - -* Generation of a UCSC track hub for visualisation. -* Differential interaction identification. -* Generation of subtraction bedgraphs for between condition comparisons -* Plotting of heatmaps. 
- - -@authors: asmith, dsims -""" - -from collections import defaultdict -import os -import re -import sys -import pickle -from cgatcore import pipeline as P -from cgatcore.iotools import touch_file, zap_file -import itertools -import warnings -import glob -import shutil -from cgatcore.pipeline.parameters import PARAMS -from pybedtools.bedtool import BedTool -import logging - -warnings.simplefilter("ignore", category=RuntimeWarning) - -import pandas as pd -from ruffus import ( - active_if, - add_inputs, - collate, - follows, - merge, - mkdir, - regex, - transform, - originate, -) -from capcruncher.tools.statistics import ( - collate_slice_data, - collate_read_data, - collate_cis_trans_data, - collate_histogram_data, - extract_trimming_stats, -) - -from capcruncher.utils import convert_bed_to_dataframe, is_on, is_none, is_valid_bed - - -############################## -# Set-up global parameters # -############################## - -# Override cgatcore default parameters with those specified by the config file -cgatcore_defaults_override = dict() -cgatcore_defaults_override["cluster"] = { - "queue_manager": P.PARAMS.get("pipeline_cluster_queue_manager", "slurm"), - "queue": P.PARAMS.get("cluster_queue", "batch"), -} -cgatcore_defaults_override["conda_env"] = P.PARAMS.get( - "conda_env", os.environ["CONDA_PREFIX"] -) - -# Load parameters into P.PARAMS -P.get_parameters("config.yml", user=False, defaults=cgatcore_defaults_override) - - -# Convert entries to the correct python type -for key in P.PARAMS: - if is_none(P.PARAMS[key]): - P.PARAMS[key] = None - elif is_on(P.PARAMS): - P.PARAMS[key] = True - - -###################### -# Set-up constants # -###################### - - -# Method of analysis -ANALYSIS_METHOD = P.PARAMS.get("analysis_method", "capture") - -# Defines storage format for slices. Defaults to parquet. -STORAGE_FORMAT = P.PARAMS.get("analysis_optional_storage_format", "parquet") - -# Determines if FASTQ files are compressed during processing -COMPRESS_FASTQ = P.PARAMS.get("pipeline_compression") != 0 - -# Determines the number of samples being processed -N_SAMPLES = len( - {re.match(r"(.*)_R*[12].fastq.*", fn).group(1) for fn in glob.glob("*.fastq*")} -) - -# Determines the number of viewpoints being processed -try: - N_VIEWPOINTS = BedTool(P.PARAMS["analysis_viewpoints"]).count() - HAS_HIGH_NUMBER_OF_VIEWPOINTS = ( - True - if N_VIEWPOINTS > 100 - and not P.PARAMS.get("analysis_optional_force_bigwig_generation") - else False - ) -except FileNotFoundError: - N_VIEWPOINTS = 0 - HAS_HIGH_NUMBER_OF_VIEWPOINTS = False - -# Determines if the design matrix supplied does exist -HAS_DESIGN = os.path.exists(P.PARAMS.get("analysis_design", "")) - -# Turns on FASTQ deduplication -FASTQ_DEDUPLICATE = P.PARAMS.get("deduplication_pre-dedup", False) - -# Determines if blacklist is used -HAS_BLACKLIST = is_valid_bed(P.PARAMS.get("analysis_optional_blacklist"), verbose=False) - -# Check if reporters need to be binned into even genomic regions -try: - - BINSIZES = [ - int(bs) - for bs in re.split(r"[,;]\s*|\s+", str(P.PARAMS.get("analysis_bin_sizes", "0"))) - ] - PERFORM_BINNING = True if all(bs > 0 for bs in BINSIZES) else False -except ValueError: - PERFORM_BINNING = False - - -# Check if the plotting packages are installed -try: - import coolbox - - MAKE_PLOTS = is_valid_bed(P.PARAMS.get("plot_coordinates"), verbose=False) -except ImportError as e: - logging.warning( - "Plotting capabilities not installed. 
For plotting please run: pip install capcruncher[plotting]" - ) - MAKE_PLOTS = False - -# Determines if UCSC hub is created from run. -MAKE_HUB = is_on(P.PARAMS.get("hub_create")) and not HAS_HIGH_NUMBER_OF_VIEWPOINTS -HUB_NAME = re.sub(r"[,\s+\t;:]", "_", P.PARAMS.get("hub_name", "")) - -# Warn about missing parameters -if not HAS_DESIGN: - logging.warning(f'Design matrix {P.PARAMS.get("analysis_design", "")} not found') - -if not MAKE_PLOTS: - logging.warning( - f'Plotting coordinates file {P.PARAMS.get("plot_coordinates")} is not correctly formatted. Will not perform plotting.' - ) - - -############################## -# Pipeline set-up functions # -############################## - - -def check_config(): - """ - Checks that all essential configuration has been provided. - """ - - if not os.path.exists("config.yml"): - raise OSError( - "Configuration file: config.yml. Not present in working directory" - ) - - essential_keys = [ - "analysis_method", - "analysis_viewpoints", - "analysis_restriction_enzyme", - "genome_name", - "genome_fasta", - "genome_aligner_index", - ] - - for key in essential_keys: - if not key in P.PARAMS: - raise ValueError( - f"No value provided for {key} in config.yml. Please correct this and re-run." - ) - - -def set_up_chromsizes(): - """ - Ensures that genome chromsizes are present. - - If chromsizes are not provided this function attempts to download them from UCSC. - The P.PARAMS dictionary is updated with the location of the chromsizes. - - """ - - assert P.PARAMS.get("genome_name"), "Genome name has not been provided." - - if P.PARAMS["genome_chrom_sizes"] and os.path.exists( - P.PARAMS["genome_chrom_sizes"] - ): - pass - - elif os.path.exists("chrom_sizes.txt.tmp"): - P.PARAMS["genome_chrom_sizes"] = "chrom_sizes.txt.tmp" - - else: - from pybedtools.helpers import get_chromsizes_from_ucsc - - get_chromsizes_from_ucsc(P.PARAMS["genome_name"], "chrom_sizes.txt.tmp") - P.PARAMS["genome_chrom_sizes"] = "chrom_sizes.txt.tmp" - - -def check_user_supplied_paths(): - - paths_to_check = [ - "genome_fasta", - "genome_aligner_index", - "analysis_viewpoints", - ] - - chrom_sizes = P.PARAMS["genome_chrom_sizes"] - if any(ext in chrom_sizes for ext in [".txt", ".fai", ".tsv"]): - paths_to_check.append("genome_chrom_sizes") - - for path_name in paths_to_check: - - path_supplied = P.PARAMS[path_name] - - if not os.path.exists(path_supplied): - - if path_name == "genome_aligner_index": - indicies = glob.glob(path_supplied + "*") - if not len(indicies) >= 1: - raise OSError(f"Supplied indicies at: {path_supplied} do not exist") - - else: - raise OSError( - f"Supplied path for {path_name}: {path_supplied} does not exist" - ) - - -################## -# Prepare genome # -################# - - -@follows(mkdir("capcruncher_preprocessing/restriction_enzyme_map/")) -@transform( - P.PARAMS.get("genome_fasta"), - regex(r".*/(.*).fa.*"), - r"capcruncher_preprocessing/restriction_enzyme_map/genome.digest.bed.gz", -) -def genome_digest(infile, outfile): - """ - In silco digestion of the genome to identify restriction fragment coordinates. - - Runs :ref:`capcruncher genome digest `. 
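The genome digest task above amounts to scanning each chromosome's sequence for the restriction enzyme's recognition site and writing the intervals between consecutive sites as restriction fragments in BED-like form. A rough sketch of that idea, assuming a simple recognition sequence such as DpnII's GATC and ignoring the enzyme's exact cut offset; this is not the capcruncher implementation:

import re

def digest_chromosome(chrom, sequence, recognition_site="GATC"):
    """Yield (chrom, start, end, fragment_number) between consecutive recognition sites."""
    cut_sites = [match.start() for match in re.finditer(recognition_site, sequence.upper())]
    starts = [0] + cut_sites
    ends = cut_sites + [len(sequence)]
    for number, (start, end) in enumerate(zip(starts, ends)):
        yield chrom, start, end, number

# Two GATC sites -> three restriction fragments
for fragment in digest_chromosome("chr_test", "AAAGATCCCCGATCTTT"):
    print("\t".join(map(str, fragment)))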
- - """ - tmp = outfile.replace(".gz", "") - statement_digest = " ".join( - [ - "capcruncher", - "genome", - "digest", - infile, - "-r", - P.PARAMS["analysis_restriction_enzyme"], - "-o", - tmp, - "--sort", - "-l", - outfile.replace(".bed.gz", ".stats"), - ] - ) - - statement_compress = " ".join(["pigz", "-p", "4", tmp]) - - P.run( - " && ".join([statement_digest, statement_compress]), - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_threads=1, - job_condaenv=P.PARAMS["conda_env"], - ) - - -############################# -# Fastq file pre-processing # -############################# - -########## -# Read QC# -########## - - -@follows(mkdir("capcruncher_preprocessing"), mkdir("capcruncher_preprocessing/fastqc")) -@transform( - "*.fastq*", - regex(r"(.*).fastq.*"), - r"capcruncher_preprocessing/fastqc/\1_fastqc.zip", -) -def fastq_qc(infile, outfile): - """Runs fastqc on the input files to generate fastq statistics.""" - - outdir = os.path.dirname(outfile) - statement = " ".join( - [ - "fastqc", - infile, - "-q", - "-t", - str(P.PARAMS["pipeline_n_cores"]), - "--nogroup", - "--outdir", - outdir, - ] - ) - - P.run( - statement, - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_threads=P.PARAMS["pipeline_n_cores"], - job_condaenv=P.PARAMS["conda_env"], - ) - - -@merge(fastq_qc, "capcruncher_statistics/fastqc_report.html") -def fastq_multiqc(infile, outfile): - """Collate fastqc reports into single report using multiqc""" - - basename = os.path.basename(outfile) - dirname = os.path.dirname(outfile) - - statement_cleanup = " ".join(["rm", "-f", outfile]) - statement_export_1 = " ".join(["export", "LC_ALL=en_US.UTF-8"]) - statement_export_2 = " ".join(["export", "LANG=en_US.UTF-8"]) - statement_multiqc = " ".join( - ["multiqc", "capcruncher_preprocessing/fastqc/", "-o", dirname, "-n", basename] - ) - - P.run( - " && ".join( - [ - statement_cleanup, - statement_export_1, - statement_export_2, - statement_multiqc, - ] - ), - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_threads=1, - job_memory="2G", - job_condaenv=P.PARAMS["conda_env"], - ) - - -################### -# Read processing # -################### - - -@follows(mkdir("capcruncher_preprocessing/split")) -@collate( - "*.fastq*", - regex(r"(.*)_R*[12].fastq.*"), - r"capcruncher_preprocessing/split/\1.sentinel", -) -def fastq_split(infiles, outfile): - """ - Splits the input fastq files into chunks for parallel processing - - Runs :ref:`capcruncher fastq split `. 
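The split task above simply partitions each FASTQ into chunks of split_n_reads records (four lines per record) so that downstream steps can run on the parts in parallel. A bare-bones sketch of that chunking, leaving out the compression handling and naming scheme used by the real command:

import itertools

def split_fastq(path, n_reads=1_000_000, prefix="part"):
    """Write consecutive chunks of n_reads FASTQ records to part_0.fastq, part_1.fastq, ..."""
    with open(path) as fastq:
        for part_number in itertools.count():
            chunk = list(itertools.islice(fastq, n_reads * 4))  # 4 lines per record
            if not chunk:
                break
            with open(f"{prefix}_{part_number}.fastq", "w") as out:
                out.writelines(chunk)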
- - """ - statement = [ - "capcruncher", - "fastq", - "split", - " ".join(infiles), - "-m", - "unix", - "-o", - outfile.replace(".sentinel", ""), - "-n", - str(P.PARAMS.get("split_n_reads", 1e6)), - "--gzip" if COMPRESS_FASTQ else "--no-gzip", - f"--compression_level {P.PARAMS.get('pipeline_compression')}" - if COMPRESS_FASTQ - else "", - "-p", - str(P.PARAMS["pipeline_n_cores"]), - ] - - P.run( - " ".join(statement), - job_threads=P.PARAMS["pipeline_n_cores"], - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_condaenv=P.PARAMS["conda_env"], - ) - - # Create sentinel file - touch_file(outfile) - - -@active_if(FASTQ_DEDUPLICATE) -@follows( - mkdir("capcruncher_preprocessing/deduplicated"), - mkdir("capcruncher_preprocessing/deduplicated/duplicated_ids"), - fastq_split, -) -@collate( - "capcruncher_preprocessing/split/*.fastq*", - regex(r"capcruncher_preprocessing/split/(.*)_part(\d+)_[12].fastq(.gz)?"), - r"capcruncher_preprocessing/deduplicated/duplicated_ids/\1_\2.pkl\3", - extras=[r"\1", r"\2"], -) -def fastq_duplicates_parse(infiles, outfile, sample_name, part_no): - - """ - Parses fastq files into json format for sequence deduplication. - - Runs :ref:`capcruncher fastq deduplicate parse ` - - """ - - statement = [ - "capcruncher", - "fastq", - "deduplicate", - "parse", - *[os.path.abspath(fn) for fn in infiles], - "-o", - outfile, - ] - - P.run( - " ".join(statement), - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_threads=2, - job_total_memory="6G", - job_condaenv=P.PARAMS["conda_env"], - ) - - -@collate( - fastq_duplicates_parse, - regex(r"capcruncher_preprocessing/deduplicated/duplicated_ids/(.*)_\d*.pkl(.gz)?"), - r"capcruncher_preprocessing/deduplicated/duplicated_ids/\1.pkl\2", -) -def fastq_duplicates_identify(infiles, outfile): - - """ - Identifies duplicate sequences from parsed fastq files in json format. - - Runs :ref:`capcruncher fastq deduplicate identify ` - - """ - - statement = [ - "capcruncher", - "fastq", - "deduplicate", - "identify", - " ".join(infiles), - "-o", - outfile, - ] - - P.run( - " ".join(statement), - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_threads=1, - job_total_memory="32G", - job_condaenv=P.PARAMS["conda_env"], - ) - - for fn in infiles: - zap_file(fn) - - -@follows( - fastq_duplicates_parse, - fastq_duplicates_identify, - mkdir("capcruncher_statistics/deduplication/data/"), -) -@collate( - "capcruncher_preprocessing/split/*.fastq*", - regex(r".*/(.*_part\d+)_[12].fastq(?:.gz)?"), - r"capcruncher_preprocessing/deduplicated/\1.sentinel", -) -def fastq_duplicates_remove(infiles, outfile): - - """ - Removes duplicate read fragments identified from parsed fastq files. 
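Taken together, the parse/identify/remove tasks above implement PCR duplicate removal on the raw reads: read-pair sequences are hashed, IDs whose hash has already been seen are marked as duplicates, and those IDs are dropped from the FASTQs. A single-pass toy version of the same idea (the real implementation works across fastq partitions and can hash read names to save memory):

def deduplicate_read_pairs(read_pairs):
    """Yield only the first occurrence of each (sequence_1, sequence_2) pair.

    read_pairs: iterable of (read_id, sequence_1, sequence_2) tuples.
    """
    seen = set()
    for read_id, sequence_1, sequence_2 in read_pairs:
        key = hash((sequence_1, sequence_2))
        if key not in seen:
            seen.add(key)
            yield read_id, sequence_1, sequence_2

pairs = [("read_1", "ACGT", "TTTT"), ("read_2", "ACGT", "TTTT"), ("read_3", "GGGG", "CCCC")]
assert [read_id for read_id, *_ in deduplicate_read_pairs(pairs)] == ["read_1", "read_3"]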
- """ - - sample = re.match(r".*/(.*)(_part\d+)_[12].fastq(.gz)?", infiles[0]) - sample_name = sample.group(1) - sample_part = sample.group(2) - sample_compression = sample.group(3) - output_prefix = outfile.replace(".sentinel", "") - stats_prefix = ( - f"capcruncher_statistics/deduplication/data/{sample_name}{sample_part}" - ) - - if FASTQ_DEDUPLICATE: - statement = " ".join( - [ - "capcruncher", - "fastq", - "deduplicate", - "remove", - *infiles, - "-d", - f"capcruncher_preprocessing/deduplicated/duplicated_ids/{sample_name}.pkl{'.gz' if sample_compression else ''}", - "--sample-name", - sample_name, - "--stats-prefix", - stats_prefix, - "-o", - output_prefix, - "-p", - str(P.PARAMS.get("pipeline_n_cores", "4")), - "--hash-read-name", # Reduces memory by converting the readname to a 64bit hash - "--gzip" if ".gz" in infiles[0] else "--no-gzip", - ] - ) - - else: - statement = f"""ln -s $(pwd)/{infiles[0]} {output_prefix}_1.fastq && - ln -s $(pwd)/{infiles[1]} {output_prefix}_2.fastq && - lc=$(cat {infiles[0]} | wc -l); - statsfile={stats_prefix}.deduplication.csv; - echo "stat,stat_type,read_type,read_number,stage,sample" > $statsfile; - echo -e $(($lc / 4)),reads_total,pe,0,deduplication,{sample_name} >> $statsfile; - echo -e $(($lc / 4)),reads_unique,pe,0,deduplication,{sample_name} >> $statsfile; - echo -e 0,reads_removed,pe,0,deduplication,{sample_name} >> $statsfile - """ - - P.run( - statement, - job_threads=P.PARAMS["pipeline_n_cores"], - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_memory="6G", - job_condaenv=P.PARAMS["conda_env"], - ) - - # Make sentinel file - touch_file(outfile) - - # Replace infiles with empty files - if FASTQ_DEDUPLICATE: - for fn in infiles: - zap_file(fn) - - -@follows(fastq_duplicates_remove) -@merge( - "capcruncher_statistics/deduplication/data/*.csv", - "capcruncher_statistics/deduplication/deduplication.reads.csv", -) -def stats_deduplication_collate(infiles, outfile): - - """Combines deduplication statistics from fastq file partitions.""" - - stats_prefix = outfile.replace(".reads.csv", "") - - df_stats = collate_read_data(infiles) - - df_stats_read = df_stats.query('stat_type != "reads_removed"') - - df_stats.to_csv(f"{stats_prefix}.summary.csv", index=False) - - # Modified to enable more streamlined summary at final stage - df_stats_read.to_csv(outfile, index=False) - - -@follows( - mkdir("capcruncher_preprocessing/trimmed"), - fastq_duplicates_remove, - mkdir("capcruncher_statistics/trimming/data/"), -) -@collate( - "capcruncher_preprocessing/deduplicated/*.fastq*", - regex(r"capcruncher_preprocessing/deduplicated/(.*)_[12].fastq(?:.gz)?"), - r"capcruncher_preprocessing/trimmed/\1.sentinel", -) -def fastq_trim(infiles, outfile): - - """Trim adaptor sequences from fastq files using trim_galore""" - - statement_trim = " ".join( - [ - "trim_galore", - " ".join(infiles), - "-o", - os.path.dirname(outfile), - "--paired", - "--cores", - str(P.PARAMS.get("pipeline_n_cores", 1)), - "--gzip" if COMPRESS_FASTQ else "", - P.PARAMS.get("trim_options") or " ", - ] - ) - statement_stats_1 = " ".join( - [ - "mv", - f"capcruncher_preprocessing/trimmed/{os.path.basename(infiles[0])}_trimming_report.txt", - "capcruncher_statistics/trimming/data", - ] - ) - statement_stats_2 = " ".join( - [ - "mv", - f"capcruncher_preprocessing/trimmed/{os.path.basename(infiles[1])}_trimming_report.txt", - "capcruncher_statistics/trimming/data", - ] - ) - - P.run( - " && ".join([statement_trim, statement_stats_1, statement_stats_2]), - 
job_queue=P.PARAMS["pipeline_cluster_queue"], - job_threads=P.PARAMS["pipeline_n_cores"], - job_condaenv=P.PARAMS["conda_env"], - ) - - # Make sentinel file - touch_file(outfile) - - # Zero input files to save space - for fn in infiles: - zap_file(fn) - - -@follows(fastq_trim) -@merge( - "capcruncher_statistics/trimming/data/*.txt", - r"capcruncher_statistics/trimming/trimming.summary.csv", -) -def stats_trim_collate(infiles, outfile): - - """Extracts and collates adapter trimming statistics from trim_galore output""" - - trimming_stats = [] - for fn in infiles: - stats = extract_trimming_stats(fn) - trimming_stats.append(stats) - - df_trimming_stats = ( - pd.DataFrame(trimming_stats) - .groupby(["sample", "read_number", "read_type"]) - .sum() - .reset_index() - .melt( - id_vars=["sample", "read_number", "read_type"], - var_name="stat_type", - value_name="stat", - ) - ) - - df_trimming_stats.to_csv( - "capcruncher_statistics/trimming/trimming.summary.csv", index=False - ) - - -@follows(fastq_trim, mkdir("capcruncher_preprocessing/flashed")) -@collate( - "capcruncher_preprocessing/trimmed/*.fq*", - regex(r"capcruncher_preprocessing/trimmed/(.*)_[12]_.*.fq(?:.gz)?"), - r"capcruncher_preprocessing/flashed/\1.sentinel", -) -def fastq_flash(infiles, outfile): - - """Combine overlapping paired-end reads using FLASh""" - - statement = [ - "flash", - " ".join(infiles), - "-o", - outfile.replace(".sentinel", ""), - "-t", - str(P.PARAMS.get("pipeline_n_cores", 1)), - "-z" if COMPRESS_FASTQ else "", - ] - - P.run( - " ".join(statement), - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_threads=P.PARAMS["pipeline_n_cores"], - job_condaenv=P.PARAMS["conda_env"], - ) - - touch_file(outfile) - - if not P.PARAMS.get("analysis_optional_keep_trimmed"): - for fn in infiles: - zap_file(fn) - - -@follows( - mkdir("capcruncher_preprocessing/collated"), - fastq_flash, -) -@collate( - "capcruncher_preprocessing/flashed/*.fastq*", - regex( - r"capcruncher_preprocessing/flashed/(.*)_part\d+.extendedFrags.fastq(?:.gz)?" - ), - r"capcruncher_preprocessing/collated/flashed.\1.sentinel", -) -def fastq_collate_combined(infiles, outfile): - - statement = [ - "capcruncher", - "fastq", - "split", - ",".join(infiles), - "-m", - "unix", - "-o", - outfile.replace(".sentinel", ""), - "-n", - str(P.PARAMS.get("split_n_reads", 1e6)), - "--gzip" if COMPRESS_FASTQ else "--no-gzip", - f"--compression_level {P.PARAMS.get('pipeline_compression')}" - if COMPRESS_FASTQ - else "", - "-p", - str(P.PARAMS["pipeline_n_cores"]), - ] - - P.run( - " ".join(statement), - job_threads=P.PARAMS["pipeline_n_cores"], - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_condaenv=P.PARAMS["conda_env"], - ) - - # Create sentinel file - touch_file(outfile) - - for fn in infiles: - zap_file(fn) - - -@follows( - mkdir("capcruncher_preprocessing/collated"), - fastq_flash, -) -@collate( - "capcruncher_preprocessing/flashed/*.fastq*", - regex( - r"capcruncher_preprocessing/flashed/(.*)_part\d+.notCombined_[12].fastq(?:.gz)?" 
- ), - r"capcruncher_preprocessing/collated/pe.\1.sentinel", -) -def fastq_collate_non_combined(infiles, outfile): - - df_fq_files = pd.Series(infiles).to_frame("fn") - df_fq_files["read"] = df_fq_files["fn"].str.extract(".*.notCombined_(\d)") - infiles_by_read_number = [ - df["fn"].to_list() for read_number, df in df_fq_files.groupby("read") - ] - - statement = [ - "capcruncher", - "fastq", - "split", - ",".join(infiles_by_read_number[0]), - ",".join(infiles_by_read_number[1]), - "-m", - "unix", - "-o", - outfile.replace(".sentinel", ""), - "-n", - str(P.PARAMS.get("split_n_reads", 1e6)), - "--gzip" if COMPRESS_FASTQ else "--no-gzip", - f"--compression_level {P.PARAMS.get('pipeline_compression')}" - if COMPRESS_FASTQ - else "", - "-p", - str(P.PARAMS["pipeline_n_cores"]), - ] - - P.run( - " ".join(statement), - job_threads=P.PARAMS["pipeline_n_cores"], - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_condaenv=P.PARAMS["conda_env"], - ) - - # Create sentinel file - touch_file(outfile) - - for fn in infiles: - zap_file(fn) - - -@follows(fastq_collate_combined, mkdir("capcruncher_statistics/digestion/data")) -@transform( - "capcruncher_preprocessing/collated/flashed*.fastq*", - regex(r".*/flashed.(.*)_[1].fastq(.gz)?"), - r"capcruncher_preprocessing/digested/\1.flashed.fastq\2", - extras=[r"\1"], -) -def fastq_digest_combined(infile, outfile, sample_name): - - """In silico restriction enzyme digest of combined (flashed) read pairs""" - - n_cores = P.PARAMS["pipeline_n_cores"] - 2 - n_cores = n_cores if n_cores > 1 else 1 - - statement = [ - "capcruncher", - "fastq", - "digest", - infile, - "-o", - outfile, - "-m", - "flashed", - "-r", - P.PARAMS["analysis_restriction_enzyme"], - "--minimum_slice_length", - "18", - "--stats-prefix", - f"capcruncher_statistics/digestion/data/{os.path.basename(outfile)}", - "--sample-name", - re.match(r"(.*?)_part.*", sample_name).group(1), - "-p", - str(n_cores), - ] - - P.run( - " ".join(statement), - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_threads=2 + n_cores, - job_condaenv=P.PARAMS["conda_env"], - ) - - zap_file(infile) - - -@follows(fastq_collate_non_combined, mkdir("capcruncher_statistics/digestion/data")) -@collate( - "capcruncher_preprocessing/collated/pe.*.fastq*", - regex(r".*/pe.(.*)_[12].fastq(.gz)?"), - r"capcruncher_preprocessing/digested/\1.pe.fastq\2", - extras=[r"\1"], -) -def fastq_digest_non_combined(infiles, outfile, sample_name): - - """In silico restriction enzyme digest of non-combined (non-flashed) read pairs""" - - n_cores = P.PARAMS["pipeline_n_cores"] - 2 - n_cores = n_cores if n_cores > 1 else 1 - - statement = [ - "capcruncher", - "fastq", - "digest", - " ".join(infiles), - "-o", - outfile, - "-m", - "pe", - "-r", - P.PARAMS["analysis_restriction_enzyme"], - "--minimum_slice_length", - "18", - "--stats-prefix", - f"capcruncher_statistics/digestion/data/{os.path.basename(outfile)}", - "--sample-name", - re.match(r"(.*?)_part.*", sample_name).group(1), - "-p", - str(n_cores), - ] - - P.run( - " ".join(statement), - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_threads=2 + n_cores, - job_condaenv=P.PARAMS["conda_env"], - ) - - for fn in infiles: - zap_file(fn) - - -@follows(fastq_digest_combined, fastq_digest_non_combined) -@merge( - "capcruncher_statistics/digestion/data/*", - "capcruncher_statistics/digestion/digestion.reads.csv", -) -def stats_digestion_collate(infiles, outfile): - - """Aggregates in silico digestion statistics from fastq file partitions.""" - - stats_prefix = outfile.replace(".reads.csv", 
"") - data = defaultdict(list) - - for fn in infiles: - if ".filtered.histogram" in fn: - data["hist_filt"].append(fn) - elif ".unfiltered.histogram" in fn: - data["hist_unfilt"].append(fn) - elif ".slice" in fn: - data["slice"].append(fn) - elif ".read" in fn: - data["read"].append(fn) - - # Collate histogram, read and slice statistics - df_hist_filt = collate_histogram_data(data["hist_filt"]) - df_hist_unfilt = collate_histogram_data(data["hist_unfilt"]) - df_read = collate_read_data(data["read"]) - - # Merge filtered and unfiltered histograms - df_hist = pd.concat( - [df_hist_unfilt.assign(filtered=0), df_hist_filt.assign(filtered=1)] - ).sort_values(["sample", "read_type", "n_slices"]) - - # Output histogram, slice and read statics - df_hist.to_csv(f"{stats_prefix}.histogram.csv", index=False) - df_read.to_csv(outfile, index=False) - - -@follows(fastq_digest_combined, fastq_digest_non_combined) -def fastq_preprocessing(): - pass - - -################################# -# Read alignment and processing # -################################# - - -@follows(mkdir("capcruncher_preprocessing"), fastq_preprocessing) -@transform( - [fastq_digest_combined, fastq_digest_non_combined], - regex(r"capcruncher_preprocessing/digested/(.*).fastq(.gz)?"), - r"capcruncher_preprocessing/aligned/\1.bam", -) -def fastq_alignment(infile, outfile): - - """Aligns in silico digested fastq files to the genome.""" - - statement_align = " ".join( - [ - P.PARAMS.get("align_aligner", "bowtie2"), - P.PARAMS.get("align_index_flag") or " ", - P.PARAMS["genome_aligner_index"], - P.PARAMS.get("align_options") or "", - infile, - ] - ) - - statement_samtools_view = " ".join(["samtools", "view", "-b", "-S", ">", outfile]) - statement_samtools_sort = " ".join( - [ - "samtools", - "sort", - outfile, - "-o", - f"{outfile}.sorted.bam", - "-m", - "2G", - "-@", - str(P.PARAMS["pipeline_n_cores"]), - ] - ) - statement_rename_bam = " ".join(["mv", "-f", f"{outfile}.sorted.bam", outfile]) - - P.run( - " ".join( - [ - statement_align, - "|", - statement_samtools_view, - "&&", - statement_samtools_sort, - "&&", - statement_rename_bam, - ] - ), - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_threads=P.PARAMS["pipeline_n_cores"], - job_memory="2G", - job_condaenv=P.PARAMS["conda_env"], - ) - - if not P.PARAMS.get("analysis_optional_keep_digested"): - zap_file(infile) - - -@collate( - fastq_alignment, - regex(r"capcruncher_preprocessing/aligned/(.*)_part\d+.*.bam"), - r"capcruncher_preprocessing/aligned/\1.bam", -) -def alignments_merge(infiles, outfile): - """ - Combines bam files (by flashed/non-flashed status and sample). - - This task simply provides an input for picard CollectAlignmentSummaryMetrics - and is only used to provide overall mapping statistics. Fastq partitions - are *not* combined at this stage. 
- - """ - - statement = ["samtools", "merge", outfile, " ".join(infiles)] - - P.run( - " ".join(statement), - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_threads=1, - job_condaenv=P.PARAMS["conda_env"], - ) - - -@transform(alignments_merge, regex("(.*).bam"), r"\1.bam.bai") -def alignments_index(infile, outfile): - - """Indexes all bam files""" - - statement = [ - "samtools", - "index", - infile, - ] - - P.run( - " ".join(statement), - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_memory="1G", - job_threads=1, - job_condaenv=P.PARAMS["conda_env"], - ) - - -@follows(fastq_alignment, alignments_merge) -def pre_annotation(): - pass - - -############################ -# Annotation of alignments # -############################ - - -@originate("capcruncher_analysis/annotations/exclude.bed") -def annotate_make_exclusion_bed(outfile): - - """Generates exclusion window around each capture site""" - - if not is_valid_bed(P.PARAMS["analysis_viewpoints"]): - raise ValueError("Viewpoints bed file is not a valid bed file") - - statement_bedtools_slop = " ".join( - [ - "bedtools", - "slop", - "-i", - P.PARAMS["analysis_viewpoints"], - "-g", - P.PARAMS["genome_chrom_sizes"], - "-b", - str(P.PARAMS["analysis_reporter_exclusion_zone"]), - ] - ) - statement_bedtools_subtract = " ".join( - ["bedtools", "subtract", "-a", "-", "-b", P.PARAMS["analysis_viewpoints"]] - ) - statement_sort = " ".join(["sort", "-k1,1", "-k2,2n", ">", outfile]) - - P.run( - "|".join( - [statement_bedtools_slop, statement_bedtools_subtract, statement_sort] - ), - job_threads=1, - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_condaenv=P.PARAMS["conda_env"], - without_cluster=True, - ) - - -@originate("capcruncher_analysis/annotations/viewpoints.bed") -def annotate_sort_viewpoints(outfile): - - """Sorts the capture oligos for bedtools intersect with --sorted option""" - - if not is_valid_bed(P.PARAMS["analysis_viewpoints"]): - raise ValueError("Viewpoints bed file is not a valid bed file") - - statement = [ - "sort", - "-k1,1", - "-k2,2n", - P.PARAMS["analysis_viewpoints"], - ">", - outfile, - ] - - P.run( - " ".join(statement), - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_threads=1, - job_condaenv=P.PARAMS["conda_env"], - without_cluster=True, - ) - - -@originate("capcruncher_analysis/annotations/blacklist.bed") -def annotate_sort_blacklist(outfile): - - """Sorts the capture oligos for bedtools intersect with --sorted option""" - - blacklist_file = P.PARAMS.get("analysis_optional_blacklist") - - if HAS_BLACKLIST and blacklist_file.endswith(".bed"): - statement = [ - "sort", - "-k1,1", - "-k2,2n", - blacklist_file, - ">", - outfile, - ] - elif HAS_BLACKLIST and blacklist_file.endswith(".bed.gz"): - statement = [ - "zcat", - blacklist_file, - "|", - "sort", - "-k1,1", - "-k2,2n", - ">", - outfile, - ] - - else: - statement = ["touch", outfile] # Make a blank file if no blacklist - - P.run( - " ".join(statement), - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_threads=1, - job_condaenv=P.PARAMS["conda_env"], - without_cluster=True, - ) - - -@follows( - genome_digest, - annotate_make_exclusion_bed, - annotate_sort_viewpoints, - annotate_sort_blacklist, -) -@transform( - fastq_alignment, - regex(r".*/(.*).bam"), - add_inputs( - { - "restriction_fragment": { - "name": "restriction_fragment", - "fn": "capcruncher_preprocessing/restriction_enzyme_map/genome.digest.bed.gz", - "action": "get", - "fraction": 0.51, - "dtype": "Int64", - }, - "viewpoints": { - "name": "capture", - "fn": 
"capcruncher_analysis/annotations/viewpoints.bed", - "action": "get", - "fraction": P.PARAMS.get( - "analysis_optional_minimum_viewpoint_overlap", 0.75 - ), - "dtype": "category", - }, - "exclusions": { - "name": "exclusion", - "fn": "capcruncher_analysis/annotations/exclude.bed", - "action": "get", - "fraction": 1e-9, - "dtype": "category", - }, - "exclusions_count": { - "name": "exclusion_count", - "fn": "capcruncher_analysis/annotations/exclude.bed", - "action": "count", - "fraction": 1e-9, - "dtype": "Int8", - }, - "viewpoint_count": { - "name": "capture_count", - "fn": "capcruncher_analysis/annotations/viewpoints.bed", - "action": "count", - "fraction": P.PARAMS.get( - "analysis_optional_minimum_viewpoint_overlap", 0.75 - ), - "dtype": "Int8", - }, - "blacklist": { - "name": "blacklist", - "fn": "capcruncher_analysis/annotations/blacklist.bed", - "action": "count", - "fraction": 1e-9, - "dtype": "int", - }, - } - ), - r"capcruncher_analysis/annotations/\1.annotations.parquet", -) -def annotate_alignments(infile, outfile): - - """ - Annotates mapped read slices. - - Slices are annotated with: - * capture name - * capture count - * exclusion name - * exclusion count - * blacklist count - * restriction fragment number - """ - - flags = { - "name": "-n", - "fn": "-b", - "action": "-a", - "fraction": "-f", - "dtype": "-t", - } - - priority_chroms = P.PARAMS.get("analysis_optional_priority_chromosomes") - - if not priority_chroms: - chroms = None - elif "," in priority_chroms: - chroms = priority_chroms - elif "viewpoints" in priority_chroms: - chroms = ",".join( - convert_bed_to_dataframe(P.PARAMS["analysis_viewpoints"])["chrom"] - ) - - statement_annotate = " ".join( - [ - "capcruncher", - "alignments", - "annotate", - infile[0], - *[ - f"{flags[attribute]} {value}" - for annotation_name, annotation in infile[1].items() - for attribute, value in annotation.items() - ], - "-o", - outfile, - "--invalid_bed_action", - "ignore", - "-p", - str(P.PARAMS["pipeline_n_cores"]), - f"--blacklist {infile[1]['blacklist']['fn']}" if HAS_BLACKLIST else "", - "--prioritize-cis-slices" - if P.PARAMS.get("analysis_optional_prioritize_cis_slices") - else "", - f"--priority-chroms {chroms}" if chroms else "", - ] - ) - - P.run( - statement_annotate, - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_threads=P.PARAMS["pipeline_n_cores"], - job_condaenv=P.PARAMS["conda_env"], - ) - - -@follows(fastq_preprocessing, annotate_alignments) -def post_annotation(): - """Runs the pipeline until just prior to identification of reporters""" - pass - - -########################### -# Filtering of alignments # -########################### - - -@follows( - post_annotation, - annotate_alignments, - mkdir("capcruncher_statistics/reporters/data"), -) -@transform( - fastq_alignment, - regex(r".*/(.*)_(part\d+).(flashed|pe).bam"), - add_inputs(r"capcruncher_analysis/annotations/\1_\2.\3.annotations.parquet"), - r"capcruncher_analysis/reporters/identified/\1/\1.\2.\3.sentinel", - extras=[r"\1", r"\2", r"\3"], -) -def alignments_filter(infiles, outfile, sample_name, sample_part, sample_read_type): - """Filteres slices and outputs reporter slices for each capture site""" - - bam, annotations = infiles - output_prefix = outfile.replace(".sentinel", "") - stats_prefix = f"capcruncher_statistics/reporters/data/{sample_name}_{sample_part}_{sample_read_type}" - custom_filtering = P.PARAMS.get( - "analysis_optional_custom_filtering", "NO_PATH_PROVIDED" - ) - - statement = [ - "capcruncher", - "alignments", - "filter", - 
P.PARAMS["analysis_method"], - "-b", - bam, - "-a", - annotations, - "-o", - output_prefix, - "--stats-prefix", - stats_prefix, - "--sample-name", - sample_name, - "--read-type", - sample_read_type, - "--no-cis-and-trans-stats", - f"--custom-filtering {custom_filtering}" - if os.path.exists(custom_filtering) - else "", - "--output-format", - STORAGE_FORMAT, - ] - - P.run( - " ".join(statement), - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_threads=1, - job_total_memory=P.PARAMS.get("alignments_filter_memory", "5G"), - job_condaenv=P.PARAMS["conda_env"], - ) - - if not P.PARAMS.get("analysis_optional_keep_annotations", False): - zap_file(annotations) - - # Make sentinel file - touch_file(outfile) - - -@follows(mkdir("capcruncher_analysis/reporters/deduplicated/fragments")) -@collate( - alignments_filter, - regex(r".*/(?P.*).part\d+.(flashed|pe).sentinel"), - r"capcruncher_analysis/reporters/deduplicated/fragments/\1.\2.pkl", - extras=[r"\2"], -) -def alignments_deduplicate_fragments(infiles, outfile, read_type): - - """ - Identifies duplicate fragments with the same coordinates and order. - """ - - fragments = [ - infile.replace("sentinel", f"fragments.{STORAGE_FORMAT}") for infile in infiles - ] - - statement = [ - "capcruncher", - "alignments", - "deduplicate", - "identify", - *[fn for fn in fragments if os.path.exists(fn)], - "--read-type", - read_type, - "-o", - outfile, - "--file-type", - STORAGE_FORMAT, - "-p", - str(P.PARAMS.get("pipeline_n_cores", 1)), - "--memory-limit", - str(P.PARAMS["pipeline_memory"]), - ] - - P.run( - " ".join(statement), - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_threads=P.PARAMS["pipeline_n_cores"], - job_total_memory=P.PARAMS["pipeline_memory"], - job_condaenv=P.PARAMS["conda_env"], - ) - - # Zero fragments - for fn in fragments: - try: - zap_file(fn) - except FileNotFoundError: - pass - - -@follows(alignments_deduplicate_fragments) -@collate( - alignments_filter, - regex(r".*/(?P.*).part(?P\d+)\.(?Pflashed|pe)\.sentinel"), - add_inputs(r"capcruncher_analysis/reporters/deduplicated/fragments/\1.\3.pkl"), - r"capcruncher_analysis/reporters/deduplicated/\1.\3.sentinel", - extras=[ - r"\1", - r"\3", - ], -) -def alignments_deduplicate_slices(infile, outfile, sample_name, read_type): - - """Removes reporters with duplicate coordinates and merges partitions.""" - - slices, duplicated_ids = list(zip(*infile)) - slices = [fn.replace("sentinel", f"slices.{STORAGE_FORMAT}") for fn in slices] - duplicated_ids = duplicated_ids[0] # All id paths are the same, just need one. 
- - stats_prefix = f"capcruncher_statistics/reporters/data/{sample_name}_{read_type}" - - statement = [ - "capcruncher", - "alignments", - "deduplicate", - "remove", - *[fn for fn in slices if os.path.exists(fn)], - "-d", - duplicated_ids, - "-o", - outfile.replace("sentinel", STORAGE_FORMAT), - "--stats-prefix", - stats_prefix, - "--sample-name", - sample_name, - "--read-type", - read_type, - "-p", - str(P.PARAMS.get("pipeline_n_cores", 1)), - "--memory-limit", - str(P.PARAMS["pipeline_memory"]), - ] - - P.run( - " ".join(statement), - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_threads=P.PARAMS["pipeline_n_cores"], - job_total_memory=P.PARAMS["pipeline_memory"], - job_condaenv=P.PARAMS["conda_env"], - ) - - # Zero non-deduplicated reporters - for fn in slices: - try: - zap_file(fn) - except FileNotFoundError: - pass - - # Make sentinel file - touch_file(outfile) - - -@transform( - alignments_deduplicate_slices, - regex(r".*/(.*?)\.(.*?)\.sentinel"), - r"capcruncher_statistics/reporters/data/\1_\2.reporter.stats.csv", - extras=[r"\1", r"\2"], -) -def alignments_deduplicate_slices_statistics( - infile, - outfile, - sample, - read_type, -): - - """Generates reporter statistics from de-duplicated files""" - - slices = infile.replace("sentinel", STORAGE_FORMAT) - - statement = [ - "capcruncher", - "utilities", - "cis-and-trans-stats", - slices, - "-m", - P.PARAMS["analysis_method"], - "-o", - outfile, - "--sample-name", - sample, - "--read-type", - read_type, - "--file-type", - STORAGE_FORMAT, - "-p", - str(P.PARAMS["pipeline_n_cores"]), - "--memory-limit", - str(P.PARAMS["pipeline_memory"]), - ] - - P.run( - " ".join(statement), - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_threads=P.PARAMS["pipeline_n_cores"], - job_condaenv=P.PARAMS["conda_env"], - ) - - -@follows(alignments_deduplicate_slices_statistics) -@collate( - alignments_deduplicate_slices, - regex(r".*/(?P.*)\.(?:flashed|pe)\.sentinel"), - r"capcruncher_analysis/reporters/\1.sentinel", -) -def alignments_deduplicate_collate(infiles, outfile): - - """Final collation of reporters by sample""" - - args = list() - if STORAGE_FORMAT == "hdf5": - slices = [fn.replace("sentinel", STORAGE_FORMAT) for fn in infiles] - args.extend(["-i", "viewpoint"]) - elif STORAGE_FORMAT == "parquet": - # slices = [fn.replace("sentinel", STORAGE_FORMAT) for fn in infiles] - slices = list( - itertools.chain.from_iterable( - glob.glob(f"{fn.replace('sentinel', 'parquet')}/*.parquet") - for fn in infiles - ) - ) - - statement_merge = [ - "capcruncher", - "utilities", - "merge-capcruncher-slices", - *slices, - "-o", - outfile.replace("sentinel", STORAGE_FORMAT), - *args, - "-p", - str(P.PARAMS.get("pipeline_n_cores", 1)), - ] - - P.run( - " ".join(statement_merge), - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_threads=P.PARAMS["pipeline_n_cores"], - job_condaenv=P.PARAMS["conda_env"], - ) - - for fn in slices: - zap_file(fn) - - touch_file(outfile) - - -@follows(alignments_deduplicate_collate, alignments_deduplicate_slices_statistics) -@merge( - "capcruncher_statistics/reporters/data/*", - "capcruncher_statistics/reporters/reporters.reads.csv", -) -def stats_alignment_filtering_collate(infiles, outfile): - - """'Combination of all reporter identification and filtering statistics""" - - stats_prefix = outfile.replace(".reads.csv", "") - data = defaultdict(list) - - for fn in infiles: - if ".read.stats" in fn: - data["read"].append(fn) - elif ".slice.stats" in fn: - data["slice"].append(fn) - elif ".reporter.stats" in fn: - 
data["reporter"].append(fn) - - # Slice data - collate_slice_data(data["slice"]).to_csv(f"{stats_prefix}.slices.csv", index=False) - - # Reporter data - collate_cis_trans_data(data["reporter"]).to_csv( - f"{stats_prefix}.reporters.csv", index=False - ) - - # Read data - collate_read_data(data["read"]).to_csv(outfile, index=False) - - -@follows(alignments_deduplicate_collate, stats_alignment_filtering_collate) -def post_capcruncher_analysis(): - """Reporters have been identified, deduplicated and collated by sample/capture probe""" - - # Zero bam files if not specified - if not P.PARAMS.get("analysis_optional_keep_aligned", False): - for bam in glob.glob("capcruncher_preprocessing/aligned/*.bam"): - zap_file(bam) - - -#################### -# Reporter storage # -#################### - - -@follows(mkdir("capcruncher_analysis/reporters/counts")) -@transform( - alignments_deduplicate_collate, - regex(r".*/(?P.*?)\.sentinel"), - add_inputs(genome_digest, P.PARAMS["analysis_viewpoints"]), - r"capcruncher_analysis/reporters/counts/\1_fragments.sentinel", -) -def reporters_count(infile, outfile): - - """Counts the number of interactions identified between reporter restriction fragments""" - - infile, restriction_fragment_map, viewpoints = infile - output_counts = outfile.replace("_fragments.sentinel", ".hdf5") - - statement = [ - "capcruncher", - "reporters", - "count", - infile.replace("sentinel", STORAGE_FORMAT), - "-o", - output_counts, - "-f", - restriction_fragment_map, - "-v", - viewpoints, - "--cooler-output", - "-p", - str(P.PARAMS["pipeline_n_cores"]), - ] - - P.run( - " ".join(statement), - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_threads=P.PARAMS["pipeline_n_cores"], - job_condaenv=P.PARAMS["conda_env"], - ) - - touch_file(outfile) - - -@active_if(PERFORM_BINNING) -@follows(genome_digest, mkdir("capcruncher_analysis/reporters/")) -@originate(r"capcruncher_analysis/reporters/binners.pkl") -def generate_bin_conversion_tables(outfile): - """ - Converts restriction fragments to genomic bins. - - Binning restriction fragments into genomic bins takes a substantial - amount of time and memory. To avoid repeatedly performing the same action, - bin conversion tables are calculated once for each required resolution and - then stored as a pickle file. - - """ - - from capcruncher.tools.storage import GenomicBinner - - frags = pd.read_csv( - "capcruncher_preprocessing/restriction_enzyme_map/genome.digest.bed.gz", - sep="\t", - names=["chrom", "start", "end", "name"], - ) - - binner_dict = dict() - for bs in BINSIZES: - try: - gb = GenomicBinner( - chromsizes=P.PARAMS["genome_chrom_sizes"], - fragments=frags, - binsize=int(bs), - ) - bct = ( - gb.bin_conversion_table - ) # Property is cached so need to call it to make sure it is present. - binner_dict[int(bs)] = gb - - except ValueError: - logging.warn(f"Failed to generate binning object with bin size: {bs}") - - with open(outfile, "wb") as w: - pickle.dump(binner_dict, w) - - -@active_if(PERFORM_BINNING) -@follows( - generate_bin_conversion_tables, - reporters_count, -) -@transform( - reporters_count, - regex(r".*/(?P.*?)\_fragments.sentinel"), - add_inputs(generate_bin_conversion_tables), - r"capcruncher_analysis/reporters/counts/\1_binned.sentinel", -) -def reporters_store_binned(infile, outfile): - - """ - Converts a cooler file of restriction fragments to even genomic bins. 
- """ - - clr, conversion_tables = infile - clr = clr.replace("_fragments.sentinel", ".hdf5") - sentinel_file = outfile - outfile = outfile.replace("_binned.sentinel", ".hdf5") - - scale_factor = P.PARAMS.get("normalisation_scale_factor", 1000000) - scale_factor = scale_factor if not is_none(scale_factor) else int(1e6) - - # Store counts into bins - statement = [ - "capcruncher", - "reporters", - "store", - "bins", - clr, - *[ - f"-b {bin_size}" - for bin_size in BINSIZES - ], - "--conversion_tables", - conversion_tables, - "--normalise", - "--scale_factor", - str(scale_factor), - "-p", - str(P.PARAMS["pipeline_n_cores"]), - "-o", - f"{outfile}.binned.tmp", - ] - - P.run( - " ".join(statement), - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_threads=P.PARAMS["pipeline_n_cores"], - job_condaenv=P.PARAMS["conda_env"], - ) - - # Merge fragment counts and binned counts into the same file - statement = [ - "capcruncher", - "reporters", - "store", - "merge", - clr, - f"{outfile}.binned.tmp", - "-o", - f"{outfile}.tmp", - "&&", - "mv", - f"{outfile}.tmp", - outfile, - "&&", - "rm", - f"{outfile}.binned.tmp", - ] - - P.run( - " ".join(statement), - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_threads=P.PARAMS["pipeline_n_cores"], - job_condaenv=P.PARAMS["conda_env"], - ) - - # Link bin tables to conserve space - from capcruncher.tools.storage import link_common_cooler_tables - - link_common_cooler_tables(outfile) - - # Make sentinel file - touch_file(sentinel_file) - - -####################### -# Pipeline statistics # -####################### - - -@merge( - [ - stats_deduplication_collate, - stats_digestion_collate, - stats_alignment_filtering_collate, - ], - "capcruncher_statistics/run_statistics.csv", -) -def pipeline_merge_stats(infiles, outfile): - - """ - Generates a summary statistics file for the pipeline run. 
- - """ - - df = pd.concat([pd.read_csv(fn) for fn in infiles]) - - df.sort_values( - ["sample", "read_type", "stat"], ascending=[True, True, False] - ).to_csv(outfile) - - -@merge( - [pipeline_merge_stats], - "capcruncher_statistics/capcruncher_statistics.html", -) -def pipeline_make_report(infile, outfile): - """Make pipeline run report""" - - statement = [ - "capcruncher", - "pipeline", - "report", - "--pipeline-statistics-path", - "capcruncher_statistics/", - "--pipeline-report-path", - outfile, - ] - - P.run( - " ".join(statement), - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_condaenv=P.PARAMS["conda_env"], - ) - - -##################### -# Reporter pileups # -##################### - - -@active_if( - (ANALYSIS_METHOD == "capture" or ANALYSIS_METHOD == "tri") - and not HAS_HIGH_NUMBER_OF_VIEWPOINTS -) -@follows( - mkdir("capcruncher_analysis/bedgraphs"), reporters_count, reporters_store_binned -) -@transform( - "capcruncher_analysis/reporters/counts/*.hdf5", - regex(r".*/(.*).hdf5"), - r"capcruncher_analysis/bedgraphs/\1.raw.sentinel", - extras=[r"\1"], -) -def reporters_make_bedgraph(infile, outfile, sample_name): - """Extract reporters in bedgraph format from stored interactions""" - - output_prefix = f"capcruncher_analysis/bedgraphs/{sample_name}.raw" - - statement = [ - "capcruncher", - "reporters", - "pileup", - infile, - "-o", - output_prefix, - "--normalisation", - "raw", - ] - - P.run( - " ".join(statement), - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_threads=1, - job_condaenv=P.PARAMS["conda_env"], - ) - - touch_file(outfile) - - -@follows(reporters_count, reporters_store_binned) -@active_if( - (ANALYSIS_METHOD == "capture" or ANALYSIS_METHOD == "tri") - and not HAS_HIGH_NUMBER_OF_VIEWPOINTS -) -@transform( - "capcruncher_analysis/reporters/counts/*.hdf5", - regex(r".*/(.*).hdf5"), - r"capcruncher_analysis/bedgraphs/\1.normalised.sentinel", - extras=[r"\1"], -) -def reporters_make_bedgraph_normalised(infile, outfile, sample_name): - """ - Extract reporters in bedgraph format from stored interactions. - - In addition to generating a bedgraph this task also normalises the counts - by the number of cis interactions identified to enable cross sample comparisons. - - """ - - output_prefix = f"capcruncher_analysis/bedgraphs/{sample_name}.normalised" - - norm_regions = P.PARAMS.get("normalisation_regions", "NOT_PROVIDED") - norm_by_region = os.path.exists(norm_regions) - - statement = [ - "capcruncher", - "reporters", - "pileup", - infile, - "-o", - output_prefix, - "--normalisation", - "n_cis" if not norm_by_region else "region", - "--normalisation-regions" if norm_by_region else " ", - norm_regions if norm_by_region else " ", - "--scale_factor", - str(P.PARAMS.get("normalisation_scale_factor", 1000000)), - ] - - P.run( - " ".join(statement), - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_threads=1, - job_condaenv=P.PARAMS["conda_env"], - ) - - touch_file(outfile) - - -@active_if(N_SAMPLES >= 2) -@active_if(ANALYSIS_METHOD == "capture" or ANALYSIS_METHOD == "tri") -@follows( - mkdir("capcruncher_compare/bedgraphs_union"), - reporters_make_bedgraph, - reporters_make_bedgraph_normalised, -) -@collate( - "capcruncher_analysis/bedgraphs/*.bedgraph", - regex(r".*/(?:.*)\.(raw|normalised|windowed)\.(.*).bedgraph"), - r"capcruncher_compare/bedgraphs_union/\2.\1.tsv", - extras=[r"\1", r"\2"], -) -def reporters_make_union_bedgraph(infiles, outfile, normalisation_type, capture_name): - - """ - Collates bedgraphs by capture probe into a single file for comparison. 
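The union step lines every sample's bedgraph up on a shared set of intervals so that the per-sample columns can be compared directly, which is what the bedtools unionbedg call below does. A pandas approximation of the same reshaping, assuming the bedgraphs were produced over identical intervals:

import functools
import pandas as pd

def union_bedgraphs(bedgraphs):
    """Combine {sample_name: DataFrame} bedgraphs into one table with a column per sample.

    Each DataFrame is expected to have the columns: chrom, start, end, score.
    """
    renamed = [df.rename(columns={"score": sample}) for sample, df in bedgraphs.items()]
    merged = functools.reduce(
        lambda left, right: pd.merge(left, right, on=["chrom", "start", "end"], how="outer"),
        renamed,
    )
    return merged.fillna(0).sort_values(["chrom", "start"])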
- - See `bedtools unionbedg `_ - for more details. - - """ - sample_names = [ - re.match(r".*/(.*)\.(.*)\.(?:.*).bedgraph(?:.gz)?", fn).group(1) - for fn in infiles - ] - - statement = [ - "bedtools", - "unionbedg", - "-i", - " ".join(infiles), - "-header", - "-names", - " ".join(sample_names), - ">", - outfile, - ] - - P.run( - " ".join(statement), - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_threads=1, - job_condaenv=P.PARAMS["conda_env"], - without_cluster=True, - ) - - -@active_if(N_SAMPLES >= 2) -@active_if(ANALYSIS_METHOD == "capture" or ANALYSIS_METHOD == "tri") -@follows( - mkdir("capcruncher_compare/bedgraphs_comparison/"), reporters_make_union_bedgraph -) -@transform( - reporters_make_union_bedgraph, - regex(r"capcruncher_compare/bedgraphs_union/(.*)\.normalised\.tsv"), - r"capcruncher_compare/bedgraphs_comparison/\1.sentinel", - extras=[r"\1"], -) -def reporters_make_comparison_bedgraph(infile, outfile, viewpoint): - - df_bdg = pd.read_csv(infile, sep="\t", nrows=10) - output_prefix = f"{os.path.dirname(outfile)}/" - - summary_methods = [ - m - for m in re.split(r"[,;\s+]", P.PARAMS.get("compare_summary_methods", "mean,")) - if m - ] - - if not HAS_DESIGN: - # Need to generate a design matrix if one does not exist - samples = df_bdg.columns[3:] - condition = ["_".join(sample.split("_")[:-1]) for sample in samples] - df_design = pd.DataFrame() - df_design["sample"] = samples - df_design["condition"] = condition - else: - df_design = pd.read_csv( - P.PARAMS["analysis_design"], sep=r"\s+|;|:\t|,", engine="python" - ) - - groups = df_design.groupby("condition") - - statement = [ - "capcruncher", - "reporters", - "compare", - "summarise", - infile, - "-o", - output_prefix, - "-f", - "bedgraph", - *[f"-m {m}" for m in summary_methods], - *[f"-n {n}" for n, df in groups], - *[f"-c {','.join([str(s) for s in df['sample']])}" for name, df in groups], - "--subtraction", - "--suffix", - f".{viewpoint}", - ] - - P.run( - " ".join(statement), - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_threads=1, - job_condaenv=P.PARAMS["conda_env"], - without_cluster=False, - ) - - touch_file(outfile) - - -@active_if(ANALYSIS_METHOD == "capture" or ANALYSIS_METHOD == "tri") -@follows( - mkdir("capcruncher_analysis/bigwigs"), - reporters_make_bedgraph, - reporters_make_bedgraph_normalised, - reporters_make_comparison_bedgraph, -) -@transform( - [ - "capcruncher_analysis/bedgraphs/*", - "capcruncher_compare/bedgraphs_comparison/*.bedgraph", - ], - regex(r".*/(.*).bedgraph"), - r"capcruncher_analysis/bigwigs/\1.bigWig", -) -def reporters_make_bigwig(infile, outfile): - """Uses UCSC tools bedGraphToBigWig to generate bigWigs for each bedgraph""" - - tmp = f"{outfile}.tmp" - statement_sort = " ".join(["sort", "-k1,1", "-k2,2n", infile, ">", tmp]) - statement_bdgtobw = " ".join( - ["bedGraphToBigWig", tmp, P.PARAMS["genome_chrom_sizes"], outfile] - ) - statement_cleanup = " ".join(["rm", tmp]) - - P.run( - " && ".join([statement_sort, statement_bdgtobw, statement_cleanup]), - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_threads=1, - job_condaenv=P.PARAMS["conda_env"], - without_cluster=True, - ) - - -####################### -# UCSC hub generation # -####################### - - -@mkdir("capcruncher_analysis/viewpoints/") -@transform( - annotate_sort_viewpoints, - regex(r".*/(.*).bed"), - r"capcruncher_analysis/viewpoints/\1.bigBed", -) -def viewpoints_to_bigbed(infile, outfile): - - statement = ["bedToBigBed", infile, P.PARAMS["genome_chrom_sizes"], outfile] - - P.run( - " 
".join(statement), - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_threads=1, - job_condaenv=P.PARAMS["conda_env"], - ) - - -@active_if(MAKE_HUB and (ANALYSIS_METHOD == "capture" or ANALYSIS_METHOD == "tri")) -@merge( - [reporters_make_bigwig, viewpoints_to_bigbed, pipeline_make_report], - os.path.join( - P.PARAMS.get("hub_dir", ""), P.PARAMS.get("hub_name", "") + ".hub.txt" - ), -) -def hub_make(infiles, outfile): - """Creates a ucsc hub from the pipeline output""" - - import trackhub - import seaborn as sns - from capcruncher.utils import categorise_tracks - - # Extract statistics - stats_report = [fn for fn in infiles if fn.endswith(".html")][0] - - # Create a dataframe with bigwig attributes and paths - bigwigs = [fn for fn in infiles if fn.endswith(".bigWig")] - df_bigwigs = ( - pd.Series(bigwigs) - .to_frame("fn") - .assign(basename=lambda df: df["fn"].apply(os.path.basename)) - ) - attributes = df_bigwigs["basename"].str.extract( - r"(?P.*?)\.(?P.*?)\.(?P.*?)\.(?P.*)" - ) - - df_bigwigs = ( - df_bigwigs.join(attributes) - .assign(track_categories=lambda df: categorise_tracks(df["method"])) - .sort_values(["samplename", "method", "viewpoint"]) - ) - - # Create a hub - hub = trackhub.Hub( - hub=HUB_NAME, - short_label=P.PARAMS.get("hub_short", HUB_NAME), - long_label=P.PARAMS.get("hub_long", HUB_NAME), - email=P.PARAMS["hub_email"], - ) - - ## Need to make an assembly hub if this is a custom genome - if P.PARAMS.get("genome_custom"): - genome = trackhub.Assembly( - genome=P.PARAMS["genome_name"], - twobit_file=P.PARAMS["genome_twobit"], - organism=P.PARAMS["genome_organism"], - defaultPos=P.PARAMS.get("hub_default_position", "chr1:1000-2000"), - ) - groups_file = trackhub.GroupsFile( - [ - trackhub.GroupDefinition( - name=HUB_NAME, priority=1, default_is_closed=False - ), - ] - ) - genome.add_groups(groups_file) - - else: - genome = trackhub.Genome(P.PARAMS["genome_name"]) - groups_file = None - - # Create genomes file - genomes_file = trackhub.GenomesFile() - - # Create trackdb - trackdb = trackhub.TrackDb() - - # Add these to the hub - hub.add_genomes_file(genomes_file) - genome.add_trackdb(trackdb) - genomes_file.add_genome(genome) - - # Extract groups for generating composite tracks - unique_samples = df_bigwigs["samplename"].unique() - unique_viewpoints = df_bigwigs["viewpoint"].unique() - unique_comparison_methods = df_bigwigs["method"].unique() - - subgroup_vp = trackhub.SubGroupDefinition( - name="viewpoint", - label="Viewpoint", - mapping={n.lower(): n for n in unique_viewpoints}, - ) - subgroup_sample = trackhub.SubGroupDefinition( - name="samplename", - label="Sample_Name", - mapping={n.lower(): n for n in unique_samples}, - ) - subgroup_method = trackhub.SubGroupDefinition( - name="summary_method", - label="Summary_Method", - mapping={n.split("-")[0]: n.split("-")[0] for n in unique_comparison_methods}, - ) - - # Generate a color mapping based on sample names - colors = sns.color_palette("hls", len(unique_samples)) - color_mapping = dict(zip(unique_samples, colors)) - - ##################### - # Add tracks to hub # - ##################### - - for category_name, df in df_bigwigs.groupby("track_categories"): - - composite = trackhub.CompositeTrack( - name=category_name, - short_label=category_name, - dimensions="dimX=samplename dimY=viewpoint dimA=summary_method", - sortOrder="samplename=+ viewpoint=+ summary_method=+", - tracktype="bigWig", - visibility="hide", - dragAndDrop="subTracks", - allButtonPair="off", - ) - - # Only add a group if this is an assembly hub - if 
groups_file: - composite.add_params(group=HUB_NAME) - - composite.add_subgroups([subgroup_vp, subgroup_sample, subgroup_method]) - # composite.add_params(html=os.path.basename(stats_report)) - - for bw in df.itertuples(): - t = trackhub.Track( - name=f'{bw.samplename}_{bw.viewpoint}_{bw.method.replace("-summary", "")}', - source=bw.fn, - autoScale="off", - tracktype="bigWig", - windowingFunction="maximum", - subgroups={ - "viewpoint": bw.viewpoint.lower(), - "samplename": bw.samplename.lower(), - "summary_method": bw.method.split("-")[0], - }, - color=",".join( - [str(int(x * 255)) for x in color_mapping[bw.samplename]] - ), - ) - - # Only add a group if this is an assembly hub - if groups_file: - t.add_params(group=HUB_NAME) - - composite.add_subtrack(t) - - trackdb.add_tracks(composite) - - # Add viewpoints to hub - for bb in [fn for fn in infiles if fn.endswith(".bigBed")]: - - t = trackhub.Track( - name=os.path.basename(bb).replace(".bigBed", "").capitalize(), - source=bb, - tracktype="bigBed", - ) - - if genomes_file: - t.add_params(group=HUB_NAME) - - trackdb.add_tracks(t) - - ############# - # Stage hub # - ############# - - staging_tmp_dir = "hub_tmp_dir" - - # Stage the hub - trackhub.upload.stage_hub(hub=hub, staging=staging_tmp_dir) - - # Edit the hub.txt file to include the stats report as descriptionUrl - with open(os.path.join(staging_tmp_dir, f"{HUB_NAME}.hub.txt"), "a") as hubtxt: - hubtxt.write("\n") - hubtxt.write( - f'descriptionUrl {P.PARAMS["genome_name"]}/{os.path.basename(stats_report)}\n' - ) - - # Copy to the new location - shutil.copytree( - "hub_tmp_dir", - P.PARAMS["hub_dir"], - dirs_exist_ok=True, - symlinks=P.PARAMS.get("hub_symlink", False), - ) - - # Delete the staged hub - shutil.rmtree("hub_tmp_dir") - - # Copy the stats report to the correct location - shutil.copy( - stats_report, - os.path.join( - P.PARAMS["hub_dir"], P.PARAMS["genome_name"], os.path.basename(stats_report) - ), - ) - - -###################################### -# Identify differential interactions # -###################################### - - -@active_if(False) -@active_if(N_SAMPLES >= 4) -@follows(mkdir("capcruncher_compare/differential")) -@transform( - reporters_make_union_bedgraph, - regex(r".*/(.*)\.raw\.tsv"), - r"capcruncher_compare/differential/\1.sentinel", - extras=[r"\1"], -) -def identify_differential_interactions(infile, outfile, capture_name): - - if len(pd.read_csv(infile, sep="\t", nrows=5).columns) >= 4: - - output_prefix = outfile.replace(".sentinel", "") - - statement = [ - "capcruncher", - "reporters", - "differential", - infile, - "-n", - capture_name, - "-c", - P.PARAMS["analysis_viewpoints"], - "-o", - output_prefix, - ] - - P.run( - " ".join(statement), - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_threads=1, - job_condaenv=P.PARAMS["conda_env"], - ) - - else: - logging.warn("Not enough replicates for differential testing") - - touch_file(outfile) - - -################## -# Plot reporters # -################## - - -@follows(reporters_store_binned, mkdir("capcruncher_plots/templates")) -@active_if(ANALYSIS_METHOD in ["tri", "tiled"] and MAKE_PLOTS and PERFORM_BINNING) -@merge( - "capcruncher_analysis/reporters/counts/*.hdf5", - r"capcruncher_plots/templates/heatmaps.sentinel", -) -def plot_heatmaps_make_templates(infiles, outfile): - - # Need to make a template for each viewpoint - df_viewpoints = BedTool(P.PARAMS["analysis_viewpoints"]).to_dataframe() - - genes = P.PARAMS.get("plot_genes") - has_genes_to_plot = os.path.exists(genes) - - statements = list() - 
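In the hub task above, every sample is given its own track colour; trackhub expects colours as a comma-separated 0-255 RGB string, so the 0-1 floats returned by seaborn's palette are rescaled before being attached to each track. A small sketch of that mapping (a hypothetical helper, with seaborn assumed to be installed as in the hub task):

import seaborn as sns

def sample_colour_map(sample_names):
    """Map each sample name to a UCSC-style 'R,G,B' colour string."""
    palette = sns.color_palette("hls", len(sample_names))
    return {
        sample: ",".join(str(int(channel * 255)) for channel in rgb)
        for sample, rgb in zip(sample_names, palette)
    }

# Exact values depend on the palette, but each sample gets a distinct 'R,G,B' string
colours = sample_colour_map(["sample_a", "sample_b"])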
for viewpoint in df_viewpoints["name"].unique(): - for bin_size in BINSIZES: - statements.append( - " ".join( - [ - "capcruncher", - "plot", - "make-template", - *infiles, - genes if has_genes_to_plot else "", - "-v", - viewpoint, - "-b", - str(bin_size), - "-o", - outfile.replace("heatmaps.sentinel", f"{viewpoint}_resolution_{bin_size}.heatmap.yml"), - ] - ) - ) - - P.run( - statements, - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_condaenv=P.PARAMS["conda_env"], - job_threads=1, - without_cluster=True, - ) - - touch_file(outfile) - - -@follows(reporters_make_bigwig, mkdir("capcruncher_plots/templates")) -@active_if(ANALYSIS_METHOD in ["capture", "tri"] and MAKE_PLOTS) -@collate( - "capcruncher_analysis/bigwigs/*.bigWig", - regex(r".*/.*?\.normalised\.(.*?)\.bigWig"), - r"capcruncher_plots/templates/\1.pileup.yml", -) -def plot_pileups_make_templates(infiles, outfile): - - genes = P.PARAMS.get("plot_genes") - has_genes_to_plot = os.path.exists(genes) - design_matrix = P.PARAMS.get("analysis_design") - - statements = list() - statements.append( - " ".join( - [ - "capcruncher", - "plot", - "make-template", - *infiles, - f"--design_matrix {design_matrix}" if HAS_DESIGN else "", - genes if has_genes_to_plot else "", - "--output_prefix", - outfile.replace(".yml", ""), - ] - ) - ) - - P.run( - statements, - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_condaenv=P.PARAMS["conda_env"], - job_threads=1, - without_cluster=True, - ) - - touch_file(outfile) - - -@follows(plot_heatmaps_make_templates, plot_pileups_make_templates) -@active_if(MAKE_PLOTS) -@transform( - "capcruncher_plots/templates/*.yml", - regex(r".*/(.*)\.(.*).yml"), - r"capcruncher_plots/templates/\1.sentinel", - extras=[r"\1"], -) -def make_plots(infile, outfile, viewpoint): - - try: - - regions_to_plot = BedTool(P.PARAMS.get("plot_coordinates")) - statements = [] - - for region in regions_to_plot: - if viewpoint in region: - - coordinates = f"{region.chrom}:{region.start}-{region.end}" - - statements.append( - " ".join( - [ - "capcruncher", - "plot", - "make-plot", - "-c", - infile, - "-r", - coordinates, - "-o", - f"capcruncher_plots/{region.name}_{coordinates}.svg", - "--x-axis", - ] - ) - ) - except Exception as e: - logging.warning(f"Exception {e} occured while plotting {region.name}") - - P.run( - statements, - job_queue=P.PARAMS["pipeline_cluster_queue"], - job_condaenv=P.PARAMS["conda_env"], - job_threads=1, - without_cluster=True, - ) - - touch_file(outfile) - - -@follows(plot_heatmaps_make_templates, plot_pileups_make_templates, make_plots) -def plotting(): - pass - - -@follows( - pipeline_make_report, - hub_make, - reporters_make_union_bedgraph, - identify_differential_interactions, - reporters_make_comparison_bedgraph, - plotting, -) -@originate( - "COMPLETE.sentinel", -) -def full(outfile): - - if os.path.exists("chrom_sizes.txt.tmp"): - os.unlink("chrom_sizes.txt.tmp") - - if os.path.exists("capcruncher_analysis/reporters/binners.pkl"): - zap_file("capcruncher_analysis/reporters/binners.pkl") - - touch_file(outfile) - - -if __name__ == "__main__": - - if ( - "-h" in sys.argv or "--help" in sys.argv - ): # If --help then just run the pipeline without setup - P.main(sys.argv) - elif not "make" in sys.argv: - P.main(sys.argv) - else: - check_config() - set_up_chromsizes() - check_user_supplied_paths() - P.main(sys.argv) diff --git a/capcruncher/pipeline/utils.py b/capcruncher/pipeline/utils.py new file mode 100644 index 00000000..4b9470b1 --- /dev/null +++ b/capcruncher/pipeline/utils.py @@ -0,0 +1,488 @@ 
+import os +import pathlib +import re +from typing import Dict, List, Union, Literal +from collections import defaultdict +import json +import itertools +import pandas as pd +import pyranges as pr + +from capcruncher import utils + +from loguru import logger +import snakemake +from snakemake.io import expand, glob_wildcards + + +def is_on(param: str) -> bool: + """ + Returns True if parameter in "on" values + On values: + - true + - t + - on + - yes + - y + - 1 + """ + values = ["true", "t", "on", "yes", "y", "1"] + if str(param).lower() in values: + return True + else: + return False + + +def is_off(param: str): + """Returns True if parameter in "off" values""" + values = ["", "None", "none", "F", "f"] + if str(param).lower() in values: + return True + else: + return False + + +def is_none(param: str) -> bool: + """Returns True if parameter is none""" + values = ["", "none"] + if str(param).lower() in values: + return True + else: + return False + + +def convert_empty_yaml_entry_to_string(param: str) -> str: + """ + Converts empty yaml entries to string + """ + if is_none(param): + return "" + else: + return param + + +def format_config_dict(config: Dict) -> Dict: + """ + Formats the config dictionary to ensure that all entries are strings. + + """ + for key, value in config.items(): + config[key] = convert_empty_yaml_entry_to_string(value) + + return config + + +def get_design_matrix(fastqs: List[Union[str, pathlib.Path]]): + df = pd.DataFrame(fastqs, columns=["fn"]) + df["filename"] = df["fn"].apply(str).str.split(".fastq").str[0] + df["sample"] = df["filename"].str.extract(r".*/(.*?)_R?[12].fastq.*") + df["condition"] = df["sample"].str.split(".fastq").str[0].str.split("_").str[-1] + + if df["condition"].isna().any(): + logger.warn( + "Failed to identify conditions from fastq files. Please format as sample_CONDITION_READ.fastq(.gz)" + ) + df["condition"].fillna("UNKNOWN") + + return df[["sample_name", "condition"]].drop_duplicates() + + +def get_bin_sizes(config): + binsizes = config["analysis"].get("bin_sizes") + + if binsizes is None: + binsizes = [] + elif isinstance(binsizes, int): + binsizes = [binsizes] + elif isinstance(binsizes, str): + binsizes = [ + int(bs) + for bs in re.split( + r"[,;]\s*|\s+", str(config["analysis"].get("bin_sizes", "0")) + ) + ] + elif isinstance(binsizes, list): + binsizes = [int(bs) for bs in binsizes] + else: + raise ValueError( + "Invalid bin size(s). Please specify as int, comma separated string or list of ints." + ) + + return binsizes + + +def get_blacklist(config): + if utils.is_valid_bed(config["analysis_optional"].get("blacklist"), verbose=False): + blacklist = config["analysis_optional"]["blacklist"] + return blacklist + + +def has_high_viewpoint_number(viewpoints: str, config: Dict): + n_viewpoints = pr.read_bed(viewpoints).df.shape[0] + if n_viewpoints > 100: + if not config["analysis_optional"].get("force_bigwig_generation", False): + return True + + +def can_perform_plotting(config): + try: + pass + except ImportError: + logger.warning( + "Plotting capabilities not installed. 
For plotting please run: pip install capcruncher[plotting]" + ) + return False + + return utils.is_valid_bed(config["plot"].get("coordinates"), verbose=False) + + +def can_perform_binning(config): + perform_binning = False + + try: + bin_sizes = config["analysis"].get("bin_sizes", None) + if isinstance(bin_sizes, int) and bin_sizes > 0: + perform_binning = True + elif isinstance(bin_sizes, list) and all([int(bs) > 0 for bs in bin_sizes]): + perform_binning = True + + return perform_binning + + except Exception as e: + logger.error(e) + return perform_binning + + +def group_files_by_regex(files: List, regex: str): + df = pd.DataFrame(files, columns=["fn"]) + extracted_substrings = df["fn"].astype(str).str.extract(regex) + df = df.join(extracted_substrings) + return ( + df.groupby(extracted_substrings.columns.to_list()) + .agg(list)["fn"] + .rename("files_grouped") + ) + + +class FastqSamples: + def __init__(self, design): + # Expected columns: sample, fq1, fq2 + self._design = design + self._design = self._design.assign( + paired=(~self._design[["fq1", "fq2"]].isna().any(axis=1)) + ) + + @classmethod + def from_files(cls, files: List[Union[pathlib.Path, str]]) -> "FastqSamples": + if not len(files) > 0: + logger.error("No fastq files found.") + raise ValueError("No fastq files found.") + + df = pd.DataFrame(files, columns=["fn"]) + + df[["sample", "read"]] = ( + df["fn"].apply(str).str.extract("(?!.*/)?(.*).*_R?([12]).fastq.gz") + ) + + df["sample"] = df["sample"].apply( + lambda p: pathlib.Path(p).name + if isinstance(p, pathlib.Path) + else os.path.basename(p) + ) + df["read"] = "fq" + df["read"] + + df = ( + df.pivot(columns="read", index=["sample"]) + .droplevel(level=0, axis=1) + .reset_index() + ) + + # Format to check for + # CONDITION-A_REPLICATE-IDENTIFIER_READNUMBER + try: + df[["condition", "replicate"]] = df["sample"].str.split("_", expand=True) + except ValueError: + logger.warning("Failed to identify conditions from fastq files.") + df["condition"] = "UNKNOWN" + + return cls(design=df) + + @property + def fastq_files(self): + return sorted([*self._design["fq1"], *self._design["fq2"]]) + + @property + def sample_names_all(self): + return self._design["sample"].to_list() + + @property + def translation(self): + fq_translation = {} + for sample in self._design.itertuples(): + for read in [1, 2]: + fq_translation[f"{sample.sample}_{read}.fastq.gz"] = os.path.realpath( + str(getattr(sample, f"fq{read}")) + ) + + return fq_translation + + @property + def design(self): + return self._design + + +def validate_blacklist(blacklist): + """Validate blacklist file.""" + + blacklist_ok = True + + if blacklist is None: + blacklist_ok = False + elif not os.path.exists(blacklist): + blacklist_ok = False + + return blacklist_ok + + +def configure_annotation_parameters(workflow: snakemake.Workflow, config: Dict) -> Dict: + """Load defaults from annotation_defaults.json and overwrite with the current files""" + + path = pathlib.Path(__file__).absolute() + defaults_file = path.parent / "workflow" / "data" / "annotation_defaults.json" + parameters = json.load(open(defaults_file)) + + # Overwrite defaults with current options + parameters["viewpoints"]["fn"] = config["analysis"]["viewpoints"] + parameters["viewpoints"]["fraction"] = config["analysis_optional"].get( + "minimum_viewpoint_overlap", parameters["viewpoints"]["fraction"] + ) + parameters["viewpoints_count"]["fn"] = config["analysis"]["viewpoints"] + parameters["viewpoints_count"]["fraction"] = config["analysis_optional"].get( + 
"minimum_viewpoint_overlap", parameters["viewpoints"]["fraction"] + ) + + # Check if blacklist is valid + if validate_blacklist(config["analysis"].get("blacklist")): + parameters["blacklist"] = config["analysis"]["blacklist"] + else: + del parameters["blacklist"] + + return parameters + + +def format_annotation_parameters(*args, **kwargs): + """Format annotation parameters for use in the shell script.""" + + parameters = configure_annotation_parameters(*args) + + flags = { + "name": "-n", + "fn": "-b", + "action": "-a", + "fraction": "-f", + "dtype": "-t", + } + + annotation_args = [] + for annotation, options in parameters.items(): + for option, value in options.items(): + if value is not None: + annotation_args.append(f"{flags[option]} {value}") + + return " ".join(annotation_args) + + +def format_priority_chromosome_list(config: Dict): + """Format priority chromosome list for use in the shell script.""" + + priority_chroms = config["analysis_optional"].get("priority_chromosomes", "") + + if not priority_chroms or priority_chroms == "None": + chromosomes = None + elif "," in priority_chroms: + chromosomes = priority_chroms + elif "viewpoints" in priority_chroms: + pr_viewpoints = pr.read_bed(config["analysis"]["viewpoints"]) + chromosomes = ",".join(pr_viewpoints.Chromosome) + + return f"--priority-chroms {chromosomes}" if chromosomes else "" + + +def get_threads_for_annotation(annotation_files_and_params): + return annotation_files_and_params.count("-b") + + +def identify_columns_based_on_condition(design: pd.DataFrame): + condition_args = [] + + for group_name, columns in design.groupby("condition").groups.items(): + condition_args.append(f"-c {','.join(str(c) for c in columns)}") + + condition_args_str = " ".join(condition_args) + + return condition_args_str + + +def validate_custom_filtering(config: Dict): + custom_filter_stages = config["analysis"].get("custom_filtering", "") + if not custom_filter_stages: + cf = "" + elif not os.path.exists(custom_filter_stages): + cf = "" + else: + cf = f"--custom-filtering {custom_filter_stages}" + + return cf + + +def get_count_files(wc, perform_binning: bool = False): + counts = [] + counts.append( + f"capcruncher_output/pileups/counts_by_restriction_fragment/{wc.sample}.hdf5" + ) + + if perform_binning: + counts.append( + f"capcruncher_output/pileups/counts_by_genomic_bin/{wc.sample}.hdf5" + ) + + return counts + + +def get_normalisation_from_config(wc, config: Dict): + regions = config["normalisation"]["regions"] + + if regions is not None or isinstance(regions, str): + if os.path.exists(regions): # noqa: E714 + return f"--normalisation region --normalisation-regions {regions}" + return "--normalisation n_cis" + + +def get_fastq_basename(wildcards, fastq_samples: FastqSamples, **kwargs): + return pathlib.Path( + fastq_samples.translation[f"{wildcards.sample}_{wildcards.read}.fastq.gz"] + ).stem.replace(".fastq", "") + + +def get_files_to_plot( + wc, + design: pd.DataFrame, + assay: Literal["capture", "tri", "tiled"], + sample_names: List[str], + summary_methods: List[str], + compare_samples: bool = False, + aggregate_samples: bool = False, +): + files = { + "bigwigs": [], + "subtractions": [], + "bigwigs_collection": [], + "heatmaps": [], + } + + if assay == "tiled": + files["heatmaps"].extend( + expand( + "capcruncher_output/results/{sample}/{sample}.hdf5", + sample=sample_names, + ) + ) + return files + + if compare_samples: + bigwigs_comparison = expand( + 
"capcruncher_output/results/comparisons/bigwigs/{comparison}.{method}-subtraction.{{viewpoint}}.bigWig", + comparison=[ + f"{a}-{b}" + for a, b in itertools.permutations(design["condition"].unique(), 2) + ], + method=summary_methods, + ) + + files["subtractions"].extend(bigwigs_comparison) + + bigwigs = expand( + "capcruncher_output/results/{sample}/bigwigs/norm/{sample}_{{viewpoint}}.bigWig", + sample=sample_names, + ) + + # if AGGREGATE_SAMPLES: + # files["bigwigs_collection"].extend(bigwigs) + # else: + # files["bigwigs"].extend(bigwigs) + + files["bigwigs"].extend(bigwigs) + + return files + + +def get_plotting_coordinates(wc, config: Dict): + plot_coords = config["plot"].get("coordinates", None) + + if plot_coords and pathlib.Path(plot_coords).exists(): + df = pd.read_table( + plot_coords, names=["chrom", "start", "end", "name"], header=None + ) + df = df.query("name.str.contains(@wc.viewpoint)").iloc[0] + + else: + df = pd.read_table( + config["analysis"]["viewpoints"], + names=["chrom", "start", "end", "name"], + header=None, + ) + + df = df.query("name == @wc.viewpoint").iloc[0] + + return f"{df.chrom}:{df.start}-{df.end}" + + +def get_pileups( + assay: Literal["capture", "tri", "tiled"], + design: pd.DataFrame, + samples_aggregate: bool, + samples_compare: bool, + sample_names: List[str], + summary_methods: List[str], + viewpoints: List[str], +) -> list[str]: + bigwigs = [] + if assay in ["capture", "tri"]: + bigwigs.extend( + expand( + "capcruncher_output/results/{sample}/bigwigs/{norm}/{sample}_{viewpoint}.bigWig", + sample=sample_names, + norm=["raw", "norm"], + viewpoint=viewpoints, + ), + ) + + if samples_aggregate: + bigwigs.extend( + expand( + "capcruncher_output/results/comparisons/bigwigs/{group}.{method}-summary.{viewpoint}.bigWig", + group=design["condition"].unique(), + method=summary_methods, + viewpoint=viewpoints, + ), + ) + + if samples_compare: + bigwigs.extend( + expand( + "capcruncher_output/results/comparisons/bigwigs/{comparison}.{method}-subtraction.{viewpoint}.bigWig", + comparison=[ + f"{a}-{b}" + for a, b in itertools.permutations( + design["condition"].unique(), 2 + ) + ], + method=summary_methods, + viewpoint=viewpoints, + ), + ) + + elif assay == "tiled": + pass + + return bigwigs diff --git a/capcruncher/pipeline/workflow/Snakefile b/capcruncher/pipeline/workflow/Snakefile new file mode 100644 index 00000000..e141f29f --- /dev/null +++ b/capcruncher/pipeline/workflow/Snakefile @@ -0,0 +1,184 @@ +import os +import sys +import pathlib +import shutil +import json + +import pandas as pd +import pyranges as pr +import snakemake.utils + +import capcruncher.pipeline.utils +import importlib.util +from typing import Literal +import itertools + + +snakemake.utils.min_version('7.19.1') + + +configfile: "capcruncher_config.yml" + + +container: "library://asmith151/capcruncher/capcruncher:latest" + + +# Pipeline set-up +capcruncher.pipeline.utils.format_config_dict(config) + +## Get fastq files +FASTQ_SAMPLES = capcruncher.pipeline.utils.FastqSamples.from_files( + list(pathlib.Path(".").glob("*.fastq*")) +) + +## Convert FASTQ files to design matrix +if os.path.exists(config["analysis"].get("design", None)): + DESIGN = pd.read_table( + config["analysis"]["design"], sep=r"\s+|,|\t", engine="python" + ) +else: + DESIGN = FASTQ_SAMPLES.design + +## Read viewpoints +VIEWPOINTS = config["analysis"]["viewpoints"] +VIEWPOINT_NAMES = pd.read_table( + VIEWPOINTS, names=["chrom", "start", "end", "name"], header=None +)["name"].to_list() + + +N_SAMPLES = 
DESIGN["sample"].nunique() +ANALYSIS_METHOD = config["analysis"].get("method", "capture") +BIN_SIZES = capcruncher.pipeline.utils.get_bin_sizes(config) +HIGH_NUMBER_OF_VIEWPOINTS = capcruncher.pipeline.utils.has_high_viewpoint_number( + VIEWPOINTS, config +) + + +# Details +SUMMARY_METHODS = [ + m + for m in re.split(r"[,;\s+]", config["compare"].get("summary_methods", "mean,")) + if m +] + + +## Optional +AGGREGATE_SAMPLES = DESIGN["sample"].nunique() > 1 +COMPARE_SAMPLES = DESIGN["condition"].nunique() > 1 +PERFORM_DIFFERENTIAL_ANALYSIS = ( + (config["differential"]["contrast"] in DESIGN.columns) + and COMPARE_SAMPLES + and (ANALYSIS_METHOD in ["capture", "tri"]) +) +PERFORM_PLOTTING = capcruncher.pipeline.utils.can_perform_plotting(config) +PERFORM_BINNING = capcruncher.pipeline.utils.can_perform_binning(config) + +## Pipeline variables +ASSAY = config["analysis"]["method"] +SAMPLE_NAMES = FASTQ_SAMPLES.sample_names_all + +# Optional + +## Check if capcruncher_tools is installed +if importlib.util.find_spec("capcruncher_tools"): + print("CapCruncherTools is installed. Enabling CapCruncherTools functions.") + CAPCRUNCHER_TOOLS = True +else: + CAPCRUNCHER_TOOLS = False + +CLEANUP = "full" if config["analysis"].get("cleanup", False) else "partial" + + +include: "rules/digest.smk" +include: "rules/fastq.smk" +include: "rules/qc.smk" +include: "rules/align.smk" +include: "rules/annotate.smk" +include: "rules/filter.smk" +include: "rules/pileup.smk" +include: "rules/compare.smk" +include: "rules/statistics.smk" +include: "rules/visualise.smk" + + +wildcard_constraints: + sample="|".join(SAMPLE_NAMES), + part=r"\d+", + viewpoint="|".join(VIEWPOINT_NAMES), + + +rule all: + input: + qc=rules.multiqc.output[0], + report="capcruncher_output/results/capcruncher_report.html", + pileups=capcruncher.pipeline.utils.get_pileups( + assay=ASSAY, + design=DESIGN, + samples_aggregate=AGGREGATE_SAMPLES, + samples_compare=COMPARE_SAMPLES, + sample_names=SAMPLE_NAMES, + summary_methods=SUMMARY_METHODS, + viewpoints=VIEWPOINT_NAMES, + ), + counts=expand( + "capcruncher_output/results/{sample}/{sample}.hdf5", + sample=SAMPLE_NAMES, + ), + hub=rules.create_ucsc_hub.output[0] + if ANALYSIS_METHOD in ["capture", "tri"] + else [], + differential=expand( + "capcruncher_output/results/differential/{viewpoint}", + viewpoint=VIEWPOINT_NAMES, + ) + if PERFORM_DIFFERENTIAL_ANALYSIS + else [], + plots=expand( + "capcruncher_output/results/figures/{viewpoint}.pdf", + viewpoint=VIEWPOINT_NAMES, + ) + if PERFORM_PLOTTING + else [], + + +onerror: + log_out = "capcruncher_error.log" + shutil.copyfile(log, log_out) + print( + f"An error occurred. Please check the log file {log_out} for more information." + ) + + +onsuccess: + log_out = "capcruncher.log" + shutil.copyfile(log, log_out) + print(f"Pipeline completed successfully. 
See {log_out} for more information.") + + if CLEANUP == "full": + shutil.rmtree("capcruncher_output/interim/") + + elif CLEANUP == "partial" and pathlib.Path("capcruncher_output/interim/").exists(): + import subprocess + + files_to_remove = [] + # Split files + for sample_name in SAMPLE_NAMES: + split_dir = f"capcruncher_output/interim/fastq/split/{sample_name}" + flashed_dir = f"capcruncher_output/interim/fastq/flashed/{sample_name}" + rebalanced_dir = ( + f"capcruncher_output/interim/fastq/rebalanced/{sample_name}" + ) + files_to_remove.extend( + pathlib.Path(split_dir).glob(f"{sample_name}_part*.fastq.gz") + ) + files_to_remove.extend( + pathlib.Path(flashed_dir).glob(f"{sample_name}_part*.fastq.gz") + ) + for combined in ["flashed", "pe"]: + files_to_remove.extend( + (pathlib.Path(rebalanced_dir) / combined).glob( + f"{sample_name}_part*.fastq.gz" + ) + ) + for f in files_to_remove: + f.unlink() diff --git a/capcruncher/pipeline/workflow/data/annotation_defaults.json b/capcruncher/pipeline/workflow/data/annotation_defaults.json new file mode 100644 index 00000000..044c2e75 --- /dev/null +++ b/capcruncher/pipeline/workflow/data/annotation_defaults.json @@ -0,0 +1,38 @@ +{ + "restriction_fragment": { + "name": "restriction_fragment", + "fn": "capcruncher_output/resources/restriction_fragments/genome.digest.bed.gz", + "action": "get", + "fraction": 0.51 + }, + "viewpoints": { + "name": "capture", + "fn": "capcruncher_analysis/annotations/viewpoints.bed", + "action": "get", + "fraction": 0.75 + }, + "exclusions": { + "name": "exclusion", + "fn": "capcruncher_analysis/annotations/exclude.bed", + "action": "get", + "fraction": 1e-9 + }, + "exclusions_count": { + "name": "exclusion_count", + "fn": "capcruncher_analysis/annotations/exclude.bed", + "action": "count", + "fraction": 1e-9 + }, + "viewpoints_count": { + "name": "capture_count", + "fn": "capcruncher_analysis/annotations/viewpoints.bed", + "action": "count", + "fraction": 0.75 + }, + "blacklist": { + "name": "blacklist", + "fn": "capcruncher_analysis/annotations/blacklist.bed", + "action": "count", + "fraction": 1e-9 + } +} diff --git a/capcruncher/pipeline/workflow/envs/environment.yml b/capcruncher/pipeline/workflow/envs/environment.yml new file mode 100644 index 00000000..cd683979 --- /dev/null +++ b/capcruncher/pipeline/workflow/envs/environment.yml @@ -0,0 +1,35 @@ +name: ngs +channels: + - bioconda + - conda-forge + - defaults +dependencies: + - python>=3.8 + - pip + - pip: + - wget + - lanceotron + - git+https://github.com/alsmith151/ucsc_hub_maker.git + - bowtie2 + - samtools>1.7 + - deeptools + - trim-galore + - fastqc + - multiqc + - trackhub + - seaborn + - click + - cookiecutter + - ucsc-bigbedtobed + - pybigwig + - picard-slim + - macs2 + - ucsc-fetchchromsizes + - ucsc-bedtobigbed + - ucsc-bedgraphtobigwig + - homer + - pybedtools + - snakemake + - subread + - star + - numpy>=1.19 diff --git a/capcruncher/pipeline/workflow/envs/profiles/profile_drmaa_singularity/config.yaml b/capcruncher/pipeline/workflow/envs/profiles/profile_drmaa_singularity/config.yaml new file mode 100644 index 00000000..ecb377e8 --- /dev/null +++ b/capcruncher/pipeline/workflow/envs/profiles/profile_drmaa_singularity/config.yaml @@ -0,0 +1,11 @@ +jobname: smk-{jobid}-{rule}-{wildcards} +drmaa: --cpus-per-task={threads} --mem-per-cpu={resources.mem_mb} --time=24:00:00 +use-singularity: true +singularity-args: -B /ceph -B /databank -B /datashare +jobs: 50 +keep-going: True +rerun-incomplete: True +printshellcmds: True +latency-wait: 30 
+show-failed-logs: True +cores: 8 diff --git a/capcruncher/pipeline/workflow/envs/profiles/profile_singularity/config.yaml b/capcruncher/pipeline/workflow/envs/profiles/profile_singularity/config.yaml new file mode 100644 index 00000000..7765784f --- /dev/null +++ b/capcruncher/pipeline/workflow/envs/profiles/profile_singularity/config.yaml @@ -0,0 +1,6 @@ +jobname: smk-{jobid}-{rule}-{wildcards} +use-singularity: true +singularity-args: -B /ceph -B /databank -B /datashare +keep-going: True +rerun-incomplete: True +printshellcmds: True diff --git a/capcruncher/pipeline/workflow/report/capcruncher_report.qmd b/capcruncher/pipeline/workflow/report/capcruncher_report.qmd new file mode 100644 index 00000000..fd88e4e9 --- /dev/null +++ b/capcruncher/pipeline/workflow/report/capcruncher_report.qmd @@ -0,0 +1,533 @@ +--- +title: "CapCruncher Run Report" +author: "CapCruncher" +date: today +format: + html: + toc: true + theme: cosmo + embed-resources: true +execute: + echo: false +--- + + +```{python} + +import os +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go +from plotly.subplots import make_subplots +import yaml +import pathlib + + +def plot_deduplication_stats(deduplication_summary_path: os.PathLike): + + df = ( + pd.read_csv(deduplication_summary_path) + .sort_values("sample") + .replace("read_pairs_unique", "Unique Reads") + .replace("read_pairs_duplicated", "Duplicated Reads") + .loc[lambda df: df["stat_type"] != "read_pairs_total"] + ) + + fig = px.bar( + data_frame=df.query('stat_type != "reads_total"'), + x="stat", + y="sample", + color="stat_type", + template="plotly_white", + category_orders={ + "sample": sorted(df["sample"].unique()), + "stat_type": ["Unique Reads", "Duplicated Reads"], + }, + color_discrete_sequence=["#F9A65A", "grey"], + ) + # fig.for_each_trace(lambda t: t.update(name=" ".join(t.name.split("_")))) + fig.update_layout(legend_title_text="") + fig.update_yaxes(title="") + fig.update_xaxes(title="Number of Reads") + fig.update_traces(marker_line_width=0) + + return fig + + +def plot_trimming_summary(trimming_summary_path: os.PathLike): + + df = pd.read_csv(trimming_summary_path) + n_samples = len(df["sample"].unique()) + + df_summary = df.query( + 'stat_type == "adapters_removed" or stat_type == "reads_total"' + ).sort_values(["sample", "read_number"]) + + subplot_specs = [[{"type": "pie"} for i in range(2)] for j in range(n_samples)] + fig = make_subplots( + rows=n_samples, + cols=2, + specs=subplot_specs, + row_titles=sorted(df_summary["sample"].str.replace("_", " ").unique()), + column_titles=["Read 1", "Read 2"], + ) + + for ii, (sample, df_sample) in enumerate(df_summary.groupby("sample")): + for jj in range(0, 2): + + df_read_number = df_sample.query(f"read_number == {jj+1}") + + fig.add_trace( + go.Pie( + labels=df_read_number["stat_type"] + .str.replace("_", " ") + .str.title(), + values=df_read_number["stat"], + name=f"{sample} {jj+1}", + domain={ + "row": 1, + }, + ), + row=ii + 1, + col=jj + 1, + ) + + return fig + + +def format_run_stats_for_flash_figure( + run_stats_path: os.PathLike, +) -> pd.DataFrame: + + df = pd.read_csv(run_stats_path) + df_summary = ( + df.loc[df["stage"].isin(["digestion"])] + .loc[lambda df: df["stat_type"] == "unfiltered"] + .assign( + read_type=lambda df: df["read_type"] + .replace("flashed", "Combined") + .replace("pe", "Not Combined") + ) + .groupby(["sample", "stage", "stat_type", "read_type"])["stat"] + .mean() + .reset_index() + .sort_values("sample") + ) + + return df_summary + + +def 
plot_flash_summary(run_stats_path: os.PathLike): + + df = format_run_stats_for_flash_figure(run_stats_path) + fig = px.bar( + df, + x="stat", + y="read_type", + color="read_type", + animation_frame="sample", + range_x=[0, df["stat"].max()], + template="plotly_white", + color_discrete_sequence=["#599AD3", "#9E66AB"], + ) + + fig.update_xaxes(title="Number of Read Pairs") + fig.update_yaxes(title="") + fig.update_layout(legend_title_text="") + fig.update_traces(width=0.5) + + try: + fig["layout"]["updatemenus"] = None + # fig["layout"]["updatemenus"][0].update(dict(y=1.2, pad={"b": 10, "t": 0, "l": 0}, x=0)) + fig["layout"]["sliders"][0]["pad"] = {"b": 10, "t": 25} + fig["layout"]["sliders"][0]["x"] = 0 + fig["layout"]["sliders"][0]["len"] = 1 + + except (KeyError, IndexError): # Might only have one sample + pass + + return fig + + +def format_digestion_stats_at_read_level( + digestion_stats_reads_path: os.PathLike, +): + + df = pd.read_csv(digestion_stats_reads_path) + + df = df.query("read_number != 2").assign( + read_type=lambda df: df["read_type"] + .replace("flashed", "Combined") + .replace("pe", "Non-Combined"), + stat_type=lambda df: df["stat_type"] + .replace("unfiltered", "All Read Pairs") + .replace("filtered", "Read Pairs With Valid Slices"), + sample=lambda df: df["sample"].str.replace("_", " "), + ) + return df.sort_values("sample") + + +def plot_digestion_read_summary(digestion_stats_reads_path): + + df = format_digestion_stats_at_read_level(digestion_stats_reads_path) + fig = px.bar( + data_frame=df, + x="stat", + y="stat_type", + color="read_type", + animation_frame="sample", + template="plotly_white", + range_x=[0, df.groupby(["sample", "stat_type"])["stat"].sum().max()], + category_orders={ + "sample": sorted(df["sample"]), + "read_type": ["Combined", "Non-Combined"], + "stat_type": ["All Read Pairs", "Read Pairs With Valid Slices"], + }, + color_discrete_sequence=["#599AD3", "#9E66AB"], + ) + fig.update_layout( + legend_title_text="", + margin={"b": 10}, + ) + fig.update_yaxes(title="") + # fig.update_xaxes(matches=None, showticklabels=True) + fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[1])) + fig.layout["xaxis"]["title"]["text"] = "Number of Slices" + fig.update_traces(marker_line_width=0) + fig.update_traces(width=0.5) + + try: + fig["layout"]["updatemenus"] = None + # fig["layout"]["updatemenus"][0].update(dict(y=1.2, pad={"b": 10, "t": 0, "l": 0}, x=0)) + fig["layout"]["sliders"][0]["pad"] = {"b": 10, "t": 25} + fig["layout"]["sliders"][0]["x"] = 0 + fig["layout"]["sliders"][0]["len"] = 1 + + except (KeyError, IndexError): # Might only have one sample + pass + + return fig + + +def plot_digestion_histogram(digestion_stats_histogram_path: os.PathLike): + + df = pd.read_csv(digestion_stats_histogram_path) + + df["filtered"] = df["filtered"].map({0: "Pre-filtering", 1: "Post-filtering"}) + df["read_type"] = df["read_type"].map( + {"flashed": "Combined Reads", "pe": "Non-Combined Reads"} + ) + + df = df.sort_values(["sample", "n_slices", "filtered", "read_type"]) + + fig = px.bar( + data_frame=df, + x="n_slices", + y="count", + color="filtered", + pattern_shape="read_type", + animation_frame="sample", + animation_group="n_slices", + barmode="group", + category_orders={"filtered": ["Pre-filtering", "Post-filtering"]}, + template="plotly_white", + range_x=[0, df["n_slices"].max()], + range_y=[0, df["count"].max()], + color_discrete_sequence=["#599AD3", "#9E66AB"], + ) + + fig.update_xaxes(title="") + fig.update_yaxes(title="") + 
fig.update_layout(legend_title_text="") + fig.update_xaxes(dtick=1) + + try: + fig["layout"]["updatemenus"] = None + # fig["layout"]["updatemenus"][0].update(dict(y=1, pad={"b": 10, "t": 0}, x=0)) + fig["layout"]["sliders"][0]["pad"] = {"r": 10, "b": 5, "t": 10} + fig["layout"]["sliders"][0]["x"] = 0 + fig["layout"]["sliders"][0]["len"] = 1 + except (KeyError, IndexError): + pass + + return fig + + +def format_alignment_filtering_read_stats(filtering_read_stats_path: os.PathLike): + df = pd.read_csv(filtering_read_stats_path) + df = ( + df.sort_values("stat", ascending=False) + .query('stat_type != "not-deduplicated"') + .replace("duplicate_filtered", "partial_duplicate_removal") + .replace("deduplicated", "full_PCR_duplicate_removal") + .assign( + stat_type=lambda df: df["stat_type"] + .str.replace("_", " ") + .str.title() + .str.replace("Pcr", "PCR"), + read_type=lambda df: df["read_type"] + .replace("flashed", "Combined") + .replace("pe", "Non-Combined"), + sample=lambda df: df["sample"].str.replace("_", " "), + ) + ) + df.loc[ + (df["stat_type"] == "Full PCR Duplicate Removal") + & (df["read_type"] == "Non-Combined"), + "stat", + ] = ( + df.loc[ + (df["stat_type"] == "Full PCR Duplicate Removal") + & (df["read_type"] == "Non-Combined"), + "stat", + ] + // 2 + ) + return df.sort_values( + ["sample", "read_type", "stat"], ascending=[True, True, False] + ) + + +def plot_alignment_filtering_read_summary(filtering_read_stats_path: os.PathLike): + + # breakpoint() + + df = format_alignment_filtering_read_stats(filtering_read_stats_path) + + fig = px.bar( + df, + x="stat", + y="stat_type", + color="read_type", + barmode="group", + animation_frame="sample", + animation_group="stat_type", + template="plotly_white", + category_orders={ + "stat_type": df["stat_type"].unique(), + "read_type": list(reversed(["Combined", "Non-Combined"])), + }, + range_x=[0, df["stat"].max()], + color_discrete_sequence=list(reversed(["#599AD3", "#9E66AB"])), + ) + + fig.update_xaxes(title="") + fig.update_yaxes(title="") + fig.update_layout(legend_title_text="", legend_traceorder="reversed") + + try: + fig["layout"]["updatemenus"] = None + # fig["layout"]["updatemenus"][0].update(dict(y=1.1, pad={"b": 5, "t": 0}, x=0)) + fig["layout"]["sliders"][0]["pad"] = {"r": 10, "b": 5, "t": 10} + fig["layout"]["sliders"][0]["x"] = 0 + fig["layout"]["sliders"][0]["len"] = 1 + except (KeyError, IndexError): + pass + + return fig + + +def plot_reporter_summary(reporter_stats_path: os.PathLike): + df = pd.read_csv(reporter_stats_path) + df = df.groupby(["sample", "viewpoint", "cis/trans"]).sum().reset_index() + df = df.replace("cis", "Cis").replace("trans", "Trans") + + fig = px.bar( + df.sort_values(["sample", "viewpoint"]), + x="viewpoint", + y="count", + color="cis/trans", + barmode="group", + animation_frame="sample", + range_y=[0, df["count"].max()], + template="plotly_white", + color_discrete_sequence=["#9CCB86", "#CF597E"], + ) + + fig.update_xaxes(title="") + fig.update_yaxes(title="") + fig.update_layout(legend_title_text="") + + try: + fig["layout"]["updatemenus"] = None + # fig["layout"]["updatemenus"][0].update( + # dict(y=1.2, pad={"l": 0, "b": 10, "t": 0}, x=0) + # ) + fig["layout"]["sliders"][0]["pad"] = {"r": 0, "b": 5, "t": 50} + fig["layout"]["sliders"][0]["x"] = 0 + fig["layout"]["sliders"][0]["len"] = 1 + except (KeyError, IndexError): + pass + + return fig + + +def format_run_stats_for_overall_summary(run_stats_path: os.PathLike): + + df = pd.read_csv(run_stats_path) + df = df.sort_values("stat", 
ascending=False) + + stat_type_mapping = { + "reads_total": "Total Reads", + "reads_unique": "PCR Duplicate Filtered (1st pass)", + "unfiltered": "Passed Trimming and Combining", + "filtered": "Passed Minimum Slice Length Filter", + "mapped": "Mapped to Reference genome", + "contains_single_capture": "Contains one Viewpoint Slice", + "contains_capture_and_reporter": "Contains one Viewpoint and at least one Reporter Slice", + "duplicate_filtered": "PCR Duplicate Filtered (2nd pass, partial)", + "deduplicated": "PCR Duplicate Filtered (final pass)", + } + + df = df.assign( + stat_type=lambda df: df["stat_type"].map(stat_type_mapping), + read_type=lambda df: df["read_type"] + .replace("flashed", "Combined") + .replace("pe", "Non-Combined"), + sample=lambda df: df["sample"].str.replace("_", " "), + ) + + df = df.sort_values("sample") + return df + + +def plot_overall_summary(run_stats_path: os.PathLike): + + df = format_run_stats_for_overall_summary(run_stats_path) + stat_type_order = ( + df.groupby(["sample", "stat_type", "read_type", "read_number"])["stat"] + .sum() + .sort_values(ascending=False) + .reset_index()["stat_type"] + .unique() + ) + + fig = px.bar( + df, + x="stat", + y="stat_type", + color="read_type", + animation_frame="sample", + animation_group="stat_type", + barmode="relative", + template="plotly_white", + category_orders={ + "sample": sorted(df["sample"].unique()), + "read_type": ["Combined", "Non-Combined"], + "stat_type": stat_type_order, + }, + color_discrete_sequence=["#599AD3", "#9E66AB"], + ) + + fig.update_xaxes(title="") + fig.update_yaxes(title="") + fig.update_layout(legend_title_text="") + fig.update_traces(marker_line_width=0) + + try: + fig["layout"]["updatemenus"] = None + # fig["layout"]["updatemenus"][0].update( + # dict(y=1.1, pad={"l": 0, "b": 5, "t": 0}, x=0) + # ) + fig["layout"]["sliders"][0]["pad"] = {"r": 0, "b": 5, "t": 10} + fig["layout"]["sliders"][0]["x"] = 0 + fig["layout"]["sliders"][0]["len"] = 1 + except (KeyError, IndexError): + pass + + return fig +``` + +```{python} +#| tags: [parameters] +fastq_deduplication_path = "" +fastq_digestion_hist_path = "" +fastq_digestion_read_path = "" +reporter_read_path = "" +reporter_cis_trans_path = "" +run_stats_path = "" +``` + +# FASTQ PCR Duplicate Removal: + +Fastq files (after partitioning) are examined for fragments (R1 + R2) that appear to be PCR duplicates. +Duplicates are identified by comparing the concatenated R1 and R2 sequences and filtering out exact matches. + +This is only the first pass of PCR duplicate removal, as single base changes will be ignored. The aim here is to remove as many duplicate fragments as possible to reduce the amount of downstream processing required. + +Approximately 5-20% of fragments are typically removed by this step. + +```{python} +plot_deduplication_stats(fastq_deduplication_path) +``` + + + + +# Read pair combination statistics (FLASh): + +After the removal of adapters, read pairs are combined (if any overlap exists) using `FLASh` to generate combined fragments (referred to as `flashed`). Non-combined read pairs that do not have a sufficient overlap (referred to as `paired-end` or `pe`) are maintained as read pairs in separate fastq files. + +```{python} +plot_flash_summary(run_stats_path) +``` + +# Fastq *in silico* digestion statistics (read pair level): + +Following read pair combination, the combined or non-combined fragments are examined for recognition sites of the restriction enzyme used for the assay.
A valid digestion of a fragment (above the minimum threshold set) results in one or more restriction fragments, referred to as `slices`. + +Flashed read pairs are treated differently from paired-end read pairs as we expect to observe the ligation junction in the flashed fragment. Therefore, if no recognition sites are identified, the fragment is marked as invalid and is discarded. Non-combined (paired-end) reads are unlikely to contain the ligation junction and therefore if no restriction sites are identified, the individual read pairs are not discarded. + +## Digestion statistics summarised at the read pair level. + +The number of valid and invalid fragments is displayed. `slices` are considered invalid for the following reasons: + +* No restriction sites are identified and the read pair is combined (`flashed`) +* The fragment is shorter than the minimum length specified (default 18 bp) + +```{python} +plot_digestion_read_summary(fastq_digestion_read_path) +``` + +## Histogram of the number of `slices` per fragment. + +This histogram displays the number of `slices` identified per fragment, split by flashed/pe status and pre/post filtering. + +```{python} +plot_digestion_histogram(fastq_digestion_hist_path) +``` + +# Alignment filtering statistics: + +After alignment to the reference genome and annotation with viewpoint probes, excluded regions and restriction fragments, aligned `slices` are filtered and all fragments that do not contain one viewpoint slice and one or more reporter slices (i.e. `slices` that are neither viewpoint slices nor within excluded regions) are removed. + +This chart shows the number of read pairs removed at each stage of the filtering, split by `flashed`/`pe` status. + +```{python} +plot_alignment_filtering_read_summary(reporter_read_path) +``` + +# Identified reporter statistics: + +`slices` from the same read fragment as a viewpoint `slice` are termed "reporters"; these are used to determine interactions with the viewpoint restriction fragment. + +This chart displays the number of `cis` (same chromosome as viewpoint) or `trans` (different chromosome to viewpoint) reporters identified, separated by viewpoint. + +```{python} +plot_reporter_summary(reporter_cis_trans_path) +``` + + +# Pipeline run statistics: + +This chart displays the combined statistics from the entire pipeline run summarised at the read pair level.
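+The same run statistics table can also be summarised numerically. The cell below is a minimal, illustrative sketch (not part of the shipped report), assuming the `sample`, `stat_type` and `stat` columns read by `format_run_stats_for_overall_summary` above, with a `reads_total` entry present for every sample:
+
+```{python}
+#| eval: false
+# Illustrative only: reads retained at each stage as a fraction of the total,
+# per sample, using the same CSV that feeds plot_overall_summary().
+import pandas as pd
+
+stats = pd.read_csv(run_stats_path)
+per_stage = stats.groupby(["sample", "stat_type"])["stat"].sum().unstack("stat_type")
+retention = per_stage.div(per_stage["reads_total"], axis=0).round(3)
+retention
+```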
+ +```{python} +plot_overall_summary(run_stats_path) +``` diff --git a/capcruncher/pipeline/make_report.py b/capcruncher/pipeline/workflow/report/make_report.py similarity index 74% rename from capcruncher/pipeline/make_report.py rename to capcruncher/pipeline/workflow/report/make_report.py index 7d09f360..9ccc2140 100644 --- a/capcruncher/pipeline/make_report.py +++ b/capcruncher/pipeline/workflow/report/make_report.py @@ -1,11 +1,12 @@ -from audioop import reverse +# ruff: noqa: F821 + import os import pandas as pd -import numpy as np import plotly.express as px import plotly.graph_objects as go from plotly.subplots import make_subplots import yaml +import pathlib def plot_deduplication_stats(deduplication_summary_path: os.PathLike): @@ -273,7 +274,11 @@ def format_alignment_filtering_read_stats(filtering_read_stats_path: os.PathLike def plot_alignment_filtering_read_summary(filtering_read_stats_path: os.PathLike): + + # breakpoint() + df = format_alignment_filtering_read_stats(filtering_read_stats_path) + fig = px.bar( df, x="stat", @@ -361,7 +366,9 @@ def format_run_stats_for_overall_summary(run_stats_path: os.PathLike): df = df.assign( stat_type=lambda df: df["stat_type"].map(stat_type_mapping), - read_type=lambda df: df["read_type"].replace("flashed", "Combined").replace("pe", "Non-Combined"), + read_type=lambda df: df["read_type"] + .replace("flashed", "Combined") + .replace("pe", "Non-Combined"), sample=lambda df: df["sample"].str.replace("_", " "), ) @@ -416,119 +423,100 @@ def plot_overall_summary(run_stats_path: os.PathLike): return fig -def generate_report( - pipeline_statistics_path: os.PathLike, - pipeline_report_path: os.PathLike = "report.html", -): - - # Get paths - fastq_deduplication_path = os.path.join( - pipeline_statistics_path, "deduplication/deduplication.summary.csv" - ) - fastq_trimming_path = os.path.join( - pipeline_statistics_path, "trimming/trimming.summary.csv" - ) - fastq_digestion_hist_path = os.path.join( - pipeline_statistics_path, "digestion/digestion.histogram.csv" - ) - fastq_digestion_read_path = os.path.join( - pipeline_statistics_path, "digestion/digestion.reads.csv" - ) - reporter_read_path = os.path.join( - pipeline_statistics_path, "reporters/reporters.reads.csv" - ) - reporter_cis_trans_path = os.path.join( - pipeline_statistics_path, "reporters/reporters.reporters.csv" - ) - run_stats_path = os.path.join(pipeline_statistics_path, "run_statistics.csv") - - # # Extract HTML template - # dir_pipeline = os.path.dirname(os.path.abspath(__file__)) - # path_html_template = os.path.join(dir_pipeline, "report_template.html") - - html_header = """ - - - - - - - -

Run statistics

-

This report provides statistics for all major pre-processing and filtering steps performed by the pipeline. - All charts are interactive so hovering over areas of interest will provide additional information.

- """ - - html_footer = """ - """ - - section_template = """ - -

SECTION_NAME

-

SECTION_DESCRIPTION

- FIGURE_HTML - """ - # - - figures = dict( - deduplication=plot_deduplication_stats( - fastq_deduplication_path - ), # Deduplication - flashed=plot_flash_summary(run_stats_path), # Flashed - digestion_reads=plot_digestion_read_summary( - fastq_digestion_read_path - ), # Digestion reads - digestion_hist=plot_digestion_histogram( - fastq_digestion_hist_path - ), # Digestion histogram - alignment_filtering=plot_alignment_filtering_read_summary( - reporter_read_path - ), # Filtering - reporters=plot_reporter_summary(reporter_cis_trans_path), # Reporters - overall=plot_overall_summary(run_stats_path), # Overall - ) - - figure_name_to_title_mapping = dict( - deduplication="FASTQ PCR Duplicate Removal", - flashed="Read pair combination statistics (FLASh)", - digestion_reads="Fastq in silico digestion statistics (read pair level)", - digestion_hist="Fastq in silico digestion statistics (slice level)", - alignment_filtering="Alignment filtering statistics", - reporters="Identified reporter statistics", - overall="Pipeline run statistics", - ) - - report_text_path = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "report_text.yml" - ) - with open(report_text_path, "r") as r: - report_text = yaml.safe_load(r) - - with open(pipeline_report_path, "w") as report: - - report.write(html_header) - - for ii, (fig_name, fig) in enumerate(figures.items()): - - fig_html = fig.to_html( - full_html=False, - include_plotlyjs=True if ii == 0 else False, - auto_play=False, - ) - fig_title = figure_name_to_title_mapping[fig_name] - fig_text = report_text[fig_title] - - report.write( - section_template.replace("SECTION_NUMBER", str(ii)) - .replace("SECTION_NAME", fig_title) - .replace("SECTION_DESCRIPTION", fig_text) - .replace("FIGURE_HTML", fig_html) - ) +# Get paths +fastq_deduplication_path = snakemake.input.fastq_deduplication +fastq_digestion_hist_path = snakemake.input.digestion_histogram +fastq_digestion_read_path = snakemake.input.digestion_read +reporter_read_path = snakemake.input.reporters +reporter_cis_trans_path = snakemake.input.cis_and_trans_stats +run_stats_path = snakemake.input.read_level_stats + +# # Extract HTML template +# dir_pipeline = os.path.dirname(os.path.abspath(__file__)) +# path_html_template = os.path.join(dir_pipeline, "report_template.html") + +html_header = """ + + + + + + + +

Run statistics

+

This report provides statistics for all major pre-processing and filtering steps performed by the pipeline. + All charts are interactive so hovering over areas of interest will provide additional information.

+""" + +html_footer = """ + """ + +section_template = """ + +

SECTION_NAME

+

SECTION_DESCRIPTION

+FIGURE_HTML +""" +# + +figures = dict( + deduplication=plot_deduplication_stats(fastq_deduplication_path), # Deduplication + flashed=plot_flash_summary(run_stats_path), # Flashed + digestion_reads=plot_digestion_read_summary( + fastq_digestion_read_path + ), # Digestion reads + digestion_hist=plot_digestion_histogram( + fastq_digestion_hist_path + ), # Digestion histogram + alignment_filtering=plot_alignment_filtering_read_summary( + reporter_read_path + ), # Filtering + reporters=plot_reporter_summary(reporter_cis_trans_path), # Reporters + overall=plot_overall_summary(run_stats_path), # Overall +) + +figure_name_to_title_mapping = dict( + deduplication="FASTQ PCR Duplicate Removal", + flashed="Read pair combination statistics (FLASh)", + digestion_reads="Fastq in silico digestion statistics (read pair level)", + digestion_hist="Fastq in silico digestion statistics (slice level)", + alignment_filtering="Alignment filtering statistics", + reporters="Identified reporter statistics", + overall="Pipeline run statistics", +) + +report_text_path = ( + pathlib.Path(__file__).parent.parent / "data" / "report_text.yml" +).resolve() + +with open(report_text_path, "r") as r: + report_text = yaml.safe_load(r) + +with open(snakemake.output[0], "w") as report: + + report.write(html_header) + + for ii, (fig_name, fig) in enumerate(figures.items()): + + fig_html = fig.to_html( + full_html=False, + include_plotlyjs=True if ii == 0 else False, + auto_play=False, + ) + fig_title = figure_name_to_title_mapping[fig_name] + fig_text = report_text[fig_title] + + report.write( + section_template.replace("SECTION_NUMBER", str(ii)) + .replace("SECTION_NAME", fig_title) + .replace("SECTION_DESCRIPTION", fig_text) + .replace("FIGURE_HTML", fig_html) + ) - report.write(html_footer) + report.write(html_footer) diff --git a/capcruncher/pipeline/report_text.yml b/capcruncher/pipeline/workflow/report/report_text.yml similarity index 98% rename from capcruncher/pipeline/report_text.yml rename to capcruncher/pipeline/workflow/report/report_text.yml index 41141f94..eaf90b31 100644 --- a/capcruncher/pipeline/report_text.yml +++ b/capcruncher/pipeline/workflow/report/report_text.yml @@ -28,4 +28,4 @@ Identified reporter statistics:

This chart displays the number of cis (same chromosome as viewpoint) or trans (different chromosome to viewpoint) reporters identified, separated by viewpoint.

Pipeline run statistics: -

This chart displays the combined statistics from the entire pipeline run summarised at the read pair level.

\ No newline at end of file +

This chart displays the combined statistics from the entire pipeline run summarised at the read pair level.

diff --git a/capcruncher/pipeline/workflow/rules/align.smk b/capcruncher/pipeline/workflow/rules/align.smk new file mode 100644 index 00000000..dbd55839 --- /dev/null +++ b/capcruncher/pipeline/workflow/rules/align.smk @@ -0,0 +1,110 @@ +import capcruncher.pipeline.utils +from typing import Literal + + +def get_rebalanced_parts( + wildcards, combined: Literal["flashed", "pe"] = None, **kwargs +): + combined = combined or wildcards.combined + import pathlib + import re + + parts = dict() + outdirs = dict( + flashed=checkpoints.rebalance_partitions_combined.get( + **{**wildcards, **kwargs} + ).output[0], + pe=checkpoints.rebalance_partitions_pe.get(**{**wildcards, **kwargs}).output[0], + ) + + for combined_type in ["flashed", "pe"]: + fq_files = pathlib.Path(outdirs[combined_type]).glob("*.fastq.gz") + parts[combined_type] = list( + sorted( + set( + [ + int(re.search(r"part(\d+)", f.name).group(1)) + for f in fq_files + if re.search(r"part(\d+)", f.name) + ] + ) + ) + ) + + if combined == "flashed": + return parts["flashed"] + else: + return parts["pe"] + + +def get_rebalanced_bam(wildcards): + bam = [] + for combined_type in ["flashed", "pe"]: + for part in get_rebalanced_parts(wildcards, combined_type): + bam.append( + f"capcruncher_output/interim/aligned/{wildcards.sample}/{wildcards.sample}_part{part}_{combined_type}.sorted.bam" + ) + + return bam + + +rule align_bowtie2: + input: + fastq="capcruncher_output/interim/fastq/digested/{sample}/{sample}_part{part}_{combined}.fastq.gz", + output: + bam=temp( + "capcruncher_output/interim/aligned/{sample}/{sample}_part{part}_{combined,(flashed|pe)}.bam" + ), + resources: + mem_mb=4000, + params: + aligner=config["align"]["aligner"], + index_flag=config["align"].get("index_flag", ""), + indices=config["genome"]["aligner_index"], + options=config["align"].get("options", ""), + threads: 4 + log: + "capcruncher_output/logs/align/{sample}_{part}_{combined}.log", + shell: + """ + {params.aligner} {params.index_flag} {params.indices} {params.options} -p {threads} {input.fastq} 2> {log} | + samtools view -bS - > {output.bam} + """ + + +rule sort_bam_partitions: + input: + bam=rules.align_bowtie2.output.bam, + output: + bam=temp( + "capcruncher_output/interim/aligned/{sample}/{sample}_part{part}_{combined}.sorted.bam" + ), + threads: 4 + log: + "capcruncher_output/logs/align/{sample}_{part}_{combined}_sort.log", + shell: + """ + samtools sort -@ {threads} -o {output.bam} {input.bam} 2> {log} + """ + + +rule merge_bam_partitions: + input: + bam=get_rebalanced_bam, + output: + bam="capcruncher_output/results/{sample}/{sample}.bam", + shell: + """ + samtools merge -o {output.bam} {input.bam} + """ + + +rule index_bam: + input: + bam="capcruncher_output/results/{sample}/{sample}.bam", + output: + bam="capcruncher_output/results/{sample}/{sample}.bam.bai", + shell: + """ + samtools index {input.bam} + """ diff --git a/capcruncher/pipeline/workflow/rules/annotate.smk b/capcruncher/pipeline/workflow/rules/annotate.smk new file mode 100644 index 00000000..db1b2156 --- /dev/null +++ b/capcruncher/pipeline/workflow/rules/annotate.smk @@ -0,0 +1,57 @@ +import json +import pyranges as pr +import capcruncher.pipeline.utils + + +rule exclusions: + input: + viewpoints=config["analysis"]["viewpoints"], + output: + exclusions="capcruncher_output/interim/annotate/exclude.bed", + params: + genome=config["genome"]["chrom_sizes"], + exclusion_zone=config["analysis"]["reporter_exclusion_zone"], + shell: + """ + bedtools slop -i {input.viewpoints} -g {params.genome} -b 
{params.exclusion_zone} | + bedtools subtract -a - -b {input.viewpoints} > {output.exclusions} + """ + + +rule annotate: + input: + bam=rules.align_bowtie2.output.bam, + exclusions="capcruncher_output/interim/annotate/exclude.bed", + viewpoints=config["analysis"]["viewpoints"], + output: + annotated=temp( + "capcruncher_output/interim/annotate/{sample}/{sample}_part{part}_{combined}.parquet" + ), + params: + annotation_files_and_params=capcruncher.pipeline.utils.format_annotation_parameters( + workflow, config + ), + priority_chromosomes=capcruncher.pipeline.utils.format_priority_chromosome_list( + config + ), + prioritize_cis_slices="--prioritize-cis-slices" + if config["analysis_optional"].get("prioritize_cis_slices", "") + else "", + threads: 6 + resources: + mem_mb=lambda wildcards, attempt: 2000 * 2**attempt, + log: + "capcruncher_output/logs/annotate/{sample}/{sample}_part{part}_{combined}.log", + shell: + """ + capcruncher \ + alignments \ + annotate \ + {input.bam} \ + -o \ + {output.annotated} \ + {params.annotation_files_and_params} \ + {params.priority_chromosomes} \ + {params.prioritize_cis_slices} \ + -p {threads} > {log} 2>&1 + """ diff --git a/capcruncher/pipeline/workflow/rules/compare.smk b/capcruncher/pipeline/workflow/rules/compare.smk new file mode 100644 index 00000000..3b82535f --- /dev/null +++ b/capcruncher/pipeline/workflow/rules/compare.smk @@ -0,0 +1,146 @@ +import itertools +import capcruncher.pipeline.utils + + +rule union_bedgraph: + input: + expand( + "capcruncher_output/interim/pileups/bedgraphs/{sample}/{{norm}}/{sample}_{{viewpoint}}.bedgraph", + sample=SAMPLE_NAMES, + ), + output: + "capcruncher_output/results/comparisons/counts_per_viewpoint/{norm}/{viewpoint}.tsv", + params: + sample_names=" ".join(SAMPLE_NAMES), + shell: + """ + bedtools \ + unionbedg \ + -i {input} \ + -header \ + -names {params.sample_names} \ + > {output} + """ + + +rule compare_interactions: + input: + "capcruncher_output/results/comparisons/counts_per_viewpoint/norm/{viewpoint}.tsv", + output: + bedgraphs_summary=temp( + expand( + "capcruncher_output/interim/comparisons/summaries_and_subtractions/{group}.{method}-summary.{{viewpoint}}.bedgraph", + group=DESIGN["condition"].unique(), + method=SUMMARY_METHODS, + ) + ), + bedgraphs_compare=temp( + expand( + "capcruncher_output/interim/comparisons/summaries_and_subtractions/{comparison}.{method}-subtraction.{{viewpoint}}.bedgraph", + comparison=[ + f"{a}-{b}" + for a, b in itertools.permutations(DESIGN["condition"].unique(), 2) + ], + method=SUMMARY_METHODS, + ) + ), + params: + output_prefix=lambda wc, output: f"{pathlib.Path(output[0]).parent}/", + summary_methods=" ".join([f"-m {m}" for m in SUMMARY_METHODS]), + names=" ".join([f"-n {group}" for group in DESIGN["condition"].unique()]), + conditions=capcruncher.pipeline.utils.identify_columns_based_on_condition( + DESIGN + ), + resources: + mem_mb=5000, + log: + "capcruncher_output/logs/compare_interactions/{viewpoint}.log", + shell: + """ + capcruncher \ + interactions \ + compare \ + summarise \ + {input} \ + -o {params.output_prefix} \ + -f bedgraph \ + {params.summary_methods} \ + {params.names} \ + {params.conditions} \ + --subtraction \ + --suffix .{wildcards.viewpoint} \ + > {log} 2>&1 + """ + + +use rule bedgraph_to_bigwig as bigwig_compared with: + input: + bedgraph="capcruncher_output/interim/comparisons/summaries_and_subtractions/{comparison}.{method}-subtraction.{viewpoint}.bedgraph", + output: + 
bigwig="capcruncher_output/results/comparisons/bigwigs/{comparison}.{method}-subtraction.{viewpoint}.bigWig", + params: + chrom_sizes=config["genome"]["chrom_sizes"], + wildcard_constraints: + comparison=f"[A-Za-z0-9_\.]+-[A-Za-z0-9_\.]+", + log: + "capcruncher_output/logs/bedgraph_to_bigwig/{comparison}.{method}-subtraction.{viewpoint}.log", + + +use rule bedgraph_to_bigwig as bigwig_summarised with: + input: + bedgraph="capcruncher_output/interim/comparisons/summaries_and_subtractions/{group}.{method}-summary.{viewpoint}.bedgraph", + output: + bigwig="capcruncher_output/results/comparisons/bigwigs/{group}.{method}-summary.{viewpoint}.bigWig", + params: + chrom_sizes=config["genome"]["chrom_sizes"], + wildcard_constraints: + comparison=f"[A-Za-z0-9_\.]+-[A-Za-z0-9_\.]+", + log: + "capcruncher_output/logs/bedgraph_to_bigwig/{group}.{method}-summary.{viewpoint}.log", + + +rule save_design: + output: + "capcruncher_output/results/design_matrix.tsv", + container: + None + run: + DESIGN.to_csv(output[0], sep="\t", index=False) + + +rule differential_interactions: + input: + counts=expand( + "capcruncher_output/results/{sample}/{sample}.hdf5", sample=SAMPLE_NAMES + ), + design_matrix="capcruncher_output/results/design_matrix.tsv", + output: + directory("capcruncher_output/results/differential/{viewpoint}"), + params: + output_prefix=lambda wc, output: output[0], + viewpoint="{viewpoint}", + contrast=config["differential"]["contrast"], + viewpoint_distance=config["differential"]["distance"], + resources: + mem_mb=5000, + log: + "capcruncher_output/logs/differential_interactions/{viewpoint}.log", + shell: + """ + capcruncher \ + interactions \ + compare \ + differential \ + {input.counts} \ + --design-matrix \ + {input.design_matrix} \ + -o {params.output_prefix} \ + -v {params.viewpoint} \ + -c {params.contrast} \ + --viewpoint-distance {params.viewpoint_distance} \ + > {log} 2>&1 || + + echo "No differential interactions found for {params.viewpoint}" + mkdir -p {output} + + """ diff --git a/capcruncher/pipeline/workflow/rules/digest.smk b/capcruncher/pipeline/workflow/rules/digest.smk new file mode 100644 index 00000000..0ae13820 --- /dev/null +++ b/capcruncher/pipeline/workflow/rules/digest.smk @@ -0,0 +1,19 @@ +rule digest_genome: + input: + fasta=config["genome"]["fasta"], + output: + bed="capcruncher_output/resources/restriction_fragments/genome.digest.bed.gz", + stats="capcruncher_output/interim/statistics/digest_genome/genome_digestion_statistics.txt", + log: + "capcruncher_output/resources/restriction_fragments/genome.digest.log", + params: + enzyme_or_site=config["analysis"]["restriction_enzyme"], + threads: 4 + resources: + mem_mb=2000, + shell: + """ + capcruncher genome digest {input.fasta} -r {params.enzyme_or_site} -o {output.bed}.tmp --sort -l {output.stats} > {log} 2>&1 && + pigz -p {threads} {output.bed}.tmp -c > {output.bed} 2> {log} + rm {output.bed}.tmp + """ diff --git a/capcruncher/pipeline/workflow/rules/fastq.smk b/capcruncher/pipeline/workflow/rules/fastq.smk new file mode 100644 index 00000000..1af3e5d3 --- /dev/null +++ b/capcruncher/pipeline/workflow/rules/fastq.smk @@ -0,0 +1,565 @@ +import os +import pathlib +import json +import re +from typing import Literal +import capcruncher.pipeline.utils + + +def get_split_1_parts(wildcards): + + import pathlib + import json + import re + + outdir = checkpoints.split.get(**wildcards).output[0] + fq_files = pathlib.Path(outdir).glob("*.fastq.gz") + parts = sorted( + set( + [ + int(re.search(r"part(\d+)", f.name).group(1)) + for 
f in fq_files + if re.search(r"part(\d+)", f.name) + ] + ) + ) + + return parts + + +def get_pickles(wc): + return expand( + "capcruncher_output/interim/fastq/deduplicated/{{sample}}/{{sample}}_{part}.pkl", + part=get_split_1_parts(wc), + ) + + +def get_fastq_split_1(wildcards): + return { + f"fq{read}": expand( + "capcruncher_output/interim/fastq/split/{{sample}}/{{sample}}_part{part}_{read}.fastq.gz", + part=get_split_1_parts(wildcards), + read=[read], + ) + for read in ["1", "2"] + } + + +def get_deduplicated_fastq_pair(wildcards): + import pathlib + + input_dir = checkpoints.deduplication.get(**wildcards).output[0] + + fq = { + f"fq{read}": f"{input_dir.rstrip('/')}/{wildcards.sample}_part{wildcards.part}_{read}.fastq.gz" + for read in ["1", "2"] + } + + if pathlib.Path(fq["fq1"]).exists() and pathlib.Path(fq["fq2"]).exists(): + return fq + else: + return {"fq1": [], "fq2": []} + + +def get_flashed_fastq(wildcards): + import pathlib + + fq = [ + f"capcruncher_output/interim/fastq/flashed/{wildcards.sample}/{wildcards.sample}_part{part}.extendedFrags.fastq.gz" + for part in get_split_1_parts(wildcards) + ] + return fq + + +def get_pe_fastq(wildcards): + fq = [ + f"capcruncher_output/interim/fastq/flashed/{wildcards.sample}/{wildcards.sample}_part{part}.notCombined_{read}.fastq.gz" + for part in get_split_1_parts(wildcards) + for read in ["1", "2"] + ] + return fq + + +def get_rebalanced_parts(wc, combined: Literal["flashed", "pe"], sample: str = None): + if not sample: + sample = wc.sample + + if combined == "flashed": + checkpoint_output = checkpoints.rebalance_partitions_combined.get( + sample=sample + ).output[0] + parts = glob_wildcards( + "capcruncher_output/interim/fastq/rebalanced/{sample}/flashed/{sample_name}_part{part}_flashed_1.fastq.gz" + ).part + elif combined == "pe": + checkpoint_output = checkpoints.rebalance_partitions_pe.get( + sample=sample + ).output[0] + parts = glob_wildcards( + "capcruncher_output/interim/fastq/rebalanced/{sample}/pe/{sample_name}_part{part}_pe_1.fastq.gz" + ).part + + else: + raise ValueError(f"Unknown combined type {combined}") + + return set(parts) + + +def get_rebalanced_fastq_combined(wc): + checkpoint_output = checkpoints.rebalance_partitions_combined.get(**wc).output[0] + return f"capcruncher_output/interim/fastq/rebalanced/{wc.sample}/flashed/{wc.sample}_part{wc.part}_flashed_1.fastq.gz" + + +def get_rebalanced_fastq_pe(wc): + checkpoint_output = checkpoints.rebalance_partitions_pe.get( + **wc, + ).output[0] + return { + "pe1": f"capcruncher_output/interim/fastq/rebalanced/{wc.sample}/pe/{wc.sample}_part{wc.part}_pe_1.fastq.gz", + "pe2": f"capcruncher_output/interim/fastq/rebalanced/{wc.sample}/pe/{wc.sample}_part{wc.part}_pe_2.fastq.gz", + } + + +def get_deduplicated_fastq(wc): + checkpoint_output = checkpoints.deduplication.get(sample=wc.sample).output[0] + return { + "fq1": f"capcruncher_output/interim/fastq/deduplicated/{wc.sample}/{wc.sample}_part{wc.part}_1.fastq.gz", + "fq2": f"capcruncher_output/interim/fastq/deduplicated/{wc.sample}/{wc.sample}_part{wc.part}_2.fastq.gz", + } + + +def separate_pe_fastq(wc): + return { + 1: expand( + "capcruncher_output/interim/fastq/flashed/{sample}/{sample}_part{part}.notCombined_{read}.fastq.gz", + sample=wc.sample, + part=get_split_1_parts(wc), + read=["1"], + ), + 2: expand( + "capcruncher_output/interim/fastq/flashed/{sample}/{sample}_part{part}.notCombined_{read}.fastq.gz", + sample=wc.sample, + part=get_split_1_parts(wc), + read=["2"], + ), + } + + +rule fastq_rename: + input: + fq1=lambda 
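# The helper functions above follow Snakemake's checkpoint pattern: calling
# checkpoints.<name>.get(...) inside an input function defers DAG evaluation until
# that checkpoint has run, which is why several helpers call .get() even when the
# returned output path is not used directly. A hedged, Snakefile-style sketch of
# the pattern (checkpoints, glob_wildcards and expand are provided by Snakemake at
# parse time; the rule and wildcard names are illustrative):
def gather_split_reads(wildcards):
    # Blocks until the `split` checkpoint for this sample has finished,
    # then expands over the parts it actually produced.
    split_dir = checkpoints.split.get(sample=wildcards.sample).output[0]
    parts = glob_wildcards(
        f"{split_dir}/{wildcards.sample}_part{{part}}_1.fastq.gz"
    ).part
    return expand(
        "capcruncher_output/interim/fastq/split/{sample}/{sample}_part{part}_1.fastq.gz",
        sample=wildcards.sample,
        part=parts,
    )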
wc: FASTQ_SAMPLES.translation[f"{wc.sample}_1.fastq.gz"], + fq2=lambda wc: FASTQ_SAMPLES.translation[f"{wc.sample}_2.fastq.gz"], + output: + fq1="capcruncher_output/interim/fastq/{sample}_1.fastq.gz", + fq2="capcruncher_output/interim/fastq/{sample}_2.fastq.gz", + log: + "capcruncher_output/logs/fastq_rename/{sample}.log", + shell: + """ + ln -s $(realpath {input.fq1}) {output.fq1} && + ln -s $(realpath {input.fq2}) {output.fq2} + """ + + +checkpoint split: + input: + fq1=rules.fastq_rename.output.fq1, + fq2=rules.fastq_rename.output.fq2, + output: + directory("capcruncher_output/interim/fastq/split/{sample}"), + threads: 4 + resources: + mem_mb=1000, + time="0-03:00:00", + params: + prefix="capcruncher_output/interim/fastq/split/{sample}/{sample}", + n_reads=str(config["split"].get("n_reads", 1e6)), + log: + "capcruncher_output/logs/split/{sample}.log", + shell: + """ + mkdir {output} && \ + capcruncher \ + fastq \ + split \ + {input.fq1} \ + {input.fq2} \ + -m \ + unix \ + -o \ + {params.prefix} \ + -n \ + {params.n_reads} \ + --gzip \ + -p \ + {threads} \ + > {log} 2>&1 + """ + + +if not CAPCRUNCHER_TOOLS: + + rule deduplication_parse: + input: + fq1="capcruncher_output/interim/fastq/split/{sample}/{sample}_part{part}_1.fastq.gz", + fq2="capcruncher_output/interim/fastq/split/{sample}/{sample}_part{part}_2.fastq.gz", + output: + temp( + "capcruncher_output/interim/fastq/deduplicated/{sample}/{sample}_{part}.pkl" + ), + resources: + mem_mb=2000, + log: + "capcruncher_output/logs/deduplication_fastq/parse/{sample}_part{part}.log", + shell: + """ + capcruncher \ + fastq \ + deduplicate \ + parse \ + {input.fq1} \ + {input.fq2} \ + -o \ + {output} \ + > {log} 2>&1 + """ + + rule deduplication_identify: + input: + hashed_reads=get_pickles, + output: + temp("capcruncher_output/interim/fastq/deduplicated/{sample}/{sample}.pkl"), + log: + "capcruncher_output/logs/deduplication_fastq/identify/{sample}.log", + resources: + mem_mb=5000, + threads: 3 + shell: + """ + capcruncher \ + fastq \ + deduplicate \ + identify \ + {input.hashed_reads} \ + -o \ + {output} + > {log} 2>&1 + """ + + rule deduplication_remove: + input: + fq1="capcruncher_output/interim/fastq/split/{sample}/{sample}_part{part}_1.fastq.gz", + fq2="capcruncher_output/interim/fastq/split/{sample}/{sample}_part{part}_2.fastq.gz", + ids_duplicated="capcruncher_output/interim/fastq/deduplicated/{sample}/{sample}.pkl", + output: + fq1=temp( + "capcruncher_output/interim/fastq/deduplicated/{sample}/{sample}_part{part}_1.fastq.gz" + ), + fq2=temp( + "capcruncher_output/interim/fastq/deduplicated/{sample}/{sample}_part{part}_2.fastq.gz" + ), + stats=temp( + "capcruncher_output/interim/statistics/deduplication/data/{sample}_part{part}.deduplication.csv" + ), + params: + prefix_fastq="capcruncher_output/interim/fastq/deduplicated/{sample}/{sample}_part{part}", + prefix_stats="capcruncher_output/interim/statistics/deduplication/data/{sample}_part{part}", + resources: + mem_mb=2000, + log: + "capcruncher_output/logs/deduplication_fastq/remove/{sample}_part{part}.log", + threads: 4 + shell: + """ + capcruncher \ + fastq \ + deduplicate \ + remove \ + {input.fq1} \ + {input.fq2} \ + -d \ + {input.ids_duplicated} \ + --sample-name \ + {wildcards.sample} \ + --stats-prefix \ + {params.prefix_stats} \ + -o \ + {params.prefix_fastq} \ + -p \ + {threads} \ + --hash-read-name \ + --gzip \ + > {log} 2>&1 + """ + + rule trim: + input: + fq1="capcruncher_output/interim/fastq/deduplicated/{sample}/{sample}_part{part}_1.fastq.gz", + 
fq2="capcruncher_output/interim/fastq/deduplicated/{sample}/{sample}_part{part}_2.fastq.gz", + output: + trimmed1="capcruncher_output/interim/fastq/trimmed/{sample}/{sample}_part{part}_1.fastq.gz", + trimmed2="capcruncher_output/interim/fastq/trimmed/{sample}/{sample}_part{part}_2.fastq.gz", + params: + outdir="capcruncher_output/interim/fastq/trimmed/{sample}/", + threads: 4 + resources: + mem_mb=2000, + log: + "capcruncher_output/logs/trimming/{sample}_{part}.log", + shell: + """ + trim_galore --cores {threads} --trim-n --paired --output_dir {params.outdir} {input.fq1} {input.fq2} >> {log} 2>&1 && + mv {params.outdir}/{wildcards.sample}_part{wildcards.part}_1_val_1.fq.gz {output.trimmed1} && + mv {params.outdir}/{wildcards.sample}_part{wildcards.part}_2_val_2.fq.gz {output.trimmed2} + """ + +else: + + checkpoint deduplication: + input: + unpack(get_fastq_split_1), + output: + fastq_dir=directory( + "capcruncher_output/interim/fastq/deduplicated/{sample}/" + ), + stats="capcruncher_output/interim/statistics/deduplication/data/{sample}.deduplication.csv", + params: + prefix_fastq="capcruncher_output/interim/fastq/deduplicated/{sample}/", + prefix_stats="capcruncher_output/interim/statistics/deduplication/data/{sample}/", + log: + "capcruncher_output/logs/deduplication_fastq/{sample}.log", + threads: workflow.cores * 0.5 + resources: + mem_mb=lambda wildcards, attempt: 2000 * 2**attempt, + shell: + """ + mkdir -p {params.prefix_stats} && + capcruncher-tools fastq-deduplicate -1 {input.fq1} -2 {input.fq2} -o {params.prefix_fastq} --statistics {output.stats} --sample-name {wildcards.sample} > {log} 2>&1 + """ + + rule trim: + input: + unpack(get_deduplicated_fastq_pair), + output: + trimmed1=temp( + "capcruncher_output/interim/fastq/trimmed/{sample}/{sample}_part{part}_1.fastq.gz" + ), + trimmed2=temp( + "capcruncher_output/interim/fastq/trimmed/{sample}/{sample}_part{part}_2.fastq.gz" + ), + params: + outdir="capcruncher_output/interim/fastq/trimmed/{sample}/", + threads: 4 + resources: + mem_mb=2000, + log: + "capcruncher_output/logs/trimming/{sample}_{part}.log", + shell: + """ + trim_galore --cores {threads} --trim-n --paired --output_dir {params.outdir} {input.fq1} {input.fq2} >> {log} 2>&1 && + mv {params.outdir}/{wildcards.sample}_part{wildcards.part}_1_val_1.fq.gz {output.trimmed1} && + mv {params.outdir}/{wildcards.sample}_part{wildcards.part}_2_val_2.fq.gz {output.trimmed2} + """ + + +rule flash: + input: + fq1="capcruncher_output/interim/fastq/trimmed/{sample}/{sample}_part{part}_1.fastq.gz", + fq2="capcruncher_output/interim/fastq/trimmed/{sample}/{sample}_part{part}_2.fastq.gz", + output: + flashed="capcruncher_output/interim/fastq/flashed/{sample}/{sample}_part{part}.extendedFrags.fastq.gz", + pe1="capcruncher_output/interim/fastq/flashed/{sample}/{sample}_part{part}.notCombined_1.fastq.gz", + pe2="capcruncher_output/interim/fastq/flashed/{sample}/{sample}_part{part}.notCombined_2.fastq.gz", + hist=temp( + "capcruncher_output/interim/fastq/flashed/{sample}/{sample}_part{part}.hist" + ), + histogram=temp( + "capcruncher_output/interim/fastq/flashed/{sample}/{sample}_part{part}.histogram" + ), + params: + outdir="capcruncher_output/interim/fastq/flashed/{sample}/{sample}_part{part}", + threads: 4 + resources: + mem_mb=1000, + log: + "capcruncher_output/logs/flash/{sample}_{part}.log", + shell: + """ + flash {input.fq1} {input.fq2} -o {params.outdir} -t {threads} -z --compress-prog-args pigz > {log} 2>&1 + """ + + +checkpoint rebalance_partitions_combined: + input: + flashed=lambda 
wc: get_flashed_fastq(wc), + output: + directory("capcruncher_output/interim/fastq/rebalanced/{sample}/flashed/"), + touch( + "capcruncher_output/interim/fastq/rebalanced/{sample}/flashed/.complete.sentinel" + ), + params: + prefix=lambda wildcards, output: pathlib.Path(output[0]) / wildcards.sample, + suffix=lambda wc: f"_flashed", + fq=lambda wc: ",".join(get_flashed_fastq(wc)), + n_reads=str(config["split"].get("n_reads", 1e6)), + log: + "capcruncher_output/logs/rebalance_partitions/{sample}_flashed.log", + threads: 4 + resources: + mem_mb=1000, + shell: + """ + mkdir -p {output[0]} && + capcruncher \ + fastq \ + split \ + {params.fq} \ + -m \ + unix \ + -o \ + {params.prefix} \ + -n \ + {params.n_reads} \ + --gzip \ + -p \ + {threads} \ + --suffix \ + {params.suffix} \ + > {log} 2>&1 && + touch {output[1]} + """ + + +checkpoint rebalance_partitions_pe: + input: + fq=get_pe_fastq, + output: + directory("capcruncher_output/interim/fastq/rebalanced/{sample}/pe"), + touch( + "capcruncher_output/interim/fastq/rebalanced/{sample}/pe/.complete.sentinel" + ), + params: + prefix=lambda wildcards, output: pathlib.Path(output[0]) / wildcards.sample, + suffix=lambda wc: f"_pe", + n_reads=str((config["split"].get("n_reads", 1e6) // 2)), + fq1=lambda wc: ",".join(separate_pe_fastq(wc)[1]), + fq2=lambda wc: ",".join(separate_pe_fastq(wc)[2]), + log: + "capcruncher_output/logs/rebalance_partitions/{sample}_pe.log", + threads: 4 + resources: + mem_mb=1000, + shell: + """ + mkdir -p {output[0]} && + capcruncher \ + fastq \ + split \ + {params.fq1} \ + {params.fq2} \ + -m \ + unix \ + -o \ + {params.prefix} \ + -n \ + {params.n_reads} \ + --gzip \ + -p \ + {threads} \ + --suffix \ + {params.suffix} \ + > {log} 2>&1 && + touch {output[1]} + """ + + +rule digest_flashed_combined: + input: + flashed="capcruncher_output/interim/fastq/rebalanced/{sample}/flashed/{sample}_part{part}_flashed_1.fastq.gz", + sentinel="capcruncher_output/interim/fastq/rebalanced/{sample}/flashed/.complete.sentinel", + output: + digested=temp( + "capcruncher_output/interim/fastq/digested/{sample}/{sample}_part{part}_flashed.fastq.gz" + ), + stats_read="capcruncher_output/interim/statistics/digestion/data/{sample}_part{part}_flashed.digestion.read.summary.csv", + stats_unfiltered="capcruncher_output/interim/statistics/digestion/data/{sample}_part{part}_flashed.digestion.unfiltered.histogram.csv", + stats_filtered="capcruncher_output/interim/statistics/digestion/data/{sample}_part{part}_flashed.digestion.filtered.histogram.csv", + params: + prefix_stats="capcruncher_output/interim/statistics/digestion/data/{sample}_part{part}_flashed", + restriction_site=config["analysis"]["restriction_enzyme"], + threads: 4 + resources: + mem_mb=2000, + log: + "capcruncher_output/logs/digestion/{sample}_{part}.log", + shell: + """ + capcruncher \ + fastq \ + digest \ + {input.flashed} \ + -o \ + {output.digested} \ + -p \ + {threads} \ + -m \ + flashed \ + -r \ + {params.restriction_site} \ + --minimum_slice_length \ + 18 \ + --stats-prefix \ + {params.prefix_stats} \ + --sample-name \ + {wildcards.sample} \ + > {log} 2>&1 + """ + + +rule digest_flashed_pe: + input: + pe1="capcruncher_output/interim/fastq/rebalanced/{sample}/pe/{sample}_part{part}_pe_1.fastq.gz", + pe2="capcruncher_output/interim/fastq/rebalanced/{sample}/pe/{sample}_part{part}_pe_2.fastq.gz", + sentinel="capcruncher_output/interim/fastq/rebalanced/{sample}/pe/.complete.sentinel", + output: + digested=temp( + 
"capcruncher_output/interim/fastq/digested/{sample}/{sample}_part{part}_pe.fastq.gz" + ), + stats_read="capcruncher_output/interim/statistics/digestion/data/{sample}_part{part}_pe.digestion.read.summary.csv", + stats_unfiltered="capcruncher_output/interim/statistics/digestion/data/{sample}_part{part}_pe.digestion.unfiltered.histogram.csv", + stats_filtered="capcruncher_output/interim/statistics/digestion/data/{sample}_part{part}_pe.digestion.filtered.histogram.csv", + params: + prefix_stats="capcruncher_output/interim/statistics/digestion/data/{sample}_part{part}_pe", + restriction_site=config["analysis"]["restriction_enzyme"], + threads: 4 + resources: + mem_mb=2000, + log: + "capcruncher_output/logs/digestion/{sample}_{part}.log", + shell: + """ + capcruncher \ + fastq \ + digest \ + {input.pe1} \ + {input.pe2} \ + -o \ + {output.digested} \ + -p \ + {threads} \ + -m \ + pe \ + -r \ + {params.restriction_site} \ + --minimum_slice_length \ + 18 \ + --stats-prefix \ + {params.prefix_stats} \ + --sample-name \ + {wildcards.sample} \ + > {log} 2>&1 + """ + + +localrules: + fastq_rename, diff --git a/capcruncher/pipeline/workflow/rules/filter.smk b/capcruncher/pipeline/workflow/rules/filter.smk new file mode 100644 index 00000000..f5cb94dd --- /dev/null +++ b/capcruncher/pipeline/workflow/rules/filter.smk @@ -0,0 +1,147 @@ +import capcruncher.pipeline.utils + + +def get_filtered_slices(wildcards): + slices = dict() + for combined_type in ["flashed", "pe"]: + parts = get_rebalanced_parts(wildcards, combined=combined_type) + slices[combined_type] = [ + f"capcruncher_output/interim/filtering/initial/{wildcards.sample}/{wildcards.sample}_part{part}_{combined_type}.slices.parquet" + for part in parts + ] + return slices + + +rule filter_alignments: + input: + bam=rules.align_bowtie2.output.bam, + annotations=rules.annotate.output.annotated, + output: + filtered_slices=temp( + "capcruncher_output/interim/filtering/initial/{sample}/{sample}_part{part}_{combined}.slices.parquet" + ), + stats_read="capcruncher_output/interim/statistics/filtering/data/{sample}_part{part}_{combined}.read.stats.csv", + stats_slice="capcruncher_output/interim/statistics/filtering/data/{sample}_part{part}_{combined}.slice.stats.csv", + params: + analysis_method=config["analysis"]["method"], + sample_name=lambda wildcards, output: wildcards.sample, + output_prefix=lambda wildcards, output: output.filtered_slices.replace( + ".slices.parquet", "" + ), + stats_prefix=lambda wildcards, output: output.stats_read.replace( + ".read.stats.csv", "" + ), + read_type=lambda wildcards, output: wildcards.combined, + custom_filtering=capcruncher.pipeline.utils.validate_custom_filtering(config), + resources: + mem_mb=5000, + log: + "capcruncher_output/interim/statistics/filtering/logs/{sample}_part{part}_{combined}.log", + shell: + """ + capcruncher \ + alignments \ + filter \ + {params.analysis_method} \ + -b {input.bam} \ + -a {input.annotations} \ + -o {params.output_prefix} \ + --stats-prefix {params.stats_prefix} \ + --sample-name {params.sample_name} \ + --read-type {params.read_type} \ + --no-cis-and-trans-stats \ + --no-fragments \ + {params.custom_filtering} > {log} 2>&1 + """ + + +rule split_flashed_and_pe_datasets: + input: + unpack(get_filtered_slices), + output: + slices_flashed=temp( + directory( + "capcruncher_output/interim/filtering/repartitioned/{sample}/flashed/" + ) + ), + slices_pe=temp( + directory( + "capcruncher_output/interim/filtering/repartitioned/{sample}/pe/" + ) + ), + shell: + """ + mkdir -p 
{output.slices_flashed} + mkdir -p {output.slices_pe} + mv {input.flashed} {output.slices_flashed} + mv {input.pe} {output.slices_pe} + """ + + +rule remove_duplicate_coordinates: + input: + slices_directory="capcruncher_output/interim/filtering/repartitioned/{sample}/{combined}/", + output: + slices=temp( + directory( + "capcruncher_output/interim/filtering/deduplicated/{sample}/{combined}" + ) + ), + stats_read=temp( + "capcruncher_output/interim/statistics/deduplication_by_coordinate/data/{sample}_{combined}.read.stats.csv" + ), + params: + sample_name=lambda wildcards, output: wildcards.sample, + stats_prefix=lambda wildcards, output: output.stats_read.replace( + ".read.stats.csv", "" + ), + read_type=lambda wildcards, output: wildcards.combined, + resources: + mem_mb=lambda wc, attempt: 3000 * 2**attempt, + threads: 12 + log: + "capcruncher_output/logs/remove_duplicate_coordinates/{sample}_{combined}.log", + script: + "../scripts/remove_duplicate_coordinates.py" + + +rule combine_flashed_and_pe_post_deduplication: + input: + slices=expand( + "capcruncher_output/interim/filtering/deduplicated/{{sample}}/{combined}", + combined=["flashed", "pe"], + ), + output: + slices=directory("capcruncher_output/results/{sample}/{sample}.parquet"), + shell: + """ + mkdir -p {output.slices} + mv {input.slices[0]}/*.parquet {output.slices} || mkdir -p {output.slices} + mv {input.slices[1]}/*.parquet {output.slices} || mkdir -p {output.slices} + """ + + +rule cis_and_trans_stats: + input: + slices="capcruncher_output/results/{sample}/{sample}.parquet", + output: + stats=temp( + "capcruncher_output/interim/statistics/cis_and_trans_reporters/data/{sample}.reporter.stats.csv" + ), + params: + sample_name=lambda wildcards, output: wildcards.sample, + analysis_method=config["analysis"]["method"], + resources: + mem_mb=3000, + log: + "capcruncher_output/logs/cis_and_trans_stats/{sample}.log", + shell: + """ + capcruncher \ + utilities \ + cis-and-trans-stats \ + {input.slices} \ + --assay {params.analysis_method} \ + --sample-name {params.sample_name} \ + -o {output.stats} \ + """ diff --git a/capcruncher/pipeline/workflow/rules/pileup.smk b/capcruncher/pipeline/workflow/rules/pileup.smk new file mode 100644 index 00000000..f5dcc03c --- /dev/null +++ b/capcruncher/pipeline/workflow/rules/pileup.smk @@ -0,0 +1,183 @@ +import capcruncher.pipeline.utils + +if CAPCRUNCHER_TOOLS: + + rule count: + input: + slices=rules.combine_flashed_and_pe_post_deduplication.output.slices, + restriction_fragment_map=rules.digest_genome.output.bed, + viewpoints=VIEWPOINTS, + output: + temp( + "capcruncher_output/pileups/counts_by_restriction_fragment/{sample}.hdf5" + ), + log: + "capcruncher_output/logs/counts/{sample}.log", + threads: 8 + resources: + mem_mb=lambda wc, attempt: 3000 * 2**attempt, + params: + outdir="capcruncher_output/pileups/counts_by_restriction_fragment", + shell: + """ + mkdir -p {params.outdir} && \ + capcruncher-tools \ + count \ + {input.slices} \ + -o {output} \ + -f {input.restriction_fragment_map} \ + -v {input.viewpoints} \ + -p {threads} \ + > {log} 2>&1 + """ + +else: + + rule count: + input: + slices=rules.combine_flashed_and_pe_post_deduplication.output.slices, + restriction_fragment_map=rules.digest_genome.output.bed, + viewpoints=VIEWPOINTS, + output: + temp( + "capcruncher_output/pileups/counts_by_restriction_fragment/{sample}.hdf5" + ), + log: + "capcruncher_output/logs/counts/{sample}.log", + threads: 8 + resources: + mem_mb=lambda wc, attempt: 3000 * 2**attempt, + params: + 
outdir="capcruncher_output/pileups/counts_by_restriction_fragment", + shell: + """ + mkdir -p {params.outdir} && \ + capcruncher \ + interactions \ + count \ + {input.slices} \ + -o {output} \ + -f {input.restriction_fragment_map} \ + -v {input.viewpoints} \ + --cooler-output \ + -p {threads} \ + > {log} 2>&1 + """ + + +rule bin_counts: + input: + "capcruncher_output/pileups/counts_by_restriction_fragment/{sample}.hdf5", + output: + temp("capcruncher_output/pileups/counts_by_genomic_bin/{sample}.hdf5"), + params: + bin_size=[f"-b {b}" for b in BIN_SIZES], + log: + "capcruncher_output/logs/bin_counts/{sample}.log", + threads: 4 + resources: + mem_mb=lambda wc, attempt: 3000 * 2**attempt, + shell: + """ + capcruncher \ + interactions \ + fragments-to-bins \ + {input} \ + -o {output} \ + {params.bin_size} \ + -p {threads} \ + > {log} 2>&1 + """ + + +rule merge_counts: + input: + lambda wc: capcruncher.pipeline.utils.get_count_files(wc, PERFORM_BINNING), + output: + "capcruncher_output/results/{sample}/{sample}.hdf5", + log: + "capcruncher_output/logs/merge_counts/{sample}.log", + threads: 1 + shell: + """ + capcruncher \ + interactions \ + merge \ + {input} \ + -o {output} \ + > {log} 2>&1 + """ + + +rule bedgraph_raw: + input: + cooler=rules.merge_counts.output, + output: + bedgraph=temp( + "capcruncher_output/interim/pileups/bedgraphs/{sample}/raw/{sample}_{viewpoint}.bedgraph" + ), + log: + "capcruncher_output/logs/bedgraph_raw/{sample}_{viewpoint}.log", + params: + output_prefix=lambda wc, output: pathlib.Path(output.bedgraph).parent + / f"{wc.sample}", + viewpoint=lambda wc, output: wc.viewpoint, + shell: + """ + capcruncher \ + interactions \ + pileup \ + {input.cooler} \ + -o {params.output_prefix} \ + -n {params.viewpoint} \ + --normalisation raw \ + > {log} 2>&1 + """ + + +rule bedgraph_normalised: + input: + cooler=rules.merge_counts.output, + output: + bedgraph=temp( + "capcruncher_output/interim/pileups/bedgraphs/{sample}/norm/{sample}_{viewpoint}.bedgraph" + ), + log: + "capcruncher_output/logs/bedgraph_norm/{sample}_{viewpoint}.log", + params: + output_prefix=lambda wc, output: pathlib.Path(output.bedgraph).parent + / f"{wc.sample}", + viewpoint=lambda wc, output: wc.viewpoint, + normalisation=lambda wc: capcruncher.pipeline.utils.get_normalisation_from_config( + wc, config + ), + scale_factor=config["normalisation"].get("scale_factor", int(1e6)), + shell: + """ + capcruncher \ + interactions \ + pileup \ + {input.cooler} \ + -o {params.output_prefix} \ + -n {params.viewpoint} \ + {params.normalisation} \ + --scale-factor {params.scale_factor} \ + > {log} 2>&1 + """ + + +rule bedgraph_to_bigwig: + input: + bedgraph="capcruncher_output/interim/pileups/bedgraphs/{sample}/{norm}/{sample}_{viewpoint}.bedgraph", + output: + bigwig="capcruncher_output/results/{sample}/bigwigs/{norm}/{sample}_{viewpoint}.bigWig", + log: + "capcruncher_output/logs/bedgraph_to_bigwig/{sample}_{norm}_{viewpoint}.log", + params: + chrom_sizes=config["genome"]["chrom_sizes"], + shell: + """ + sort -k1,1 -k2,2n {input.bedgraph} > {input.bedgraph}.sorted + bedGraphToBigWig {input.bedgraph}.sorted {params.chrom_sizes} {output.bigwig} + rm {input.bedgraph}.sorted + """ diff --git a/capcruncher/pipeline/workflow/rules/qc.smk b/capcruncher/pipeline/workflow/rules/qc.smk new file mode 100644 index 00000000..941855cb --- /dev/null +++ b/capcruncher/pipeline/workflow/rules/qc.smk @@ -0,0 +1,66 @@ +import os +import pathlib +import capcruncher.pipeline.utils + + +rule fastqc: + input: + fq=lambda wc: 
FASTQ_SAMPLES.translation[f"{wc.sample}_{wc.read}.fastq.gz"], + output: + qc="capcruncher_output/interim/qc/fastqc/{sample}_{read}_fastqc.html", + params: + outdir=lambda wc, output: str(pathlib.Path(output.qc).parent), + tmpdir="capcruncher_output/interim/qc/fastqc/{sample}_{read}", + basename=lambda wc, output: capcruncher.pipeline.utils.get_fastq_basename( + wc, FASTQ_SAMPLES + ), + threads: 1 + resources: + mem_mb=1024, + log: + "capcruncher_output/logs/fastqc/{sample}_{read}.log", + shell: + """ + mkdir -p {params.tmpdir} && + fastqc -o {params.tmpdir} {input.fq} -t {threads} > {log} 2>&1 && + mv {params.tmpdir}/{params.basename}_fastqc.html {output.qc} && + mv {params.tmpdir}/{params.basename}_fastqc.zip {params.outdir}/{wildcards.sample}_{wildcards.read}_fastqc.zip && + rm -r {params.tmpdir} + """ + + +rule samtools_stats: + input: + bam="capcruncher_output/results/{sample}/{sample}.bam", + bai="capcruncher_output/results/{sample}/{sample}.bam.bai", + output: + stats=temp("capcruncher_output/interim/qc/alignment_raw/{sample}.txt"), + threads: 1 + resources: + mem_mb=1000, + shell: + """samtools stats {input.bam} > {output.stats}""" + + +rule multiqc: + input: + expand( + "capcruncher_output/interim/qc/fastqc/{sample}_{read}_fastqc.html", + sample=SAMPLE_NAMES, + read=[1, 2], + ), + expand( + "capcruncher_output/interim/qc/alignment_raw/{sample}.txt", + sample=SAMPLE_NAMES, + ), + output: + "capcruncher_output/results/full_qc_report.html", + log: + "capcruncher_output/logs/multiqc.log", + params: + outdir=lambda wc, output: str(pathlib.Path(output[0]).parent), + dir_analysis="capcruncher_output/interim/qc", + resources: + mem_mb=1000, + shell: + "multiqc -o {params.outdir} {params.dir_analysis} -n full_qc_report.html --force > {log} 2>&1" diff --git a/capcruncher/pipeline/workflow/rules/statistics.smk b/capcruncher/pipeline/workflow/rules/statistics.smk new file mode 100644 index 00000000..d750fc5a --- /dev/null +++ b/capcruncher/pipeline/workflow/rules/statistics.smk @@ -0,0 +1,206 @@ +from collections import defaultdict +import capcruncher.pipeline.utils +from typing import List + + +def get_digestion_statistics(wc, sample_names: List[str]): + stat_types = { + "read_level_stats": "digestion.read.summary.csv", + "histogram_unfiltered": "digestion.unfiltered.histogram.csv", + "histogram_filtered": "digestion.filtered.histogram.csv", + } + + stat_prefixes = [] + for sample in sample_names: + for combined in ["flashed", "pe"]: + for part in get_rebalanced_parts(wc, combined=combined, sample=sample): + stat_prefixes.append( + f"capcruncher_output/interim/statistics/digestion/data/{sample}_part{part}_{combined}." + ) + + stat_files = defaultdict(list) + for stat_type, stat_suffix in stat_types.items(): + for stat_prefix in stat_prefixes: + stat_files[stat_type].append(stat_prefix + stat_suffix) + + return stat_files + + +def get_filtering_statistics(wc, sample_names: List[str]): + stat_types = { + "read_level_stats": "read.stats.csv", + "slice_level_stats": "slice.stats.csv", + } + + stat_prefixes = [] + for sample in sample_names: + for combined in ["flashed", "pe"]: + for part in get_rebalanced_parts(wc, combined=combined, sample=sample): + stat_prefixes.append( + f"capcruncher_output/interim/statistics/filtering/data/{sample}_part{part}_{combined}." 
+ ) + + stat_files = defaultdict(list) + for stat_type, stat_suffix in stat_types.items(): + for stat_prefix in stat_prefixes: + stat_files[stat_type].append(stat_prefix + stat_suffix) + + return stat_files + + +def get_stat_parts(wc, sample_names: List[str]): + files = [] + for sample in sample_names: + for part in get_fastq_split_1(wc): + files.append( + f"capcruncher_output/interim/statistics/deduplication/data/{sample}_part{part}.deduplication.csv" + ) + return files + + +if not CAPCRUNCHER_TOOLS: + + rule combine_stats_fastq_deduplication: + input: + fastq_deduplication=get_stat_parts, + output: + "capcruncher_output/interim/statistics/deduplication/fastq_deduplication.csv", + script: + "../scripts/combine_deduplication_stats.py" + +else: + + rule combine_stats_fastq_deduplication: + input: + fastq_deduplication=expand( + "capcruncher_output/interim/statistics/deduplication/data/{sample}.deduplication.csv", + sample=SAMPLE_NAMES, + ), + output: + "capcruncher_output/interim/statistics/deduplication/fastq_deduplication.csv", + script: + "../scripts/combine_deduplication_stats.py" + + +rule combine_stats_digestion: + input: + unpack(lambda wc: get_digestion_statistics(wc, SAMPLE_NAMES)), + output: + read_data="capcruncher_output/interim/statistics/digestion/fastq_digestion.csv", + histogram="capcruncher_output/interim/statistics/digestion/fastq_digestion.histogram.csv", + script: + "../scripts/combine_digestion_stats.py" + + +rule combine_stats_filtering: + input: + unpack(lambda wc: get_filtering_statistics(wc, SAMPLE_NAMES)), + output: + read_data="capcruncher_output/interim/statistics/filtering/alignment_filtering.csv", + slice_data="capcruncher_output/interim/statistics/filtering/alignment_filtering_slice.csv", + script: + "../scripts/combine_filtering_stats.py" + + +rule combine_stats_alignment_deduplication: + input: + read_level_stats=expand( + "capcruncher_output/interim/statistics/deduplication_by_coordinate/data/{sample}_{combined}.read.stats.csv", + sample=SAMPLE_NAMES, + combined=["flashed", "pe"], + ), + output: + read_data="capcruncher_output/interim/statistics/deduplication_by_coordinate/alignment_deduplication.csv", + script: + "../scripts/combine_alignment_deduplication_stats.py" + + +rule merge_stats_filtering_and_alignment_deduplication: + input: + filtering=rules.combine_stats_filtering.output.read_data, + alignment_deduplication=rules.combine_stats_alignment_deduplication.output.read_data, + output: + "capcruncher_output/interim/statistics/filtering_and_alignment_deduplication.csv", + log: + "capcruncher_output/logs/merge_stats_filtering_and_alignment_deduplication.log", + shell: + """ + cat {input.filtering} > {output} 2> {log}; + cat {input.alignment_deduplication} | sed '1d' >> {output} 2>> {log}; + """ + + +rule combine_stats_cis_and_trans: + input: + cis_and_trans_stats=expand( + "capcruncher_output/interim/statistics/cis_and_trans_reporters/data/{sample}.reporter.stats.csv", + sample=SAMPLE_NAMES, + ), + output: + cis_and_trans_stats="capcruncher_output/interim/statistics/cis_and_trans_reporters/cis_and_trans_reporters.csv", + log: + "capcruncher_output/logs/statistics/combine_stats_cis_and_trans_stats.log", + script: + "../scripts/combine_cis_and_trans_stats.py" + + +rule combine_stats_read_level: + input: + [ + rules.combine_stats_fastq_deduplication.output[0], + rules.combine_stats_digestion.output.read_data, + rules.merge_stats_filtering_and_alignment_deduplication.output[0], + ], + output: + "capcruncher_output/interim/statistics/run_statistics.csv", + 
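# The combine_stats_* rules above hand their inputs to Python files via the
# `script:` directive; Snakemake runs each file with a `snakemake` object injected
# into its namespace (the reason the scripts later in this patch start with
# `# ruff: noqa: F821`). A hedged sketch of the kind of collation such a script
# performs; `combine_read_stats` is an illustrative helper, not the
# capcruncher.api.statistics implementation:
import pandas as pd


def combine_read_stats(csv_paths, out_path):
    """Concatenate per-part read-level stat CSVs into a single table."""
    df = pd.concat([pd.read_csv(p) for p in csv_paths], ignore_index=True)
    df.to_csv(out_path, index=False)


# Driven by the injected object inside a script: directive, e.g.
#   combine_read_stats(snakemake.input.read_level_stats, snakemake.output.read_data)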
script: + "../scripts/combine_stats_read_level.py" + + +rule copy_report_template: + input: + template=workflow.source_path("../report/capcruncher_report.qmd"), + output: + "capcruncher_output/results/capcruncher_report.qmd", + container: + None + shell: + """ + cp {input.template} {output} + """ + + +rule make_report: + input: + template=rules.copy_report_template.output[0], + fastq_deduplication=rules.combine_stats_fastq_deduplication.output[0], + digestion_read=rules.combine_stats_digestion.output.read_data, + digestion_histogram=rules.combine_stats_digestion.output.histogram, + reporters=rules.merge_stats_filtering_and_alignment_deduplication.output[0], + cis_and_trans_stats=rules.combine_stats_cis_and_trans.output.cis_and_trans_stats, + read_level_stats=rules.combine_stats_read_level.output[0], + output: + "capcruncher_output/results/capcruncher_report.html", + params: + outdir=lambda wildcards, output: pathlib.Path(output[0]).parent, + log: + "capcruncher_output/logs/make_report.log", + shell: + """ + export XDG_RUNTIME_DIR=$(mktemp -d); + quarto \ + render \ + {params.outdir}/capcruncher_report.qmd \ + --to html \ + --execute \ + -P fastq_deduplication_path:$(realpath {input.fastq_deduplication}) \ + -P fastq_digestion_read_path:$(realpath {input.digestion_read}) \ + -P fastq_digestion_hist_path:$(realpath {input.digestion_histogram}) \ + -P reporter_read_path:$(realpath {input.reporters}) \ + -P reporter_cis_trans_path:$(realpath {input.cis_and_trans_stats}) \ + -P run_stats_path:$(realpath {input.read_level_stats}) \ + --log {log} \ + 2> {log}.err; + + rm {params.outdir}/capcruncher_report.qmd + """ diff --git a/capcruncher/pipeline/workflow/rules/visualise.smk b/capcruncher/pipeline/workflow/rules/visualise.smk new file mode 100644 index 00000000..69ba1ae4 --- /dev/null +++ b/capcruncher/pipeline/workflow/rules/visualise.smk @@ -0,0 +1,99 @@ +import capcruncher.pipeline.utils + + +rule viewpoints_to_bigbed: + input: + viewpoints=config["analysis"]["viewpoints"], + params: + chrom_sizes=config["genome"]["chrom_sizes"], + output: + "capcruncher_output/resources/viewpoints/viewpoints.bigBed", + shell: + """ + cat {input.viewpoints} | sort -k1,1 -k2,2n > {output}.tmp + bedToBigBed {output}.tmp {params.chrom_sizes} {output} + rm {output}.tmp + """ + + +rule create_ucsc_hub: + input: + viewpoints=rules.viewpoints_to_bigbed.output[0], + bigwigs=expand( + "capcruncher_output/results/{sample}/bigwigs/{norm}/{sample}_{viewpoint}.bigWig", + sample=SAMPLE_NAMES, + norm=["raw", "norm"], + viewpoint=VIEWPOINT_NAMES, + ), + bigwigs_summary=expand( + "capcruncher_output/results/comparisons/bigwigs/{group}.{method}-summary.{viewpoint}.bigWig", + group=DESIGN["condition"].unique(), + method=SUMMARY_METHODS, + viewpoint=VIEWPOINT_NAMES, + ) + if AGGREGATE_SAMPLES + else [], + bigwigs_comparison=expand( + "capcruncher_output/results/comparisons/bigwigs/{comparison}.{method}-subtraction.{viewpoint}.bigWig", + comparison=[ + f"{a}-{b}" + for a, b in itertools.permutations(DESIGN["condition"].unique(), 2) + ], + method=SUMMARY_METHODS, + viewpoint=VIEWPOINT_NAMES, + ) + if COMPARE_SAMPLES + else [], + report=rules.make_report.output[0], + output: + directory(config["hub"]["dir"]), + wildcard_constraints: + comparison=f"[A-Za-z0-9_\.]+-[A-Za-z0-9_\.]+", + group=f"[A-Za-z0-9_\.]+", + params: + color_by=config["hub"].get("color_by", "sample"), + genome=config["genome"]["name"], + custom_genome=config["hub"].get("custom_genome", None), + genome_twobit=config["genome"].get("twobit", None), + 
hub_name=config["hub"].get("name"), + hub_short_label=config["hub"].get("short_label"), + hub_long_label=config["hub"].get("long_label"), + hub_email=config["hub"].get("email"), + genome_organism=config["genome"].get("organism"), + genome_default_position=config["genome"].get("genome_default_position"), + script: + "../scripts/make_ucsc_hub.py" + + +rule plot: + input: + unpack( + lambda wc: capcruncher.pipeline.utils.get_files_to_plot( + wc, DESIGN, ASSAY, SAMPLE_NAMES, SUMMARY_METHODS + ) + ), + viewpoints=config["analysis"]["viewpoints"], + output: + template="capcruncher_output/results/figures/{viewpoint}.toml", + fig="capcruncher_output/results/figures/{viewpoint}.pdf", + params: + coordinates=lambda wc: capcruncher.pipeline.utils.get_plotting_coordinates( + wc, config + ), + viewpoint="{viewpoint}", + design=DESIGN, + genes=config["plot"].get("genes", ""), + binsize=config["analysis"].get("bin_sizes", [None])[0], + wildcard_constraints: + comparison=f"[A-Za-z0-9_\.]+-[A-Za-z0-9_\.]+", + group=f"[A-Za-z0-9_\.]+", + log: + "logs/plot/{viewpoint}.log", + threads: 1 + script: + "../scripts/plot.py" + + +localrules: + create_ucsc_hub, + plot, diff --git a/capcruncher/pipeline/workflow/scripts/combine_alignment_deduplication_stats.py b/capcruncher/pipeline/workflow/scripts/combine_alignment_deduplication_stats.py new file mode 100644 index 00000000..2a8341f7 --- /dev/null +++ b/capcruncher/pipeline/workflow/scripts/combine_alignment_deduplication_stats.py @@ -0,0 +1,5 @@ +# ruff: noqa: F821 +from capcruncher.api.statistics import collate_read_data + +df = collate_read_data(snakemake.input.read_level_stats) +df.to_csv(snakemake.output[0], sep=",", index=False) diff --git a/capcruncher/pipeline/workflow/scripts/combine_cis_and_trans_stats.py b/capcruncher/pipeline/workflow/scripts/combine_cis_and_trans_stats.py new file mode 100644 index 00000000..6490addd --- /dev/null +++ b/capcruncher/pipeline/workflow/scripts/combine_cis_and_trans_stats.py @@ -0,0 +1,8 @@ +# ruff: noqa: F821 +from capcruncher.api.statistics import collate_cis_trans_data + +# Collate data +df = collate_cis_trans_data(snakemake.input.cis_and_trans_stats) + +# Write data +df.to_csv(snakemake.output[0], sep=",", index=False) diff --git a/capcruncher/pipeline/workflow/scripts/combine_deduplication_stats.py b/capcruncher/pipeline/workflow/scripts/combine_deduplication_stats.py new file mode 100644 index 00000000..c4c8c982 --- /dev/null +++ b/capcruncher/pipeline/workflow/scripts/combine_deduplication_stats.py @@ -0,0 +1,11 @@ +# ruff: noqa: F821 +from capcruncher.api.statistics import collate_read_data + +# Read in the data +df = collate_read_data(snakemake.input) + +# # Ignore the reads_removed stat_type +# df = df[df["stat_type"] != "reads_removed"] + +# Write out the data +df.to_csv(snakemake.output[0], sep=",", index=False) diff --git a/capcruncher/pipeline/workflow/scripts/combine_digestion_stats.py b/capcruncher/pipeline/workflow/scripts/combine_digestion_stats.py new file mode 100644 index 00000000..5c64a568 --- /dev/null +++ b/capcruncher/pipeline/workflow/scripts/combine_digestion_stats.py @@ -0,0 +1,22 @@ +# ruff: noqa: F821 +import pandas as pd +from capcruncher.api.statistics import collate_read_data, collate_histogram_data + +# Collate data +df_read_data = collate_read_data(snakemake.input.read_level_stats) +df_histogram_unfiltered = collate_histogram_data(snakemake.input.histogram_unfiltered) +df_histogram_filtered = collate_histogram_data(snakemake.input.histogram_filtered) + +# Merge filtered and unfiltered 
histograms +df_hist = pd.concat( + [ + df_histogram_unfiltered.assign(filtered=0), + df_histogram_filtered.assign(filtered=1), + ] +).sort_values(["sample", "read_type", "n_slices"]) + +# Output histogram, slice and read statics +df_hist.to_csv(snakemake.output.histogram, index=False) + +# Output read data +df_read_data.to_csv(snakemake.output.read_data, sep=",", index=False) diff --git a/capcruncher/pipeline/workflow/scripts/combine_filtering_stats.py b/capcruncher/pipeline/workflow/scripts/combine_filtering_stats.py new file mode 100644 index 00000000..d5cef7fd --- /dev/null +++ b/capcruncher/pipeline/workflow/scripts/combine_filtering_stats.py @@ -0,0 +1,10 @@ +# ruff: noqa: F821 +from capcruncher.api.statistics import collate_read_data, collate_slice_data + +# Collate data +df_read_data = collate_read_data(snakemake.input.read_level_stats) +df_slice_data = collate_slice_data(snakemake.input.slice_level_stats) + +# Write data +df_read_data.to_csv(snakemake.output.read_data, sep=",", index=False) +df_slice_data.to_csv(snakemake.output.slice_data, sep=",", index=False) diff --git a/capcruncher/pipeline/workflow/scripts/combine_stats_read_level.py b/capcruncher/pipeline/workflow/scripts/combine_stats_read_level.py new file mode 100644 index 00000000..113ba403 --- /dev/null +++ b/capcruncher/pipeline/workflow/scripts/combine_stats_read_level.py @@ -0,0 +1,11 @@ +# ruff: noqa: F821 +import pandas as pd + +from capcruncher.utils import read_dataframes + + +dframes = read_dataframes(snakemake.input) +df = pd.concat(dframes) +df.sort_values(["sample", "read_type", "stat"], ascending=[True, True, False]).to_csv( + snakemake.output[0], sep=",", index=False +) diff --git a/capcruncher/pipeline/workflow/scripts/count_identified_viewpoints.py b/capcruncher/pipeline/workflow/scripts/count_identified_viewpoints.py new file mode 100644 index 00000000..6314e21f --- /dev/null +++ b/capcruncher/pipeline/workflow/scripts/count_identified_viewpoints.py @@ -0,0 +1,24 @@ +import os +import sys +import pandas as pd +import numpy as np +import subprocess + +from capcruncher import api +import ibis + + +ibis.options.interactive = False + +con = ibis.duckdb.connect(threads=snakemake.threads) +tbl = con.register(f"parquet://{snakemake.params.slices_dir}", table_name="reporters") +unique_viewpoints = (tbl[["viewpoint", "pe"]] + .distinct() + .execute(limit=None) + .replace("", pd.NA) + .dropna() +) + +unique_viewpoints.to_csv(snakemake.output[0], sep="\t", index=False) + + diff --git a/capcruncher/pipeline/workflow/scripts/identify_viewpoints_with_interactions.py b/capcruncher/pipeline/workflow/scripts/identify_viewpoints_with_interactions.py new file mode 100644 index 00000000..2c8ff594 --- /dev/null +++ b/capcruncher/pipeline/workflow/scripts/identify_viewpoints_with_interactions.py @@ -0,0 +1,28 @@ +# ruff: noqa: F821 +import os +import sys +import cooler +import pandas as pd +from collections import defaultdict +import ujson as json +import pathlib + + +coolers = snakemake.input[0] +samples = snakemake.params.samples + +for (sample, clr) in zip(samples, coolers): + viewpoints_per_sample = [] + # Check all groups in the cooler file to see if they have any counts + viewpoints = cooler.api.list_coolers(clr) + + for viewpoint in viewpoints: + clr = cooler.Cooler(f"{clr}::{viewpoint}") + count = clr.pixels()[:].sum() + + if count > 0: + viewpoints_per_sample.append(viewpoint) + + # Save the viewpoints with counts to a file + with open(pathlib.Path(snakemake.params.outdir) / f"{sample}.json", "w") as f: + 
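# identify_viewpoints_with_interactions.py above inspects every viewpoint group in
# a cooler file and keeps those with at least one recorded interaction. A hedged
# standalone sketch of that check; it keeps the file path and the opened Cooler in
# separate variables and sums the `count` column explicitly. cooler.fileops.list_coolers
# and Cooler URIs of the form "file.hdf5::/group" are standard cooler API; the
# function name and example path are illustrative.
import cooler


def viewpoints_with_counts(cooler_path: str) -> list:
    """Return the cooler groups in `cooler_path` that contain at least one pixel count."""
    kept = []
    for group in cooler.fileops.list_coolers(cooler_path):
        clr = cooler.Cooler(f"{cooler_path}::{group}")
        if clr.pixels()[:]["count"].sum() > 0:
            kept.append(group)
    return kept


# e.g. viewpoints_with_counts("capcruncher_output/results/sampleA/sampleA.hdf5")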
json.dump(viewpoints_per_sample, f)
diff --git a/capcruncher/pipeline/workflow/scripts/make_ucsc_hub.py b/capcruncher/pipeline/workflow/scripts/make_ucsc_hub.py
new file mode 100644
index 00000000..3d6a5b4f
--- /dev/null
+++ b/capcruncher/pipeline/workflow/scripts/make_ucsc_hub.py
@@ -0,0 +1,93 @@
+# ruff: noqa: F821
+
+import os
+import pandas as pd
+import itertools
+import numpy as np
+import pathlib
+import re
+from loguru import logger
+import trackhub
+import tracknado
+
+# Single bigwigs
+df_bw = pd.DataFrame(
+    [pathlib.Path(p) for p in snakemake.input.bigwigs],
+    columns=["fn"],
+)
+
+df_bw["basename"] = df_bw["fn"].apply(lambda p: p.name)
+df_bw["normalisation"] = df_bw["fn"].apply(lambda p: p.parent)
+df_bw[["sample", "viewpoint"]] = df_bw["basename"].str.extract(
+    "(?P<sample>.*?)_(?P<viewpoint>.*?).bigWig"
+)
+df_bw["category"] = "Replicates"
+
+# Summarised bigwigs
+df_bw_summary = pd.DataFrame(
+    [pathlib.Path(p) for p in snakemake.input.bigwigs_summary],
+    columns=["fn"],
+)
+df_bw_summary["basename"] = df_bw_summary["fn"].apply(lambda p: p.name)
+df_bw_summary["normalisation"] = "norm"
+df_bw_summary[["sample", "aggregation", "viewpoint"]] = df_bw_summary[
+    "basename"
+].str.extract("(?P<sample>.*?)\.(?P<aggregation>.*?)(?P<viewpoint>.*?).bigWig")
+df_bw_summary["category"] = "Aggregated"
+
+# Compared bigwigs
+df_bw_compared = pd.DataFrame(
+    [pathlib.Path(p) for p in snakemake.input.bigwigs_comparison],
+    columns=["fn"],
+)
+df_bw_compared["basename"] = df_bw_compared["fn"].apply(lambda p: p.name)
+df_bw_compared["normalisation"] = "norm"
+df_bw_compared[["sample", "aggregation", "viewpoint"]] = df_bw_compared[
+    "basename"
+].str.extract("(.*?)\.(.*?)-subtraction\.(.*?).bigWig")
+df_bw_compared["category"] = "Subtraction"
+
+# Combine dataframes
+df = pd.concat([df_bw, df_bw_summary, df_bw_compared], axis=0)
+
+# Create hub design
+design = tracknado.TrackDesign.from_design(
+    df,
+    color_by=snakemake.params.color_by,
+    subgroup_by=["sample", "viewpoint", "aggregation"],
+    supergroup_by=[
+        "category",
+    ],
+    overlay_by=[
+        "sample",
+    ],
+)
+
+hub = tracknado.HubGenerator(
+    track_design=design,
+    genome=snakemake.params.genome,
+    hub_name=snakemake.params.hub_name,
+    description_html=pathlib.Path(snakemake.input.report),
+    hub_email=snakemake.params.hub_email,
+    custom_genome=snakemake.params.custom_genome,
+    genome_twobit=snakemake.params.genome_twobit,
+    genome_organism=snakemake.params.genome_organism,
+    genome_default_position=snakemake.params.genome_default_position,
+    outdir=snakemake.output[0],
+)
+
+hub.trackdb.add_tracks(
+    trackhub.Track(
+        name="viewpoint",
+        tracktype="bigBed",
+        source=snakemake.input.viewpoints,
+        visibility="dense",
+        color="0,0,0",
+        autoScale="off",
+        maxHeightPixels="100:50:8",
+        shortLabel="Viewpoint",
+        longLabel="Viewpoint",
+    )
+)
+
+hub.stage_hub()
diff --git a/capcruncher/pipeline/workflow/scripts/plot.py b/capcruncher/pipeline/workflow/scripts/plot.py
new file mode 100644
index 00000000..72024686
--- /dev/null
+++ b/capcruncher/pipeline/workflow/scripts/plot.py
@@ -0,0 +1,71 @@
+# ruff: noqa: F821
+
+import os
+import pathlib
+import sys
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from loguru import logger
+
+import capcruncher.api.plotting as cp
+
+logger.add(snakemake.log[0], format="{time} {level} {message}", level="INFO")
+
+with logger.catch():
+    # Set-up tracks
+    tracks = []
+
+    # Add scale bar
+    tracks.append(cp.CCTrack(None, file_type="scale"))
+
+    # Bigwig tracks
+    if snakemake.input.bigwigs:
+
+        for bw in snakemake.input.bigwigs:
+            
bw_path = pathlib.Path(bw) + tracks.append(cp.CCTrack(bw, file_type='bigwig', title=bw_path.stem)) + tracks.append(cp.CCTrack(None, file_type='spacer')) + + # Add subtractions if available + if snakemake.input.subtractions: + for sub in snakemake.input.subtractions: + sub_path = pathlib.Path(sub) + tracks.append( + cp.CCTrack(sub, file_type='bigwig', title=sub_path.stem, min="auto") + ) + tracks.append(cp.CCTrack(None, file_type='spacer')) + + # Add heatmaps if available + if snakemake.input.heatmaps: + for hm in snakemake.input.heatmaps: + hm_path = pathlib.Path(hm) + tracks.append( + cp.CCTrack( + hm, + file_type='heatmap', + title=hm_path.stem, + binsize=snakemake.params.binsize, + viewpoint=snakemake.params.viewpoint, + style="triangular", + ) + ) + tracks.append(cp.CCTrack(None, file_type='spacer')) + + # Add genes if available + if snakemake.params.genes: + genes = snakemake.params.genes + genes_path = pathlib.Path(genes) + tracks.append(cp.CCTrack(genes, file_type='genes')) + tracks.append(cp.CCTrack(None, file_type='spacer')) + + # Add X-axis + tracks.append(cp.CCTrack(None, file_type='xaxis')) + + # Make figure and save + fig = cp.CCFigure(tracks) + fig.save(snakemake.params.coordinates, output=snakemake.output.fig) + + # Export template used to make figure + fig.to_toml(snakemake.output.template) diff --git a/capcruncher/pipeline/workflow/scripts/remove_duplicate_coordinates.py b/capcruncher/pipeline/workflow/scripts/remove_duplicate_coordinates.py new file mode 100644 index 00000000..cc66f2ca --- /dev/null +++ b/capcruncher/pipeline/workflow/scripts/remove_duplicate_coordinates.py @@ -0,0 +1,51 @@ +# ruff: noqa: F821 + +import os +import sys +import pandas as pd +import numpy as np +import subprocess +import pyarrow.dataset as ds +import pathlib +from loguru import logger + + +# Check if the input dataset is not empty +try: + dataset = ds.dataset(snakemake.input.slices_directory, format="parquet") + n_rows = dataset.count_rows() + + if n_rows != 0: + cmd = [ + "capcruncher", + "interactions", + "deduplicate", + snakemake.input.slices_directory, + "-o", + snakemake.output.slices, + "--read-type", + snakemake.params.read_type, + "--sample-name", + snakemake.params.sample_name, + "--stats-prefix", + snakemake.params.stats_prefix, + ] + + with open(snakemake.log[0], "w") as f: + subprocess.run(cmd, check=True, stdout=f, stderr=f) + + else: + logger.warning("The input dataset is empty, skipping deduplication step.") + + outdir = pathlib.Path(snakemake.output.slices) + + logger.warning(f"Creating empty output directory: {outdir}") + outdir.mkdir(parents=True, exist_ok=True) + + logger.warning(f"Creating empty stats file: {snakemake.output.stats_read}") + pd.DataFrame().to_csv(snakemake.output.stats_read) + + +except Exception as e: + print(e) + os.makedirs(snakemake.output.slices, exist_ok=True) diff --git a/capcruncher/tools/__init__.py b/capcruncher/tools/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/capcruncher/tools/annotate.py b/capcruncher/tools/annotate.py deleted file mode 100644 index 3e45dc97..00000000 --- a/capcruncher/tools/annotate.py +++ /dev/null @@ -1,156 +0,0 @@ -from logging import disable -import logging -from typing import Union, Literal -import warnings -import numpy as np - -warnings.simplefilter("ignore", category=RuntimeWarning) - -import pandas as pd -from pybedtools import BedTool - -from capcruncher.utils import ( - convert_bed_to_dataframe, - convert_to_bedtool, - is_valid_bed, - split_intervals_on_chrom, -) - - -class 
BedIntersection: - - """ - Performs intersection between two named bed files. - - Wrapper around bedtools intersect designed to intersect in parallel - (by splitting file based on chromosome) and handle malformed bed files. - - Attributes: - bed1 (Union[str, BedTool, pd.DataFrame]): Bed file to intersect. Must be named. - bed2 (Union[str, BedTool, pd.DataFrame]): Bed file to intersect. - intersection_name (str): Name for intersection. - min_frac (float): Minimum fraction required for intersection - n_cores (int): Number of cores for parallel intersection. - invalid_bed_action: Method to deal with missing/malformed bed files ("ignore"|"error") - - """ - - def __init__( - self, - bed1: Union[str, BedTool, pd.DataFrame], - bed2: Union[str, BedTool, pd.DataFrame], - intersection_name: str = "count", - intersection_method: str = "count", - intersection_min_frac: float = 1e-9, - invalid_bed_action: Literal["ignore", "error"] = "error", - dtype: str = "str", - ): - """ - - Args: - bed1 (Union[str, BedTool, pd.DataFrame]): Bed file 1 for intersection - bed2 (Union[str, BedTool, pd.DataFrame]): Bed file 2 for intersection - intersection_name (str, optional): Name for intersection. Defaults to "count". - intersection_method (str, optional): Method for intersection. Choose from (get|count). Defaults to "count". - intersection_min_frac (float, optional): Minimum overlap fraction required. Defaults to 1e-9. - n_cores (int, optional): Number of cores to use for intersection. Defaults to 1. - invalid_bed_action (str, optional): If set to 'ignore' will return default N/A value or raise Exception. Defaults to "error". - """ - - # Format input bed files - self.bed1 = bed1 - self.bed2 = bed2 - - self.bed1_valid = is_valid_bed(bed1) - self.bed2_valid = is_valid_bed(bed2) - - # Default methods and NA values - self._methods = { - "get": self._intersections_get, - "count": self._intersections_count, - } - - self._methods_na = {"get": ".", "count": 0} - - # Get intersection properties - self.intersection_name = intersection_name - self._intersection_method = self._methods.get(intersection_method) - self._intersection_na = self._methods_na[intersection_method] - self.min_frac = intersection_min_frac - - # Other options - self.invalid_bed_action = invalid_bed_action - self.dtype = dtype - - def _extract_intersection_result( - self, - bt: BedTool, - intersection_id: str, - dtype: Union[str, pd.CategoricalDtype] = "str", - ): - try: - columns = pd.read_csv(bt.fn, nrows=5, sep="\t", header=None).columns - columns_needed = [columns[3], columns[-1]] - - return pd.read_csv( - bt.fn, - sep="\t", - header=None, - usecols=columns_needed, - names=["parent_name", intersection_id], - index_col="parent_name", - dtype={"parent_name": str, intersection_id: dtype}, - na_values=".", - ).squeeze("columns") - except pd.errors.EmptyDataError: - return self._format_invalid_intersection(self.bed1, dtype=dtype) - - def _intersections_count(self, a, b): - return self._extract_intersection_result( - a.intersect(b, c=True, f=self.min_frac), - intersection_id=self.intersection_name, - dtype=self.dtype, - ) - - def _intersections_get(self, a, b): - if self.dtype == "category": - b_names = b.to_dataframe().iloc[:, -1].unique() - dtype = pd.CategoricalDtype(b_names) - else: - dtype = self.dtype - - return self._extract_intersection_result( - a.intersect(b, wb=True, f=self.min_frac), - intersection_id=self.intersection_name, - dtype=dtype, - ) - - def _format_invalid_intersection(self, bed, dtype=float): - return ( - 
convert_bed_to_dataframe(bed) - .assign(**{self.intersection_name: np.nan}) - .set_index("name") - .loc[:, self.intersection_name] - .astype(dtype) - ) - - def get_intersection(self, sort_index: bool = True) -> pd.Series: - """Intersects the two bed files and returns a pd.Series.""" - - logging.info(f"Performing {self.intersection_name} intersection.") - - if all([self.bed1_valid, self.bed2_valid]): - - a = convert_to_bedtool(self.bed1) - b = convert_to_bedtool(self.bed2) - intersection = self._intersection_method(a, b) - - elif self.invalid_bed_action == "ignore": - intersection = self._format_invalid_intersection(self.bed1) - - else: - raise ValueError( - f"Slices valid: {self.bed1_valid}\n {self.intersection_name} .bed file valid: {self.bed2_valid}" - ) - - return intersection.sort_index() if sort_index else intersection diff --git a/capcruncher/tools/deduplicate.py b/capcruncher/tools/deduplicate.py deleted file mode 100644 index 2b8b28e9..00000000 --- a/capcruncher/tools/deduplicate.py +++ /dev/null @@ -1,471 +0,0 @@ -import functools -import logging -import multiprocessing -import os -import pickle -import queue -from collections import namedtuple -from multiprocessing import Process, Queue, SimpleQueue -from typing import Iterable, List, Literal, NamedTuple, Tuple, Union - -import pandas as pd -import ujson -import xxhash -from capcruncher.utils import get_file_type, hash_column, save_dict -from xopen import xopen - - -class ReadDeduplicationParserProcess(Process): - """ - Process subclass for parsing fastq file(s) into a hashed {id:sequence} json format. - - Attributes: - inq: Input read queue - outq: Output read queue (Not currently used) - hash_seed: Seed for xxhash64 algorithm to ensure consistency - save_hash_dict_path: Path to save hashed dictionary - """ - - def __init__( - self, - inq: multiprocessing.Queue, - hash_seed: int = 42, - output_path: os.PathLike = "parsed.json", - ): - """ - Args: - inq (multiprocessing.SimpleQueue): Input queue for fastq reads. - outq (multiprocessing.SimpleQueue): Output queue for processed reads. - Only used if part of a pipeline of reader -> parser -> writer - hash_seed (int, optional): Seed to use for hashing. Defaults to 42. - save_hashed_dict_path (os.PathLike, optional): Path to save hashed reads in json format. Defaults to "parsed.json". - """ - - self.inq = inq - self.hash_seed = hash_seed - self.output_path = output_path - - super(ReadDeduplicationParserProcess, self).__init__() - - def run(self): - """Processes fastq reads from multiple files and generates a hashed json dictionary. - - Dictionary is hashed and in the format {(read 1 name + read 2 name): (sequence 1 + sequence 2)}. - - Output path is specified by save_hashed_dict_path. 
- - """ - - hash_seed = self.hash_seed - hash_function = functools.partial(xxhash.xxh64_intdigest, seed=hash_seed) - records = dict() - - while True: - - try: - reads = self.inq.get(block=True, timeout=0.01) - - if reads: - - for read_set in reads: - hash_sequence = hash_function( - "".join([r.sequence for r in read_set]) - ) - hash_id = hash_function("".join([r.name for r in read_set])) - records[hash_id] = hash_sequence - - else: - break - - except queue.Empty: - continue - - output_format = get_file_type(self.output_path) - save_dict(records, self.output_path, output_format) - - -RemovalStatistics = namedtuple( - "RemovalStatistics", ["reads_total", "reads_unique", "reads_removed"] -) - - -class ReadDuplicateRemovalProcess(Process): - """ - Process subclass for parsing fastq file(s) and removing identified duplicates. - - Attributes: - inq: Input read queue - outq: Output queue for deduplicated reads. - duplicated_ids: Concatenated read ids to remove from input fastq files. - statq: Output queue for statistics. - reads_total: Number of fastq reads processed. - reads_unique: Number of non-duplicated reads output. - hash_seed: Seed for xxhash algorithm. MUST be the same as used by ReadDuplicationParserProcess. - """ - - def __init__( - self, - inq: multiprocessing.Queue, - outq: multiprocessing.Queue, - stats_tx: multiprocessing.Pipe, - duplicated_ids: set, - hash_seed: int = 42, - hash_read_name: bool = True, - ): - """ - Args: - inq (multiprocessing.SimpleQueue): Input queue for reads to be deduplicated. - outq (multiprocessing.SimpleQueue): Output queue for deduplicated reads. - duplicated_ids (set): Hashed read ids to be removed if encountered. - statq (multiprocessing.Queue, optional): Output queue for statistics. Defaults to None. - hash_seed (int, optional): Seed for xxhash algorithm. Defaults to 42. - """ - - self.inq = inq - self.outq = outq - self.hash_seed = hash_seed - self.duplicated_ids = duplicated_ids - - # Misc - self.hash_read_name = hash_read_name - - # Stats - self.stats_tx = stats_tx - self.reads_total = 0 - self.reads_unique = 0 - - super(ReadDuplicateRemovalProcess, self).__init__() - - def run(self): - - """Performs read deduplication based on sequence. - - Unique reads are placed on outq and deduplication statistics are placed on statq. 
- - """ - - hash_seed = self.hash_seed - hash_read_name = self.hash_read_name - hash_function = functools.partial(xxhash.xxh64_intdigest, seed=hash_seed) - duplicated_ids = self.duplicated_ids - reads_unique = list() - - while True: - - try: - reads = self.inq.get(block=True, timeout=0.01) - - if reads: - for read_glob in reads: - - hash_id = hash_function("".join([r.name for r in read_glob])) - - if not hash_id in duplicated_ids: - if hash_read_name: - for r in read_glob: - r.name = str(hash_function(r.name)) - - reads_unique.append(read_glob) - - self.reads_total += len(reads) - self.reads_unique += len(reads_unique) - self.outq.put(reads_unique.copy()) - reads_unique.clear() - - else: - break - - except queue.Empty: - continue - - stats = RemovalStatistics( - self.reads_total, self.reads_unique, self.reads_total - self.reads_unique - ) - self.stats_tx.send(stats) - - -def extract_fragment_coordinates(df): - - try: - coords = df["coordinates"].str.extract( - r"^chr(?P.*?):(?P\d+).*\|chr(?P.*?):\d+-(?P\d+)" - ) - coords = coords["chrom1"].str.cat(coords.iloc[:, 1:]) - coords.name = "coordinates_pe" - except AttributeError as e: # Might not be any data - coords = pd.Series(data=[], name="coordinates_pe", dtype="object") - - return coords - - -def identify_coordinate_duplicates_from_tsv( - fragments: pd.DataFrame, read_type: Literal["flashed", "pe"], buffer: float = 1e6 -): - - df_fragments = pd.read_csv( - fragments, - sep="\t", - chunksize=buffer, - usecols=["parent_read", "coordinates"], - ) - - unique_coordinates = set() - duplicated_fragments = set() - - for ii, df in enumerate(fragments): - - # Shuffle to stop fragments at the end of the sample always being removed - df = df.sample(frac=1) - - if read_type == "flashed": - - lst_id = hash_column(df["parent_read"]) - lst_coords = hash_column(df["coordinates"]) - - else: - # Extract chrom1 + start1 + chrom(last entry) + end(last entry) - coords_df = df["coordinates"].str.extract( - r"^chr(?P.*?):(?P\d+).*\|chr(?P.*?):\d+-(?P\d+)" - ) - - lst_id = hash_column(df["parent_read"]) - - # chrom1+start1+chrom-1+end-1(hashed) - lst_coords = hash_column(coords_df["chrom1"].str.cat(coords_df.iloc[:, 1:])) - - for id, coord in zip(lst_id, lst_coords): - if not coord in unique_coordinates: - unique_coordinates.add(coord) - else: - duplicated_fragments.add(id) - - return duplicated_fragments - - -def identify_coordinate_duplicates_from_hdf5( - fragments: list, read_type: Literal["flashed", "pe"] -): - - import dask.dataframe as dd - - try: - df_fragments_coords = dd.read_hdf( - fragments, key=f"/fragments", columns=["id", "coordinates"] - ) - - if read_type == "flashed": - - duplicated_ids = ( - df_fragments_coords.map_partitions( - lambda df: df.assign(coordinates=hash_column(df["coordinates"])) - ) - .shuffle(on="coordinates") - .map_partitions(lambda df: df[df.duplicated(subset="coordinates")])[ - "id" - ] - ) - elif read_type == "pe": - - duplicated_ids = ( - df_fragments_coords.map_partitions( - lambda df: df.join(extract_fragment_coordinates(df))[ - ["id", "coordinates_pe"] - ] - # .assign(coordinates_pe=lambda df: hash_column(df["coordinates_pe"])) - ) - .shuffle(on="coordinates_pe") - .map_partitions(lambda df: df[df.duplicated(subset="coordinates_pe")])[ - "id" - ] - ) - - except KeyError as e: - logging.warn("{e}") - duplicated_ids = dd.from_pandas(pd.Series(data=[], name="id"), npartitions=1) - - return duplicated_ids - - -def identify_coordinate_duplicates_from_parquet( - fragments: list, read_type: Literal["flashed", "pe"] -): - - import 
dask.dataframe as dd - - df_fragments_coords = dd.read_parquet( - fragments, - columns=["id", "coordinates"], - chunksize="100MB", - aggregate_files=True, - engine="pyarrow", - ) - - if read_type == "flashed": - - ids_duplicated = df_fragments_coords.shuffle(on="coordinates").map_partitions( - lambda df: df[df.duplicated(subset="coordinates")] - )["id"] - - elif read_type == "pe": - - ids_duplicated = ( - df_fragments_coords.map_partitions( - lambda df: df.join(extract_fragment_coordinates(df))[ - ["id", "coordinates_pe"] - ] - ) - .shuffle(on="coordinates_pe") - .map_partitions(lambda df: df[df.duplicated(subset="coordinates_pe")])["id"] - ) - - return ids_duplicated - - -def remove_duplicates_from_tsv( - slices: os.PathLike, output: os.PathLike, duplicated_ids: os.PathLike, buffer=1e6 -): - - # Remove output if it exist as will need to append to file. - if os.path.exists(output): - os.unlink(output) - - df_slices = pd.read_csv(slices, sep="\t", chunksize=buffer) - ids_duplicated = read_duplicated_ids(duplicated_ids) - - n_reads_total = 0 - n_reads_unique = 0 - - # Iterate slices in chunks - for ii, df in enumerate(df_slices): - - logging.info(f"Processed {(ii + 1) * buffer} slices") - - n_reads_total += df["parent_read"].nunique() - - # Hash the parent_read column and remove any duplicated ids. - df = ( - df.assign(parent_read_hashed=lambda df: hash_column(df["parent_read"])) - .set_index("parent_read_hashed") - .loc[lambda df: ~df.index.isin(ids_duplicated)] - ) - - n_reads_unique += df["parent_read"].nunique() - - # Append to file. - df.reset_index(drop=True).to_csv( - output, sep="\t", index=None, header=True if ii < 1 else False, mode="a" - ) - - return (n_reads_total, n_reads_unique) - - -def remove_duplicates_from_hdf5( - slices: Iterable, duplicated_ids: pd.Series, output: os.PathLike -) -> Tuple[int, int]: - - import dask.dataframe as dd - - n_slices_total = 0 - - try: - # Need to get total number of slices - for slice_file in slices: - with pd.HDFStore(slice_file, "r") as store: - n_slices_total += store.get_storer("slices").nrows - - ddf = dd.read_hdf(slices, "slices").map_partitions( - lambda df: df.loc[~(df["parent_id"].isin(duplicated_ids))] - ) - - ddf.to_hdf( - output, - key="slices", - format="table", - data_columns=["viewpoint"], - mode="w", - min_itemsize={ - "slice_name": 75, - "parent_read": 75, - "coordinates": 75, - "chrom": 25, - }, - complib="blosc", - complevel=2, - ) - - # Need to get final number of slices - with pd.HDFStore(output, "r") as store: - n_slices_unique = store.get_storer(f"slices").nrows - - except KeyError as e: - # Obviously missing any data (due to filtering) - n_slices_total = 0 - n_slices_unique = 0 - return (n_slices_total, n_slices_unique) - - -def remove_duplicates_from_parquet( - slices: Iterable, duplicated_ids: pd.Series, output: os.PathLike -) -> Tuple[int, int]: - - import dask.dataframe as dd - import pyarrow.parquet as pq - import pyarrow.dataset as ds - import pyarrow as pa - - if not duplicated_ids.empty: - duplicates = set(duplicated_ids.values) - else: - duplicates = set() - - n_reads_total = ( - dd.read_parquet(slices, columns=["parent_id"], engine="pyarrow") - ["parent_id"] - .nunique() - .compute() - ) - - logging.info("Loading and filtering slices") - - # Load and filter data - slice_dataset = ds.dataset( - list(slices), - format="parquet", - ) - - slice_dataset_scanner = slice_dataset.scanner(filter=~ds.field('parent_id').isin(duplicates)) - - - logging.info("Writing unique slices") - ds.write_dataset(slice_dataset_scanner, 
output, format="parquet", partitioning_flavor="hive") - - n_reads_unique = ( - dd.read_parquet(output, columns=["parent_id"], engine="pyarrow") - ["parent_id"] - .nunique() - .compute() - ) - return (n_reads_total, n_reads_unique) - - -def read_duplicated_ids(path: os.PathLike): - - from xopen import xopen - - file_type = get_file_type(path) - - if file_type == "json": - with xopen.xopen(path, "r") as r: - ids_duplicated = {int(x) for x in ujson.load(r)} - - elif file_type == "hdf5": - - try: - ids_duplicated = pd.read_hdf(path, key="/duplicated_ids") - except KeyError: - ids_duplicated = pd.Series(data=["NO_DATA"], name="/duplicated_ids") - - elif file_type == "pickle": - ids_duplicated = pd.read_pickle(path) - - return ids_duplicated diff --git a/capcruncher/tools/statistics.py b/capcruncher/tools/statistics.py deleted file mode 100644 index f6123ccf..00000000 --- a/capcruncher/tools/statistics.py +++ /dev/null @@ -1,96 +0,0 @@ -import pandas as pd -import re -from collections import namedtuple -from capcruncher.utils import read_dataframes - - -class DeduplicationStatistics(): - def __init__(self, - sample: str, - read_type: str = 'pe', - reads_total: int=0, - reads_unique: int=0): - - self.sample = sample - self.read_type = read_type - self.reads_total = reads_total - self.reads_unique = reads_unique - self.reads_removed = reads_total - reads_unique - - @property - def df(self): - df = pd.DataFrame() - df['stat'] = [self.reads_total, self.reads_unique, self.reads_removed] - df['stat_type'] = ['reads_total', 'reads_unique', 'reads_removed'] - df['read_type'] = self.read_type - df['read_number'] = 0 - df['stage'] = 'deduplication' - df['sample'] = self.sample - return df - -DigestionStats = namedtuple('DigestionStats', ['read_type', 'read_number', 'unfiltered', 'filtered']) - -def collate_histogram_data(fnames): - return ( - pd.concat(read_dataframes(fnames)) - .groupby(["sample", "read_type", "read_number", "filtered", "n_slices"]) - .sum() - .reset_index() - .sort_values(["sample", "read_type", "n_slices"]) - ) - -def collate_read_data(fnames): - - df = (pd.concat(read_dataframes(fnames)) - .query('(read_number == 0) or (read_number == 1)') - .groupby(["sample", "stage", "read_type", "read_number", "stat_type"])["stat"] - .sum() - .reset_index() - ) - - return df.sort_values(["sample", "stat"], ascending=[True, False]) - -def collate_slice_data(fnames): - - df = pd.concat(read_dataframes(fnames)) - aggregations = {col: 'sum' if not 'unique' in col else 'max' for col in df.columns - if not col in ['sample', 'stage', 'read_type']} - - return (df.groupby(["sample", "stage", "read_type"]) - .agg(aggregations) - .reset_index() - .sort_values(["sample", "unique_slices"], ascending=[True, False]) - ) - -def collate_cis_trans_data(fnames): - return (pd.concat(read_dataframes(fnames)) - .groupby(['sample', 'viewpoint', 'read_type', 'cis/trans']) - .sum() - .reset_index() - .sort_values(["sample", "read_type", 'count'], ascending=[True, True, False])) - -def extract_trimming_stats(fn): - stat_regexes = { - 'reads_total': re.compile(r'^Total reads processed:\s+([0-9,]+)$'), - 'adapters_removed': re.compile(r'Reads with adapters:\s+([0-9,]+).*'), - 'reads_after_filtering': re.compile(r'Reads written \(passing filters\):\s+([0-9,]+).*'),} - - sample_re_match = re.match(r'.*/(.*)_part\d+_(1|2).*', fn) - - stats = {} - stats['sample'] = sample_re_match.group(1) - stats['read_number'] = sample_re_match.group(2) - stats['read_type'] = 'pe' - - - with open(fn) as r: - for line in r: - for stat_name, 
pattern in stat_regexes.items(): - regex_match = pattern.match(line) - if regex_match: - stats[stat_name] = int(regex_match.group(1).replace(',', '')) - - stats['reads_filtered'] = stats['reads_total'] - stats['reads_after_filtering'] - - return stats - \ No newline at end of file diff --git a/capcruncher/tools/storage.py b/capcruncher/tools/storage.py deleted file mode 100644 index 95348962..00000000 --- a/capcruncher/tools/storage.py +++ /dev/null @@ -1,584 +0,0 @@ -import os -import sys -from click.types import STRING -import pandas as pd -import numpy as np -from pybedtools import BedTool -import cooler -import h5py -from joblib import Parallel, delayed -from typing import Union, Literal -from natsort import natsorted, natsort_key -from capcruncher.utils import split_intervals_on_chrom, intersect_bins -import itertools -import logging - - -def get_viewpoint_coords(viewpoint_file: str, viewpoint_name: str): - df_viewpoints = BedTool(viewpoint_file).to_dataframe() - df_viewpoints = df_viewpoints.query(f'name == "{viewpoint_name}"') - - try: - viewpoints = [row for index, row in df_viewpoints.iterrows()] - except IndexError: - logging.error("Oligo name cannot be found within viewpoints") - viewpoints = None - - return viewpoints - - -def get_viewpoint_bins(bins, viewpoint_chrom, viewpoint_start, viewpoint_end): - - return [ - int(b) - for b in bins.query( - f'chrom == "{viewpoint_chrom}" and start >= {viewpoint_start} and end <= {viewpoint_end}' - )["name"] - ] - -def create_cooler_cc( - output_prefix: str, - bins: pd.DataFrame, - pixels: pd.DataFrame, - viewpoint_name: str, - viewpoint_path: os.PathLike, - viewpoint_bins: Union[int, list] = None, - suffix=None, - **cooler_kwargs, -) -> os.PathLike: - """ - Creates a cooler hdf5 file or cooler formatted group within a hdf5 file. - - Args: - output_prefix (str): Output path for hdf5 file. If this already exists, will append a new group to the file. - bins (pd.DataFrame): DataFrame containing the genomic coordinates of all bins in the pixels table. - pixels (pd.DataFrame): DataFrame with columns: bin1_id, bin2_id, count. - viewpoint_name (str): Name of viewpoint to store. - viewpoint_path (os.PathLike): Path to viewpoints used for the analysis. - viewpoint_bins (Union[int, list], optional): Bins containing viewpoint. Can be determined from viewpoint_path. Defaults to None. - suffix (str, optional): Suffix to append before the .hdf5 file extension. Defaults to None. - - Raises: - ValueError: Viewpoint name must exactly match the a supplied viewpoint. - - Returns: - os.PathLike: Path of cooler hdf5 file. - """ - - # Gets viewpoint coordinates - viewpoint_coords = get_viewpoint_coords(viewpoint_path, viewpoint_name) - - # Make sure viewpoint coordinates are returned correctly, if not, error. - if viewpoint_coords is None: - raise ValueError(f"Incorrect viewpoint name specified: {viewpoint_name}.") - - # If viewpoint bins not provided get them using the coordinates. - if not viewpoint_bins: - viewpoint_bins = list( - itertools.chain.from_iterable( - [ - get_viewpoint_bins(bins, c["chrom"], c["start"], c["end"]) - for c in viewpoint_coords - ] - ) - ) - - # Need to store bins as a list so make sure its not just a single int. 
- elif isinstance(viewpoint_bins, int): - viewpoint_bins = [ - int(viewpoint_bins), - ] - - # The cooler.create_cooler function will not accept np.arrays so must convert to python list - elif isinstance(viewpoint_bins, (np.array, pd.Series)): - viewpoint_bins = [int(x) for x in viewpoint_bins] - - # Get the number of cis interactions, required for normalisation. - bins_cis = bins.loc[ - lambda df: df["chrom"].isin([c["chrom"] for c in viewpoint_coords]) - ]["name"].loc[lambda ser: ~ser.isin(viewpoint_bins)] - - pixels_cis = pixels.loc[ - lambda df: (df["bin1_id"].isin(bins_cis)) | (df["bin2_id"].isin(bins_cis)) - ] - n_cis_interactions = pixels_cis["count"].sum() - - # Metadata for cooler file. - metadata = { - "viewpoint_bins": viewpoint_bins, - "viewpoint_name": viewpoint_name, - "viewpoint_chrom": [c["chrom"] for c in viewpoint_coords], - "viewpoint_coords": [ - f'{c["chrom"]}:{c["start"]}-{c["end"]}' for c in viewpoint_coords - ], - "n_cis_interactions": int(n_cis_interactions), - } - - if os.path.exists( - output_prefix - ): # Will append to a prexisting file if one is supplied - append_to_file = True - cooler_fn = f"{output_prefix}::/{viewpoint_name}" - else: - append_to_file = False - cooler_fn = f"{output_prefix.replace('.hdf5', '')}{'.' + suffix if suffix else ''}.hdf5::/{viewpoint_name}" - - cooler.create_cooler( - cooler_fn, - bins=bins, - pixels=pixels, - metadata=metadata, - mode="w" if not append_to_file else "a", - **cooler_kwargs, - ) - - return cooler_fn - - -class GenomicBinner: - """ - Provides a conversion table for converting two sets of bins. - - Attributes: - chromsizes (pd.Series): Series indexed by chromosome name containg chromosome sizes in bp - fragments (pd.DataFrame): DataFrame containing bins to convert to equal genomic intervals - binsize (int): Genomic bin size - min_overlap (float): Minimum degree of intersection to define an overlap. - n_cores (int): Number of cores to use for bin intersection. - - """ - - def __init__( - self, - chromsizes: Union[os.PathLike, pd.DataFrame, pd.Series], - fragments: pd.DataFrame, - binsize: int = 5000, - n_cores: int = 8, - method: Literal["midpoint", "overlap"] = "midpoint", - min_overlap: float = 0.2, - ): - """ - Args: - chromsizes (Union[os.PathLike, pd.DataFrame, pd.Series]): Series indexed by chromosome name containg chromosome sizes in bp - fragments (pd.DataFrame): DataFrame containing bins to convert to equal genomic intervals - binsize (int, optional): Genomic window size. Defaults to 5000. - n_cores (int, optional): Number of cores to use for bin intersection.. Defaults to 8. - min_overlap (float, optional): Minimum degree of intersection to define an overlap.Only used for "overlap" method. Defaults to 0.2. 
- """ - - if not method in ["midpoint", "overlap"]: - raise ValueError('Method should be either "midpoint" or "overlap"') - - self.method = method - self.chromsizes = self._format_chromsizes(chromsizes) - self.fragments = self._format_fragments(fragments) - self.binsize = binsize - self.min_overlap = min_overlap - - self.bins_genomic = self._get_bins() - self._bin_conversion_table = None - self.n_cores = n_cores - - def _get_bins(self): - return ( - cooler.util.make_bintable(chromsizes=self.chromsizes, binsize=self.binsize) - .reset_index() - .rename(columns={"index": "name"})[["chrom", "start", "end", "name"]] - .assign( - start=lambda df: df["start"].astype(int), - end=lambda df: df["end"].astype(int), - ) - ) - - def _format_chromsizes(self, chromsizes): - - _chromsizes = pd.Series(dtype=np.int64) - if isinstance(chromsizes, str): - _chromsizes = pd.read_csv( - chromsizes, - sep="\t", - header=None, - names=["chrom", "size"], - usecols=[0, 1], - index_col=0, - )["size"].sort_index(key=natsort_key) - - elif isinstance(chromsizes, pd.DataFrame): - if chromsizes.index.astype(str).str.contains("^chr.*"): - _chromsizes = chromsizes.iloc[:, 0] - - elif isinstance(chromsizes, pd.Series): - _chromsizes = chromsizes - - if not _chromsizes.empty: - return _chromsizes - else: - raise ValueError("Chromsizes supplied in the wrong format") - - def _natsort_dataframe(self, df, column): - - df_by_key = {k: df for k, df in df.groupby(column)} - - _df = pd.DataFrame() - for k in natsorted(df_by_key): - if _df is not None: - _df = pd.concat([_df, df_by_key[k]]) - else: - _df = df_by_key[k] - - return _df - - def _format_fragments(self, fragments): - - # Read the fragments - if isinstance(fragments, str): - _fragments = pd.read_csv( - fragments, - sep="\t", - index_col=0, - header=None, - names=["chrom", "start", "end", "name"], - ) - elif isinstance(fragments, pd.DataFrame): - _fragments = fragments - - if not "name" in _fragments.columns: - _fragments = _fragments.reset_index().rename(columns={"index": "name"})[ - ["chrom", "start", "end", "name"] - ] - - # Adjust fragments based on method - if self.method == "midpoint": - lengths = _fragments["end"] - fragments["start"] - midpoint = _fragments["start"] + (lengths // 2) - - _fragments["start"] = midpoint - _fragments["end"] = midpoint + 1 - - return self._natsort_dataframe(_fragments, "chrom") - - def _get_bin_conversion_table(self) -> pd.DataFrame: - - # Split bins by chromosome - bins_genomic_by_chrom = split_intervals_on_chrom(self.bins_genomic) - bins_fragments_by_chrom = split_intervals_on_chrom(self.fragments) - - # Identify shared chromosomes (all should be represented) - shared_chroms = set(bins_fragments_by_chrom) & set(bins_genomic_by_chrom) - - # Perform intersection of bins by chromosome - bins_intersections = Parallel(n_jobs=self.n_cores)( - delayed(intersect_bins)( - bins_fragments_by_chrom[chrom], - bins_genomic_by_chrom[chrom], - loj=True, - sorted=True, - f=self.min_overlap if self.method == "overlap" else 1e-9, - ) - for chrom in natsorted(shared_chroms) - ) - - # Concatenate intersected dataframes and replace labels for clarity - df_bins_intersections = pd.concat(bins_intersections, ignore_index=True).rename( - columns=lambda c: c.replace("_1", "_fragment").replace("_2", "_bin") - ) - - # # Calculate overlap fraction - # df_bins_intersections["overlap_fraction"] = df_bins_intersections["overlap"] / ( - # df_bins_intersections["end_fragment"] - # - df_bins_intersections["start_fragment"] - # ) - - return df_bins_intersections - - 
@property - def bins(self) -> pd.DataFrame: - """ - Equal genomic bins. - - Returns: - pd.DataFrame: DataFrame in bed format. - """ - return self.bins_genomic - - @property - def bin_conversion_table(self) -> pd.DataFrame: - """ - - Returns: - pd.DataFrame: Conversion table containing coordinates and ids of intersecting bins. - """ - - if self._bin_conversion_table is not None: - return self._bin_conversion_table - else: - self._bin_conversion_table = self._get_bin_conversion_table() - return self._bin_conversion_table - - -class CoolerBinner: - """Bins a cooler file into equal genomic intervals. - - Attributes: - cooler: (cooler.Cooler): Cooler instance to bin. - binner (capcruncher.storeage.GenomicBinner): Binner class to generate bin conversion tables - binsize (int): Genomic bin size - scale_factor (int): Scaling factor for normalising interaction counts. - n_cis_interactions (int): Number of cis interactions with the viewpoint bins. - n_cores (int): Number of cores to use for binning. - - """ - - def __init__( - self, - cooler_group: os.PathLike, - binsize: int = None, - n_cores: int = 8, - binner: GenomicBinner = None, - ): - """ - Args: - cooler_group (os.PathLike): Path to cooler to bin. A cooler group can be specified with FN_PATH.hdf5::/PATH_TO_GROUP. - binsize (int, optional): Genomic binsize. Defaults to None. - n_cores (int, optional): Number of cores to use for binning. Defaults to 8. - binner (capcruncher.storage.GenomicBinner, optional): Binner object to produce conversion tables. - Can be initialised and provided so that binning is not repeated. - Defaults to None. - """ - - if isinstance(cooler_group, str): - self.cooler = cooler.Cooler(cooler_group) - elif isinstance(cooler_group, cooler.Cooler): - self.cooler = cooler_group - else: - raise ValueError("cooler_group must be a path to a cooler file or a cooler object") - - self.bins_fragments = self.cooler.bins()[:] - - self.binner = binner if binner is not None else GenomicBinner( - chromsizes=self.cooler.chromsizes, - fragments=self.bins_fragments, - binsize=binsize, - ) - - self.binsize = self.binner.binsize - self.n_cores = n_cores - - self._bin_conversion_table = None - self._pixel_conversion_table = None - self._pixels = None - - self.n_cis_interactions = self.cooler.info["metadata"]["n_cis_interactions"] - - def _get_pixel_conversion_table(self): - - pixels = self.cooler.pixels()[:] - pixels_conv = ( - pixels.merge( - self.bin_conversion_table[["name_bin", "name_fragment"]].add_suffix( - "_1" - ), - left_on="bin1_id", - right_on="name_fragment_1", - ) - .merge( - self.bin_conversion_table[["name_bin", "name_fragment"]].add_suffix( - "_2" - ), - left_on="bin2_id", - right_on="name_fragment_2", - ) - .drop(columns=["name_fragment_1", "name_fragment_2"]) - ) - - return pixels_conv - - def _get_pixels(self): - - df_pixels = ( - self.pixel_conversion_table.groupby(["name_bin_1", "name_bin_2"])["count"] - .sum() - .to_frame() - .reset_index() - ) - - df_pixels.columns = ["bin1_id", "bin2_id", "count"] - - # Swap bins over if not correct - df_pixels["bin1_id_corrected"] = np.where( - df_pixels["bin1_id"] > df_pixels["bin2_id"], - df_pixels["bin2_id"], - df_pixels["bin1_id"], - ) - df_pixels["bin2_id_corrected"] = np.where( - df_pixels["bin1_id"] > df_pixels["bin2_id"], - df_pixels["bin1_id"], - df_pixels["bin2_id"], - ) - # df_pixels = df_pixels.loc[lambda df: df["bin1_id"] != df["bin2_id"]] - df_pixels = df_pixels.loc[ - :, ["bin1_id_corrected", "bin2_id_corrected", "count"] - ].rename(columns=lambda col: 
col.replace("_corrected", "")) - - return df_pixels - - @property - def bins(self) -> pd.DataFrame: - """ - Returns: - pd.DataFrame: Even genomic bins of a specified binsize. - """ - return self.binner.bins - - @property - def bin_conversion_table(self) -> pd.DataFrame: - """ - Returns: - pd.DataFrame: Conversion table containing coordinates and ids of intersecting bins. - """ - return self.binner.bin_conversion_table - - @property - def viewpoint_bins(self): - """ - Returns: - pd.DataFrame: viewpoint bins converted to the new even genomic bin format. - """ - viewpoint_frags = self.cooler.info["metadata"]["viewpoint_bins"] - return self.bin_conversion_table.loc[ - lambda df: df["name_fragment"].isin(viewpoint_frags) - ]["name_bin"].values - - @property - def pixel_conversion_table(self): - """ - Returns: - pd.DataFrame: Conversion table to convert old binning scheme to the new. - """ - if self._pixel_conversion_table is not None: - return self._pixel_conversion_table - else: - self._pixel_conversion_table = self._get_pixel_conversion_table() - return self._pixel_conversion_table - - @property - def pixels(self): - """ - Returns: - pd.DataFrame: Pixels (interaction counts) converted to the new binning scheme. - """ - if self._pixels is not None: - return self._pixels - else: - self._pixels = self._get_pixels() - return self._pixels - - def normalise( - self, - n_fragment_correction: bool = True, - n_interaction_correction: bool = True, - scale_factor: int = 1e6, - ): - """Normalises pixels (interactions). - - Normalises pixels according to the number of restriction fragments per bin and the number of cis interactions. - If both normalisation options are selected, will also provide a dual normalised column. - - Args: - n_fragment_correction (bool, optional): Updates the pixels DataFrame with counts corrected for the number of - restriction fragments per bin. Defaults to True. - n_interaction_correction (bool, optional): Updates the pixels DataFrame with counts corrected for the number of - cis interactions. Defaults to True. - scale_factor (int, optional): Scaling factor for n_interaction_correction. Defaults to 1e6. - """ - - if n_fragment_correction: - df_nrf = ( - self.bin_conversion_table.groupby("name_bin").size().to_frame("n_rf") - ) - pixels = self.pixels.merge( - df_nrf.add_prefix("bin1_"), left_on="bin1_id", right_index=True - ).merge(df_nrf.add_prefix("bin2_"), left_on="bin2_id", right_index=True) - - self.pixels["count_n_rf_norm"] = self.pixels["count"] / ( - pixels["bin1_n_rf"] * pixels["bin2_n_rf"] - ) - - if n_interaction_correction: - self.pixels["count_n_interactions_norm"] = ( - self.pixels["count"] / self.n_cis_interactions - ) * scale_factor - - if n_fragment_correction and n_interaction_correction: - - self.pixels["count_n_rf_n_interactions_norm"] = ( - self.pixels["count_n_rf_norm"] / self.n_cis_interactions - ) * scale_factor - - def to_cooler(self, store: os.PathLike): - - metadata = {**self.cooler.info["metadata"]} - metadata["viewpoint_bins"] = [int(x) for x in self.viewpoint_bins] - - cooler_fn = f"{store}::/{metadata['viewpoint_name']}/resolutions/{self.binsize}" - - cooler.create_cooler( - cooler_fn, - bins=self.bins, - pixels=self.pixels, - metadata=metadata, - mode="w" if not os.path.exists(store) else "a", - columns=self.pixels.columns[2:], - ) - - return cooler_fn - - -def link_common_cooler_tables(clr: os.PathLike): - """Reduces cooler storage space by linking "bins" table. 
- - All of the cooler "bins" tables containing the genomic coordinates of each bin - are identical for all cooler files of the same resoultion. As cooler.create_cooler - generates a new bins table for each cooler, this leads to a high degree of duplication. - - This function hard links the bins tables for a given resolution to reduce the degree of duplication. - - Args: - clr (os.PathLike): Path to cooler hdf5 produced by the merge command. - """ - - logging.info("Making links to common cooler tables to conserve disk space") - - with h5py.File(clr, "a") as f: - - # Get all viewpoints stored - viewpoints = sorted(list(f.keys())) - - # Get all resolutions stored - try: - resolutions = [res for res in f[viewpoints[0]]["resolutions"]] - except (KeyError, IndexError): - resolutions = None - - for viewpoint in viewpoints[1:]: - - try: - # Delete currenly stored bins group and replace with link to first viewpoint "bins" group - del f[viewpoint]["bins"] - f[viewpoint]["bins"] = f[viewpoints[0]]["bins"] - - # Delete chroms table and replace with link to the first "chroms" group - del f[viewpoint]["chroms"] - f[viewpoint]["chroms"] = f[viewpoints[0]]["chroms"] - except KeyError: - pass - - - # Repeat for resolutions i.e. binned coolers - if resolutions: - for resolution in resolutions: - del f[viewpoint]["resolutions"][resolution]["bins"] - f[viewpoint]["resolutions"][resolution]["bins"] = f[viewpoints[0]]["resolutions"][resolution]["bins"] - - del f[viewpoint]["resolutions"][resolution]["chroms"] - f[viewpoint]["resolutions"][resolution]["chroms"] = f[viewpoints[0]]["resolutions"][resolution]["chroms"] diff --git a/capcruncher/utils.py b/capcruncher/utils.py index a969e9a4..dae0fb4f 100644 --- a/capcruncher/utils.py +++ b/capcruncher/utils.py @@ -1,26 +1,26 @@ -import logging import os import pickle import re -import time -from datetime import timedelta from functools import wraps -from typing import Callable, Generator, Iterable, List, Literal, Tuple, Union -import numpy as np +from typing import Generator, Iterable, Tuple, Union, Callable import pandas as pd import pybedtools import ujson import xxhash from pybedtools import BedTool +import pyranges as pr +import pysam def read_dataframes(filenames: Iterable, **kwargs): + from loguru import logger + dframes = [] for fn in filenames: try: df = pd.read_csv(fn, **kwargs) except (pd.errors.EmptyDataError): - logging.warning(f"{fn} is empty") + logger.warning(f"{fn} is empty") if not df.empty: dframes.append(df) @@ -51,10 +51,7 @@ def is_on(param: str) -> bool: - 1 """ values = ["true", "t", "on", "yes", "y", "1"] - if str(param).lower() in values: - return True - else: - return False + return str(param).lower() in values def is_off(param: str): @@ -89,6 +86,7 @@ def get_human_readable_number_of_bp(bp: int) -> str: def is_valid_bed(bed: Union[str, BedTool], verbose=True) -> bool: + from loguru import logger """Returns true if bed file can be opened and has at least 3 columns""" try: @@ -99,15 +97,15 @@ def is_valid_bed(bed: Union[str, BedTool], verbose=True) -> bool: except Exception as e: if isinstance(e, FileNotFoundError): - logging.debug(f"Bed file not found") + logger.warning(f"Bed file: {bed} not found") elif isinstance(e, IndexError): - logging.debug( - f"Wrong number of fields detected check separator/ number of columns" + logger.warning( + "Wrong number of fields detected check separator/ number of columns" ) else: - logging.debug(f"Exception raised {e}") + logger.warning(f"Exception raised {e}") def bed_has_name(bed: Union[str, BedTool]) -> 
bool: @@ -129,45 +127,6 @@ def bed_has_duplicate_names(bed: Union[str, BedTool]) -> bool: return True -def get_re_site(recognition_site: str = None) -> str: - - """ - Obtains the recogniton sequence for a supplied restriction enzyme or correctly - formats a supplied recognition sequence. - - Args: - cut_sequence - DNA sequence to use for fasta digestion e.g. "GATC" - restriction_enzyme - Name of restriction enzyme e.g. DpnII (case insensitive) - - Returns: - recognition sequence e.g. "GATC" - - Raises: - ValueError: Error if restriction_enzyme is not in known enzymes - - """ - - known_enzymes = { - "dpnii": "GATC", - "mboi": "GATC", - "hindiii": "AAGCTT", - "ecori": "GAATTC", - "nlaiii": "CATG", - } - - if re.match(r"[GgAaTtCc]+", recognition_site): # matches a DNA sequence - cutsite = recognition_site.upper() # Just uppercase convert and return - - elif recognition_site.lower() in known_enzymes: - cutsite = known_enzymes[recognition_site.lower()] - - else: - logging.error("No restriction site or recognised enzyme provided") - raise ValueError("No restriction site or recognised enzyme provided") - - return cutsite - - def hash_column(col: Iterable, hash_type=64) -> list: """ Convinience function to perform hashing using xxhash on an iterable. @@ -275,6 +234,9 @@ def get_timing(task_name=None) -> Callable: """Decorator: Gets the time taken by the wrapped function """ + import time + from datetime import timedelta + from loguru import logger def wrapper(f): @wraps(f) @@ -284,7 +246,7 @@ def wrapped(*args, **kwargs): time_end = time.perf_counter() time_taken = timedelta(seconds=(time_end - time_start)) - logging.info(f"Completed {task_name} in {time_taken} (hh:mm:ss.ms)") + logger.info(f"Completed {task_name} in {time_taken} (hh:mm:ss.ms)") return result return wrapped @@ -316,6 +278,7 @@ def categorise_tracks(ser: pd.Series) -> list: mapping = { "raw": "Replicates", "normalised": "Replicates_Scaled", + "norm": "Replicates_Scaled", "summary": "Samples_Summarised", "subtraction": "Samples_Compared", } @@ -328,8 +291,76 @@ def categorise_tracks(ser: pd.Series) -> list: return categories -def convert_bed_to_dataframe(bed: Union[str, BedTool, pd.DataFrame]) -> pd.DataFrame: +def convert_bed_to_pr( + bed: Union[ + str, + pybedtools.BedTool, + pd.DataFrame, + pr.PyRanges, + "ray.ObjectRef", # noqa: F821 + ], + ignore_ray_objrefs=True, +) -> pr.PyRanges: + """Converts a bed file to a PyRanges object. + Args: + bed (Union[str, pybedtools.BedTool, pd.DataFrame, pr.PyRanges, ray.ObjectRef]): Bed file to convert. + ignore_ray_objrefs (bool, optional): If False, allows for ray object references to be used instead python objects. Defaults to True. + Returns: + pr.PyRanges: PyRanges object. 
+ """ + + from loguru import logger + import ray + + if isinstance(bed, str): + converted = pr.read_bed(bed) + + elif isinstance(bed, pybedtools.BedTool): + converted = ( + bed.to_dataframe() + .rename( + columns={ + "chrom": "Chromosome", + "start": "Start", + "end": "End", + "name": "Name", + } + ) + .pipe(pr.PyRanges) + ) + + elif isinstance(bed, pr.PyRanges): + converted = bed + + elif isinstance(bed, pd.DataFrame): + converted = bed.rename( + columns={ + "chrom": "Chromosome", + "start": "Start", + "end": "End", + "name": "Name", + } + ).pipe(pr.PyRanges) + + elif isinstance(bed, ray.ObjectRef): + + if ignore_ray_objrefs: + logger.warning("Assuming ObjectRef is a PyRanges") + converted = bed + else: + bed = ray.get(bed) + converted = convert_bed_to_pr(bed) + + return converted + + +def convert_bed_to_dataframe( + bed: Union[str, BedTool, pd.DataFrame, "ray.ObjectRef", pr.PyRanges], # noqa: F821 + ignore_ray_objrefs=False, +) -> pd.DataFrame: """Converts a bed like object (including paths to bed files) to a pd.DataFrame""" + from loguru import logger + import ray if isinstance(bed, str): bed_conv = BedTool(bed).to_dataframe() @@ -340,14 +371,41 @@ def convert_bed_to_dataframe(bed: Union[str, BedTool, pd.DataFrame]) -> pd.DataF elif isinstance(bed, pd.DataFrame): bed_conv = bed + elif isinstance(bed, pr.PyRanges): + bed_conv = bed.as_df() + + elif isinstance(bed, ray.ObjectRef): + if ignore_ray_objrefs: + logger.warning("Assuming ObjectRef is a PyRanges") + bed_conv = bed + else: + bed = ray.get(bed) + bed_conv = convert_bed_to_dataframe(bed) + return bed_conv +def is_tabix(file: str): + from loguru import logger + + _is_tabix = False + + try: + tbx = pysam.TabixFile(file) + _chroms = tbx.contigs + _is_tabix = True + + except (OSError) as e: + logger.warn(e) + + return _is_tabix + + def format_coordinates(coordinates: Union[str, os.PathLike]) -> BedTool: """Converts coordinates supplied in string format or a .bed file to a BedTool. Args: - coordinates (Union[str, os.PathLike]): Coordinates in the form chr:start-end or a path. + coordinates (Union[str, os.PathLike]): Coordinates in the form chr:start-end/path. Raises: ValueError: Inputs must be supplied in the correct format. @@ -386,7 +444,7 @@ def format_coordinates(coordinates: Union[str, os.PathLike]) -> BedTool: else: raise ValueError( - """Coordinates not provided in the correct format. Provide coordinates in the form chr[NUMBER]:[START]-[END] or a .bed file""" + """Provide coordinates in the form chr[NUMBER]:[START]-[END]/BED file""" ) return bt @@ -460,6 +518,7 @@ def get_file_type(fn: os.PathLike) -> str: Returns: str: File type """ + from loguru import logger file_types = { "hdf5": "hdf5", @@ -477,60 +536,10 @@ def get_file_type(fn: os.PathLike) -> str: try: return file_types[ext] except KeyError as e: - logging.debug(f"File extension {ext} is not supported") + logger.debug(f"File extension {ext} is not supported") raise e -def get_categories_from_hdf5_column( - path: os.PathLike, - key: str, - column: str, - null_value: Union[Literal["."], int, float] = ".", -) -> List[str]: - """Extracts all categories from pytables table column. - - Args: - path (os.PathLike): Path to hdf5 store - key (str): Table name - column (str): Column name from which to extract categories - null_value (Union[Literal[, optional): [description]. Defaults to ".". 
- - Returns: - List[str]: Category names - """ - - df_test = pd.read_hdf(path, key, start=0, stop=10) - - # If its a category get from the cat codes - if isinstance(df_test.dtypes[column], pd.CategoricalDtype): - return [ - cat - for cat in df_test[column].cat.categories.values - if not cat == null_value - ] - - # Try to extract from the metadata - try: - with pd.HDFStore(path, "r") as store: - # s = store.get_storer(key) - # values = getattr(s.attrs, column) - values = store[f"{key}_category_metadata"][column].unique() - return values - except AttributeError: - # Just determine from sampling all of the data (HIGHLY inefficient) - import dask.dataframe as dd - import dask.distributed - - with dask.distributed.Client(processes=True) as client: - values = [ - x - for x in dd.read_hdf(path, key, columns=column)[column] - .unique() - .compute() - ] - return values - - def get_cooler_uri(store: os.PathLike, viewpoint: str, resolution: Union[str, int]): cooler_fragment = r"(?P.*?).hdf5::/(?!.*/resolutions/)(?P.*?)$" diff --git a/capcruncher_pipeline_alpha_test.html b/capcruncher_pipeline_alpha_test.html new file mode 100644 index 00000000..41409780 --- /dev/null +++ b/capcruncher_pipeline_alpha_test.html @@ -0,0 +1,4007 @@ + + + + + + + + + + + +CapCruncher Pipeline Alpha Testing + + + + + + + + + + + + + + + + + + + +
CapCruncher Pipeline Alpha Testing

Author: Alastair Smith
Published: June 29, 2023


Current Status

This is still in alpha testing, so there are some bugs and missing features.
+
+

Known issues

  • BUG: "Out of jobs ready to be started, but not all files built yet. Please check https://github.com/snakemake/snakemake/issues/823 for more information" - the best solution is to just re-run the pipeline.
  • BUG: The pipeline will fail if there are no reads left after filtering. This is a known issue and will be fixed in the next release.
+
+

Installation - Alpha Testing

+

There are currently two main ways to run the new pipeline:

  1. Install into a conda environment and run the pipeline directly from it.
  2. Install into a very minimal conda environment and then run the pipeline using a Singularity container.
+
+

0. Prerequisites

+
+

0.1 Download the GitHub repo development branch

+
git clone https://github.com/sims-lab/CapCruncher.git
+cd CapCruncher
+git checkout develop
+
+
+ +
+

B. Minimal conda environment

+
mamba create -n cc pip "python==3.10"
+conda activate cc
+
+
+

Install CapCruncher

+

Note: Whilst not essential, I would install the stats, plotting and experimental dependencies as well as the core package. This is what I do for my own testing.

+
pip install .[stats,plotting,experimental]
+
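To check that the console script installed correctly, listing the available subcommands is a quick sanity check (assuming the pip install above completed without errors):

capcruncher --help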
+
+

(Optional) Set-up a Snakemake profile

+

This is not essential, but it will make running the pipeline much easier by automatically submitting jobs to the cluster and using pre-set parameters.

+

Note: Cookiecutter is required for this step. This can be installed using pip install cookiecutter.
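If it is not already available, that install is just:

pip install cookiecutter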

+
+

For SLURM based clusters:

+
# create config directory that snakemake searches for profiles (or use something else)
+profile_dir="${HOME}/.config/snakemake"
+mkdir -p "$profile_dir"
+# use cookiecutter to create the profile in the config directory
+template="gh:Snakemake-Profiles/slurm"
+cookiecutter --output-dir "$profile_dir" "$template"
+
+
+

For SGE based clusters:

+

Note: I have no way of testing this so it may not work.

+
mkdir -p ~/.config/snakemake
+cd ~/.config/snakemake
+cookiecutter https://github.com/Snakemake-Profiles/sge.git
+
+
+

My current profiles look like this:

+
/home/a/asmith/.config/snakemake/slurm/
+├── config.yaml
+├── CookieCutter.py
+├── __pycache__
+│   ├── CookieCutter.cpython-310.pyc
+│   ├── CookieCutter.cpython-311.pyc
+│   ├── slurm_utils.cpython-310.pyc
+│   └── slurm_utils.cpython-311.pyc
+├── settings.json
+├── slurm-jobscript.sh
+├── slurm-sidecar.py
+├── slurm-status.py
+├── slurm-submit.py
+└── slurm_utils.py
+

settings.json:

+
{
+    "SBATCH_DEFAULTS": "--partition=short --time=0-01:00:00 --mem=3G",
+    "CLUSTER_NAME": "",
+    "CLUSTER_CONFIG": ""
+}
+

config.yaml:

+

+cluster-sidecar: "slurm-sidecar.py"
+cluster-cancel: "scancel"
+restart-times: "0"
+jobscript: "slurm-jobscript.sh"
+cluster: "slurm-submit.py"
+cluster-status: "slurm-status.py"
+max-jobs-per-second: "10"
+max-status-checks-per-second: "10"
+local-cores: 1
+latency-wait: "5"
+use-conda: "True"
+use-singularity: "False"
+singularity-args: -B /ceph  -B /databank
+jobs: "50"
+printshellcmds: "True"
+retries: 3
+
+# Example resource configuration
+# default-resources:
+#   - runtime=100
+#   - mem_mb=6000
+#   - disk_mb=1000000
+# # set-threads: map rule names to threads
+# set-threads:
+#   - single_core_rule=1
+#   - multi_core_rule=10
+# # set-resources: map rule names to resources in general
+# set-resources:
+#   - high_memory_rule:mem_mb=12000
+#   - long_running_rule:runtime=1200
+

Note: The singularity-args are required to mount the data directories into the container. e.g.

+
singularity-args: -B /ceph  -B /databank
+

This gives the container access to the /ceph and /databank directories on the cluster. The current working directory is also mounted into the container by default. You can add additional directories by adding more -B flags; these paths will differ for each cluster, so you will need to set your own defaults.
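For example, to also expose a shared reference directory you would add another bind flag (the /project/reference path here is just a placeholder for whatever your cluster uses):

singularity-args: -B /ceph -B /databank -B /project/reference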

+
+
+
+
+

Pipeline Configuration

+
+

1. Generate a config file

+

The pipeline is configured using a YAML file. By default, the pipeline looks for a file called capcruncher_config.yml in the current working directory; this can be overridden by passing a different file with the --config flag.

+

I’ve added a utility to generate a custom config via command-line prompts. I would highly recommend using it, as testing is performed with configs generated this way and it will ensure that the config is valid.

+
capcruncher pipeline-config
+

Just follow the prompts and it will generate a config file for you. Any errors can be corrected by editing the config file directly.

+

This will generate a config file called capcruncher_config.yml inside a folder in the current working directory named with the current date, project name and assay type, e.g. 2023-05-23-rb-tiled-c.

+
+
+

(Optional) 2. Generate a design file

+

Note: This is completely optional, as the pipeline will perform some basic sample-name comparison to generate a design file automatically. However, this will not be as accurate as a manually generated design file.

+

Note: I will add an option to automatically generate a design file but this is not yet implemented.

+

A design file is optional and only used for comparisons between Capture-C and Tri-C data. It is a tab-delimited file with the following columns:

  • sample: The name of the FASTQ file (without the _R1.fastq.gz or _2.fastq.gz suffix)
  • condition: The group that the sample belongs to.

Provide the path to this file in the config file under the design key; a minimal example is shown below.
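For illustration only, using the example sample names that appear in the output tree later in this document and two hypothetical conditions (A and B), the tab-delimited file would look like:

sample	condition
SAMPLE-A_REP1	A
SAMPLE-A_REP2	A
SAMPLE-B_REP1	B
SAMPLE-B_REP2	B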

+
+
+

3. Ensure that the FASTQ files are in the working directory

+

The pipeline will look for FASTQ files in the current working directory. It will look for files with the following suffixes:

+
  • _R1.fastq.gz
  • _2.fastq.gz

Eventually it will also be possible to use a design file for this, but that is not yet implemented.

+

Note: Symlinks are also supported. I would recommend using the absolute path to the FASTQ files to avoid any issues.
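For example, symlinking one pair of FASTQ files into the working directory using absolute paths (the /full/path/to/data directory below is a placeholder):

ln -s /full/path/to/data/SAMPLE-A_REP1_R1.fastq.gz .
ln -s /full/path/to/data/SAMPLE-A_REP1_2.fastq.gz .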

+
+
+
+

Running the pipeline

+

The pipeline is now entirely Snakemake based, so you can consult the Snakemake documentation and add any additional flags that you want to use.

+
+

A. Running the pipeline locally (within the conda environment exclusively)

+
capcruncher pipeline -c <NUMBER OF CORES>
+
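For example, to run locally with 8 cores (pick a core count appropriate for your machine):

capcruncher pipeline -c 8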
+
+

B. Running the pipeline on a cluster (within the conda environment exclusively)

+
capcruncher pipeline -c <NUMBER OF CORES> --profile <PROFILE NAME>
+

Example:

+
capcruncher pipeline -c 10 --profile slurm
+
+
+

C. Running the pipeline using the singularity container

+

Assuming the profile is set up correctly, this is as simple as:

+
capcruncher pipeline -c <NUMBER OF CORES> --profile <PROFILE NAME> --use-singularity
+

You can also use the container locally by adding the --use-singularity flag to the command. e.g.

+

Note: You need to add the custom singularity arguments to the command as well to bind the data directories to the container.

+
capcruncher pipeline -c 10 --use-singularity --singularity-args " -B /ceph -B /databank "
+
+
+
+

Output

+
+

1. Example output

+

The pipeline will generate a folder called capcruncher_output in the current working directory. This will contain the following files:

+

Note: This is subject to change as the pipeline is still in development.

+
── interim
+│   ├── annotate
+│   │   └── exclude.bed
+│   ├── comparisons
+│   │   ├── summaries_and_subtractions
+│   │   │   ├── A-B.mean-subtraction.Slc25A37.bedgraph
+│   │   │   ├── A.mean-summary.Slc25A37.bedgraph
+│   │   │   ├── B-A.mean-subtraction.Slc25A37.bedgraph
+│   │   │   └── B.mean-summary.Slc25A37.bedgraph
+│   │   ├── summaries_and_subtractionsA-B.mean-subtraction.Slc25A37.bedgraph
+│   │   ├── summaries_and_subtractionsA.mean-summary.Slc25A37.bedgraph
+│   │   ├── summaries_and_subtractionsB-A.mean-subtraction.Slc25A37.bedgraph
+│   │   └── summaries_and_subtractionsB.mean-summary.Slc25A37.bedgraph
+│   ├── fastq
+│   │   ├── deduplicated
+│   │   │   ├── SAMPLE-A_REP1
+│   │   │   │   ├── SAMPLE-A_REP1_part0_1.fastq.gz
+│   │   │   │   └── SAMPLE-A_REP1_part0_2.fastq.gz
+│   │   │   ├── SAMPLE-A_REP2
+│   │   │   │   ├── SAMPLE-A_REP2_part0_1.fastq.gz
+│   │   │   │   └── SAMPLE-A_REP2_part0_2.fastq.gz
+│   │   │   ├── SAMPLE-B_REP1
+│   │   │   │   ├── SAMPLE-B_REP1_part0_1.fastq.gz
+│   │   │   │   └── SAMPLE-B_REP1_part0_2.fastq.gz
+│   │   │   └── SAMPLE-B_REP2
+│   │   │       ├── SAMPLE-B_REP2_part0_1.fastq.gz
+│   │   │       └── SAMPLE-B_REP2_part0_2.fastq.gz
+│   │   ├── flashed
+│   │   │   ├── SAMPLE-A_REP1
+│   │   │   │   ├── SAMPLE-A_REP1_part0.extendedFrags.fastq.gz
+│   │   │   │   ├── SAMPLE-A_REP1_part0.hist
+│   │   │   │   ├── SAMPLE-A_REP1_part0.histogram
+│   │   │   │   ├── SAMPLE-A_REP1_part0.notCombined_1.fastq.gz
+│   │   │   │   └── SAMPLE-A_REP1_part0.notCombined_2.fastq.gz
+│   │   │   ├── SAMPLE-A_REP2
+│   │   │   │   ├── SAMPLE-A_REP2_part0.extendedFrags.fastq.gz
+│   │   │   │   ├── SAMPLE-A_REP2_part0.hist
+│   │   │   │   ├── SAMPLE-A_REP2_part0.histogram
+│   │   │   │   ├── SAMPLE-A_REP2_part0.notCombined_1.fastq.gz
+│   │   │   │   └── SAMPLE-A_REP2_part0.notCombined_2.fastq.gz
+│   │   │   ├── SAMPLE-B_REP1
+│   │   │   │   ├── SAMPLE-B_REP1_part0.extendedFrags.fastq.gz
+│   │   │   │   ├── SAMPLE-B_REP1_part0.hist
+│   │   │   │   ├── SAMPLE-B_REP1_part0.histogram
+│   │   │   │   ├── SAMPLE-B_REP1_part0.notCombined_1.fastq.gz
+│   │   │   │   └── SAMPLE-B_REP1_part0.notCombined_2.fastq.gz
+│   │   │   └── SAMPLE-B_REP2
+│   │   │       ├── SAMPLE-B_REP2_part0.extendedFrags.fastq.gz
+│   │   │       ├── SAMPLE-B_REP2_part0.hist
+│   │   │       ├── SAMPLE-B_REP2_part0.histogram
+│   │   │       ├── SAMPLE-B_REP2_part0.notCombined_1.fastq.gz
+│   │   │       └── SAMPLE-B_REP2_part0.notCombined_2.fastq.gz
+│   │   ├── rebalanced
+│   │   │   ├── SAMPLE-A_REP1
+│   │   │   │   ├── flashed
+│   │   │   │   │   └── SAMPLE-A_REP1_part0_flashed_1.fastq.gz
+│   │   │   │   └── pe
+│   │   │   │       ├── SAMPLE-A_REP1_part0_pe_1.fastq.gz
+│   │   │   │       └── SAMPLE-A_REP1_part0_pe_2.fastq.gz
+│   │   │   ├── SAMPLE-A_REP2
+│   │   │   │   ├── flashed
+│   │   │   │   │   └── SAMPLE-A_REP2_part0_flashed_1.fastq.gz
+│   │   │   │   └── pe
+│   │   │   │       ├── SAMPLE-A_REP2_part0_pe_1.fastq.gz
+│   │   │   │       └── SAMPLE-A_REP2_part0_pe_2.fastq.gz
+│   │   │   ├── SAMPLE-B_REP1
+│   │   │   │   ├── flashed
+│   │   │   │   │   └── SAMPLE-B_REP1_part0_flashed_1.fastq.gz
+│   │   │   │   └── pe
+│   │   │   │       ├── SAMPLE-B_REP1_part0_pe_1.fastq.gz
+│   │   │   │       └── SAMPLE-B_REP1_part0_pe_2.fastq.gz
+│   │   │   └── SAMPLE-B_REP2
+│   │   │       ├── flashed
+│   │   │       │   └── SAMPLE-B_REP2_part0_flashed_1.fastq.gz
+│   │   │       └── pe
+│   │   │           ├── SAMPLE-B_REP2_part0_pe_1.fastq.gz
+│   │   │           └── SAMPLE-B_REP2_part0_pe_2.fastq.gz
+│   │   ├── split
+│   │   │   ├── SAMPLE-A_REP1
+│   │   │   │   ├── SAMPLE-A_REP1_part0_1.fastq.gz
+│   │   │   │   └── SAMPLE-A_REP1_part0_2.fastq.gz
+│   │   │   ├── SAMPLE-A_REP2
+│   │   │   │   ├── SAMPLE-A_REP2_part0_1.fastq.gz
+│   │   │   │   └── SAMPLE-A_REP2_part0_2.fastq.gz
+│   │   │   ├── SAMPLE-B_REP1
+│   │   │   │   ├── SAMPLE-B_REP1_part0_1.fastq.gz
+│   │   │   │   └── SAMPLE-B_REP1_part0_2.fastq.gz
+│   │   │   └── SAMPLE-B_REP2
+│   │   │       ├── SAMPLE-B_REP2_part0_1.fastq.gz
+│   │   │       └── SAMPLE-B_REP2_part0_2.fastq.gz
+│   │   └── trimmed
+│   │       ├── SAMPLE-A_REP1
+│   │       │   ├── SAMPLE-A_REP1_part0_1.fastq.gz_trimming_report.txt
+│   │       │   └── SAMPLE-A_REP1_part0_2.fastq.gz_trimming_report.txt
+│   │       ├── SAMPLE-A_REP2
+│   │       │   ├── SAMPLE-A_REP2_part0_1.fastq.gz_trimming_report.txt
+│   │       │   └── SAMPLE-A_REP2_part0_2.fastq.gz_trimming_report.txt
+│   │       ├── SAMPLE-B_REP1
+│   │       │   ├── SAMPLE-B_REP1_part0_1.fastq.gz_trimming_report.txt
+│   │       │   └── SAMPLE-B_REP1_part0_2.fastq.gz_trimming_report.txt
+│   │       └── SAMPLE-B_REP2
+│   │           ├── SAMPLE-B_REP2_part0_1.fastq.gz_trimming_report.txt
+│   │           └── SAMPLE-B_REP2_part0_2.fastq.gz_trimming_report.txt
+│   ├── filtering
+│   │   └── repartitioned
+│   │       ├── SAMPLE-A_REP1
+│   │       │   ├── flashed
+│   │       │   │   └── SAMPLE-A_REP1_part0_flashed.slices.parquet
+│   │       │   └── pe
+│   │       │       └── SAMPLE-A_REP1_part0_pe.slices.parquet
+│   │       ├── SAMPLE-A_REP2
+│   │       │   ├── flashed
+│   │       │   │   └── SAMPLE-A_REP2_part0_flashed.slices.parquet
+│   │       │   └── pe
+│   │       │       └── SAMPLE-A_REP2_part0_pe.slices.parquet
+│   │       ├── SAMPLE-B_REP1
+│   │       │   ├── flashed
+│   │       │   │   └── SAMPLE-B_REP1_part0_flashed.slices.parquet
+│   │       │   └── pe
+│   │       │       └── SAMPLE-B_REP1_part0_pe.slices.parquet
+│   │       └── SAMPLE-B_REP2
+│   │           ├── flashed
+│   │           │   └── SAMPLE-B_REP2_part0_flashed.slices.parquet
+│   │           └── pe
+│   │               └── SAMPLE-B_REP2_part0_pe.slices.parquet
+│   ├── pileups
+│   │   └── bedgraphs
+│   │       ├── SAMPLE-A_REP1
+│   │       │   ├── norm
+│   │       │   │   └── SAMPLE-A_REP1_Slc25A37.bedgraph
+│   │       │   └── raw
+│   │       │       └── SAMPLE-A_REP1_Slc25A37.bedgraph
+│   │       ├── SAMPLE-A_REP2
+│   │       │   ├── norm
+│   │       │   │   └── SAMPLE-A_REP2_Slc25A37.bedgraph
+│   │       │   └── raw
+│   │       │       └── SAMPLE-A_REP2_Slc25A37.bedgraph
+│   │       ├── SAMPLE-B_REP1
+│   │       │   ├── norm
+│   │       │   │   └── SAMPLE-B_REP1_Slc25A37.bedgraph
+│   │       │   └── raw
+│   │       │       └── SAMPLE-B_REP1_Slc25A37.bedgraph
+│   │       └── SAMPLE-B_REP2
+│   │           ├── norm
+│   │           │   └── SAMPLE-B_REP2_Slc25A37.bedgraph
+│   │           └── raw
+│   │               └── SAMPLE-B_REP2_Slc25A37.bedgraph
+│   ├── qc
+│   │   ├── alignment_raw
+│   │   │   ├── SAMPLE-A_REP1.txt
+│   │   │   ├── SAMPLE-A_REP2.txt
+│   │   │   ├── SAMPLE-B_REP1.txt
+│   │   │   └── SAMPLE-B_REP2.txt
+│   │   └── fastqc
+│   │       ├── SAMPLE-A_REP1_1_fastqc.html
+│   │       ├── SAMPLE-A_REP1_1_fastqc.zip
+│   │       ├── SAMPLE-A_REP1_2_fastqc.html
+│   │       ├── SAMPLE-A_REP1_2_fastqc.zip
+│   │       ├── SAMPLE-A_REP2_1_fastqc.html
+│   │       ├── SAMPLE-A_REP2_1_fastqc.zip
+│   │       ├── SAMPLE-A_REP2_2_fastqc.html
+│   │       ├── SAMPLE-A_REP2_2_fastqc.zip
+│   │       ├── SAMPLE-B_REP1_1_fastqc.html
+│   │       ├── SAMPLE-B_REP1_1_fastqc.zip
+│   │       ├── SAMPLE-B_REP1_2_fastqc.html
+│   │       ├── SAMPLE-B_REP1_2_fastqc.zip
+│   │       ├── SAMPLE-B_REP2_1_fastqc.html
+│   │       ├── SAMPLE-B_REP2_1_fastqc.zip
+│   │       ├── SAMPLE-B_REP2_2_fastqc.html
+│   │       └── SAMPLE-B_REP2_2_fastqc.zip
+│   └── statistics
+│       ├── cis_and_trans_reporters
+│       │   ├── cis_and_trans_reporters.csv
+│       │   └── data
+│       │       ├── SAMPLE-A_REP1.reporter.stats.csv
+│       │       ├── SAMPLE-A_REP2.reporter.stats.csv
+│       │       ├── SAMPLE-B_REP1.reporter.stats.csv
+│       │       └── SAMPLE-B_REP2.reporter.stats.csv
+│       ├── deduplication
+│       │   ├── data
+│       │   │   ├── SAMPLE-A_REP1
+│       │   │   ├── SAMPLE-A_REP1.deduplication.csv
+│       │   │   ├── SAMPLE-A_REP2
+│       │   │   ├── SAMPLE-A_REP2.deduplication.csv
+│       │   │   ├── SAMPLE-B_REP1
+│       │   │   ├── SAMPLE-B_REP1.deduplication.csv
+│       │   │   ├── SAMPLE-B_REP2
+│       │   │   └── SAMPLE-B_REP2.deduplication.csv
+│       │   └── fastq_deduplication.csv
+│       ├── deduplication_by_coordinate
+│       │   ├── alignment_deduplication.csv
+│       │   └── data
+│       │       ├── SAMPLE-A_REP1_flashed.read.stats.csv
+│       │       ├── SAMPLE-A_REP1_pe.read.stats.csv
+│       │       ├── SAMPLE-A_REP2_flashed.read.stats.csv
+│       │       ├── SAMPLE-A_REP2_pe.read.stats.csv
+│       │       ├── SAMPLE-B_REP1_flashed.read.stats.csv
+│       │       ├── SAMPLE-B_REP1_pe.read.stats.csv
+│       │       ├── SAMPLE-B_REP2_flashed.read.stats.csv
+│       │       └── SAMPLE-B_REP2_pe.read.stats.csv
+│       ├── digest_genome
+│       │   └── genome_digestion_statistics.txt
+│       ├── digestion
+│       │   ├── data
+│       │   │   ├── SAMPLE-A_REP1_part0_flashed.digestion.filtered.histogram.csv
+│       │   │   ├── SAMPLE-A_REP1_part0_flashed.digestion.read.summary.csv
+│       │   │   ├── SAMPLE-A_REP1_part0_flashed.digestion.unfiltered.histogram.csv
+│       │   │   ├── SAMPLE-A_REP1_part0_pe.digestion.filtered.histogram.csv
+│       │   │   ├── SAMPLE-A_REP1_part0_pe.digestion.read.summary.csv
+│       │   │   ├── SAMPLE-A_REP1_part0_pe.digestion.unfiltered.histogram.csv
+│       │   │   ├── SAMPLE-A_REP2_part0_flashed.digestion.filtered.histogram.csv
+│       │   │   ├── SAMPLE-A_REP2_part0_flashed.digestion.read.summary.csv
+│       │   │   ├── SAMPLE-A_REP2_part0_flashed.digestion.unfiltered.histogram.csv
+│       │   │   ├── SAMPLE-A_REP2_part0_pe.digestion.filtered.histogram.csv
+│       │   │   ├── SAMPLE-A_REP2_part0_pe.digestion.read.summary.csv
+│       │   │   ├── SAMPLE-A_REP2_part0_pe.digestion.unfiltered.histogram.csv
+│       │   │   ├── SAMPLE-B_REP1_part0_flashed.digestion.filtered.histogram.csv
+│       │   │   ├── SAMPLE-B_REP1_part0_flashed.digestion.read.summary.csv
+│       │   │   ├── SAMPLE-B_REP1_part0_flashed.digestion.unfiltered.histogram.csv
+│       │   │   ├── SAMPLE-B_REP1_part0_pe.digestion.filtered.histogram.csv
+│       │   │   ├── SAMPLE-B_REP1_part0_pe.digestion.read.summary.csv
+│       │   │   ├── SAMPLE-B_REP1_part0_pe.digestion.unfiltered.histogram.csv
+│       │   │   ├── SAMPLE-B_REP2_part0_flashed.digestion.filtered.histogram.csv
+│       │   │   ├── SAMPLE-B_REP2_part0_flashed.digestion.read.summary.csv
+│       │   │   ├── SAMPLE-B_REP2_part0_flashed.digestion.unfiltered.histogram.csv
+│       │   │   ├── SAMPLE-B_REP2_part0_pe.digestion.filtered.histogram.csv
+│       │   │   ├── SAMPLE-B_REP2_part0_pe.digestion.read.summary.csv
+│       │   │   └── SAMPLE-B_REP2_part0_pe.digestion.unfiltered.histogram.csv
+│       │   ├── fastq_digestion.csv
+│       │   └── fastq_digestion.histogram.csv
+│       ├── filtering
+│       │   ├── alignment_filtering.csv
+│       │   ├── alignment_filtering_slice.csv
+│       │   ├── data
+│       │   │   ├── SAMPLE-A_REP1_part0_flashed.read.stats.csv
+│       │   │   ├── SAMPLE-A_REP1_part0_flashed.slice.stats.csv
+│       │   │   ├── SAMPLE-A_REP1_part0_pe.read.stats.csv
+│       │   │   ├── SAMPLE-A_REP1_part0_pe.slice.stats.csv
+│       │   │   ├── SAMPLE-A_REP2_part0_flashed.read.stats.csv
+│       │   │   ├── SAMPLE-A_REP2_part0_flashed.slice.stats.csv
+│       │   │   ├── SAMPLE-A_REP2_part0_pe.read.stats.csv
+│       │   │   ├── SAMPLE-A_REP2_part0_pe.slice.stats.csv
+│       │   │   ├── SAMPLE-B_REP1_part0_flashed.read.stats.csv
+│       │   │   ├── SAMPLE-B_REP1_part0_flashed.slice.stats.csv
+│       │   │   ├── SAMPLE-B_REP1_part0_pe.read.stats.csv
+│       │   │   ├── SAMPLE-B_REP1_part0_pe.slice.stats.csv
+│       │   │   ├── SAMPLE-B_REP2_part0_flashed.read.stats.csv
+│       │   │   ├── SAMPLE-B_REP2_part0_flashed.slice.stats.csv
+│       │   │   ├── SAMPLE-B_REP2_part0_pe.read.stats.csv
+│       │   │   └── SAMPLE-B_REP2_part0_pe.slice.stats.csv
+│       │   └── logs
+│       │       ├── SAMPLE-A_REP1_part0_flashed.log
+│       │       ├── SAMPLE-A_REP1_part0_pe.log
+│       │       ├── SAMPLE-A_REP2_part0_flashed.log
+│       │       ├── SAMPLE-A_REP2_part0_pe.log
+│       │       ├── SAMPLE-B_REP1_part0_flashed.log
+│       │       ├── SAMPLE-B_REP1_part0_pe.log
+│       │       ├── SAMPLE-B_REP2_part0_flashed.log
+│       │       └── SAMPLE-B_REP2_part0_pe.log
+│       ├── filtering_and_alignment_deduplication.csv
+│       └── run_statistics.csv
+├── logs
+│   ├── align
+│   │   ├── SAMPLE-A_REP1_0_flashed.log
+│   │   ├── SAMPLE-A_REP1_0_flashed_sort.log
+│   │   ├── SAMPLE-A_REP1_0_pe.log
+│   │   ├── SAMPLE-A_REP1_0_pe_sort.log
+│   │   ├── SAMPLE-A_REP2_0_flashed.log
+│   │   ├── SAMPLE-A_REP2_0_flashed_sort.log
+│   │   ├── SAMPLE-A_REP2_0_pe.log
+│   │   ├── SAMPLE-A_REP2_0_pe_sort.log
+│   │   ├── SAMPLE-B_REP1_0_flashed.log
+│   │   ├── SAMPLE-B_REP1_0_flashed_sort.log
+│   │   ├── SAMPLE-B_REP1_0_pe.log
+│   │   ├── SAMPLE-B_REP1_0_pe_sort.log
+│   │   ├── SAMPLE-B_REP2_0_flashed.log
+│   │   ├── SAMPLE-B_REP2_0_flashed_sort.log
+│   │   ├── SAMPLE-B_REP2_0_pe.log
+│   │   └── SAMPLE-B_REP2_0_pe_sort.log
+│   ├── annotate
+│   │   ├── SAMPLE-A_REP1
+│   │   │   ├── SAMPLE-A_REP1_part0_flashed.log
+│   │   │   └── SAMPLE-A_REP1_part0_pe.log
+│   │   ├── SAMPLE-A_REP2
+│   │   │   ├── SAMPLE-A_REP2_part0_flashed.log
+│   │   │   └── SAMPLE-A_REP2_part0_pe.log
+│   │   ├── SAMPLE-B_REP1
+│   │   │   ├── SAMPLE-B_REP1_part0_flashed.log
+│   │   │   └── SAMPLE-B_REP1_part0_pe.log
+│   │   └── SAMPLE-B_REP2
+│   │       ├── SAMPLE-B_REP2_part0_flashed.log
+│   │       └── SAMPLE-B_REP2_part0_pe.log
+│   ├── bedgraph_norm
+│   │   ├── SAMPLE-A_REP1_Slc25A37.log
+│   │   ├── SAMPLE-A_REP2_Slc25A37.log
+│   │   ├── SAMPLE-B_REP1_Slc25A37.log
+│   │   └── SAMPLE-B_REP2_Slc25A37.log
+│   ├── bedgraph_raw
+│   │   ├── SAMPLE-A_REP1_Slc25A37.log
+│   │   ├── SAMPLE-A_REP2_Slc25A37.log
+│   │   ├── SAMPLE-B_REP1_Slc25A37.log
+│   │   └── SAMPLE-B_REP2_Slc25A37.log
+│   ├── bedgraph_to_bigwig
+│   │   ├── A-B.mean-subtraction.Slc25A37.log
+│   │   ├── A.mean-summary.Slc25A37.log
+│   │   ├── B-A.mean-subtraction.Slc25A37.log
+│   │   ├── B.mean-summary.Slc25A37.log
+│   │   ├── SAMPLE-A_REP1_norm_Slc25A37.log
+│   │   ├── SAMPLE-A_REP1_raw_Slc25A37.log
+│   │   ├── SAMPLE-A_REP2_norm_Slc25A37.log
+│   │   ├── SAMPLE-A_REP2_raw_Slc25A37.log
+│   │   ├── SAMPLE-B_REP1_norm_Slc25A37.log
+│   │   ├── SAMPLE-B_REP1_raw_Slc25A37.log
+│   │   ├── SAMPLE-B_REP2_norm_Slc25A37.log
+│   │   └── SAMPLE-B_REP2_raw_Slc25A37.log
+│   ├── cis_and_trans_stats
+│   │   ├── SAMPLE-A_REP1.log
+│   │   ├── SAMPLE-A_REP2.log
+│   │   ├── SAMPLE-B_REP1.log
+│   │   └── SAMPLE-B_REP2.log
+│   ├── compare_interactions
+│   │   └── Slc25A37.log
+│   ├── counts
+│   │   ├── SAMPLE-A_REP1.log
+│   │   ├── SAMPLE-A_REP2.log
+│   │   ├── SAMPLE-B_REP1.log
+│   │   └── SAMPLE-B_REP2.log
+│   ├── deduplication_fastq
+│   │   ├── SAMPLE-A_REP1.log
+│   │   ├── SAMPLE-A_REP2.log
+│   │   ├── SAMPLE-B_REP1.log
+│   │   └── SAMPLE-B_REP2.log
+│   ├── differential_interactions
+│   │   └── Slc25A37.log
+│   ├── digestion
+│   │   ├── SAMPLE-A_REP1_0.log
+│   │   ├── SAMPLE-A_REP2_0.log
+│   │   ├── SAMPLE-B_REP1_0.log
+│   │   └── SAMPLE-B_REP2_0.log
+│   ├── fastqc
+│   │   ├── SAMPLE-A_REP1_1.log
+│   │   ├── SAMPLE-A_REP1_2.log
+│   │   ├── SAMPLE-A_REP2_1.log
+│   │   ├── SAMPLE-A_REP2_2.log
+│   │   ├── SAMPLE-B_REP1_1.log
+│   │   ├── SAMPLE-B_REP1_2.log
+│   │   ├── SAMPLE-B_REP2_1.log
+│   │   └── SAMPLE-B_REP2_2.log
+│   ├── flash
+│   │   ├── SAMPLE-A_REP1_0.log
+│   │   ├── SAMPLE-A_REP2_0.log
+│   │   ├── SAMPLE-B_REP1_0.log
+│   │   └── SAMPLE-B_REP2_0.log
+│   ├── make_report.log
+│   ├── merge_counts
+│   │   ├── SAMPLE-A_REP1.log
+│   │   ├── SAMPLE-A_REP2.log
+│   │   ├── SAMPLE-B_REP1.log
+│   │   └── SAMPLE-B_REP2.log
+│   ├── merge_stats_filtering_and_alignment_deduplication.log
+│   ├── multiqc.log
+│   ├── rebalance_partitions
+│   │   ├── SAMPLE-A_REP1_flashed.log
+│   │   ├── SAMPLE-A_REP1_pe.log
+│   │   ├── SAMPLE-A_REP2_flashed.log
+│   │   ├── SAMPLE-A_REP2_pe.log
+│   │   ├── SAMPLE-B_REP1_flashed.log
+│   │   ├── SAMPLE-B_REP1_pe.log
+│   │   ├── SAMPLE-B_REP2_flashed.log
+│   │   └── SAMPLE-B_REP2_pe.log
+│   ├── remove_duplicate_coordinates
+│   │   ├── SAMPLE-A_REP1_flashed.log
+│   │   ├── SAMPLE-A_REP1_pe.log
+│   │   ├── SAMPLE-A_REP2_flashed.log
+│   │   ├── SAMPLE-A_REP2_pe.log
+│   │   ├── SAMPLE-B_REP1_flashed.log
+│   │   ├── SAMPLE-B_REP1_pe.log
+│   │   ├── SAMPLE-B_REP2_flashed.log
+│   │   └── SAMPLE-B_REP2_pe.log
+│   ├── split
+│   │   ├── SAMPLE-A_REP1.log
+│   │   ├── SAMPLE-A_REP2.log
+│   │   ├── SAMPLE-B_REP1.log
+│   │   └── SAMPLE-B_REP2.log
+│   └── trimming
+│       ├── SAMPLE-A_REP1_0.log
+│       ├── SAMPLE-A_REP2_0.log
+│       ├── SAMPLE-B_REP1_0.log
+│       └── SAMPLE-B_REP2_0.log
+├── resources
+│   ├── restriction_fragments
+│   │   ├── genome.digest.bed.gz
+│   │   └── genome.digest.log
+│   └── viewpoints
+│       └── viewpoints.bigBed
+└── results
+    ├── capcruncher_report.html
+    ├── comparisons
+    │   ├── bigwigs
+    │   │   ├── A-B.mean-subtraction.Slc25A37.bigWig
+    │   │   ├── A.mean-summary.Slc25A37.bigWig
+    │   │   ├── B-A.mean-subtraction.Slc25A37.bigWig
+    │   │   └── B.mean-summary.Slc25A37.bigWig
+    │   └── counts_per_viewpoint
+    │       └── norm
+    │           └── Slc25A37.tsv
+    ├── design_matrix.tsv
+    ├── differential
+    │   └── Slc25A37
+    ├── figures
+    │   ├── Slc25A37.pdf
+    │   └── Slc25A37.toml
+    ├── full_qc_report_data
+    │   ├── multiqc_citations.txt
+    │   ├── multiqc_data.json
+    │   ├── multiqc_fastqc.txt
+    │   ├── multiqc_general_stats.txt
+    │   ├── multiqc.log
+    │   ├── multiqc_samtools_stats.txt
+    │   └── multiqc_sources.txt
+    ├── full_qc_report.html
+    ├── SAMPLE-A_REP1
+    │   ├── bigwigs
+    │   │   ├── norm
+    │   │   │   └── SAMPLE-A_REP1_Slc25A37.bigWig
+    │   │   └── raw
+    │   │       └── SAMPLE-A_REP1_Slc25A37.bigWig
+    │   ├── SAMPLE-A_REP1.bam
+    │   ├── SAMPLE-A_REP1.bam.bai
+    │   ├── SAMPLE-A_REP1.hdf5
+    │   └── SAMPLE-A_REP1.parquet
+    │       └── part-0.parquet
+    ├── SAMPLE-A_REP2
+    │   ├── bigwigs
+    │   │   ├── norm
+    │   │   │   └── SAMPLE-A_REP2_Slc25A37.bigWig
+    │   │   └── raw
+    │   │       └── SAMPLE-A_REP2_Slc25A37.bigWig
+    │   ├── SAMPLE-A_REP2.bam
+    │   ├── SAMPLE-A_REP2.bam.bai
+    │   ├── SAMPLE-A_REP2.hdf5
+    │   └── SAMPLE-A_REP2.parquet
+    │       └── part-0.parquet
+    ├── SAMPLE-B_REP1
+    │   ├── bigwigs
+    │   │   ├── norm
+    │   │   │   └── SAMPLE-B_REP1_Slc25A37.bigWig
+    │   │   └── raw
+    │   │       └── SAMPLE-B_REP1_Slc25A37.bigWig
+    │   ├── SAMPLE-B_REP1.bam
+    │   ├── SAMPLE-B_REP1.bam.bai
+    │   ├── SAMPLE-B_REP1.hdf5
+    │   └── SAMPLE-B_REP1.parquet
+    │       └── part-0.parquet
+    └── SAMPLE-B_REP2
+        ├── bigwigs
+        │   ├── norm
+        │   │   └── SAMPLE-B_REP2_Slc25A37.bigWig
+        │   └── raw
+        │       └── SAMPLE-B_REP2_Slc25A37.bigWig
+        ├── SAMPLE-B_REP2.bam
+        ├── SAMPLE-B_REP2.bam.bai
+        ├── SAMPLE-B_REP2.hdf5
+        └── SAMPLE-B_REP2.parquet
+            └── part-0.parquet
+
+142 directories, 324 files
+
+
+

+2. Important files
+
+The capcruncher_output/results directory contains the following files:
+
+  • capcruncher_report.html: the main report of the pipeline. It contains the results of the analysis and the figures.
+
+  • full_qc_report.html: the full QC report of the pipeline. It contains the results of the QC analysis and the figures.
+
+  • design_matrix.tsv: the design matrix used for the analysis.
+
+  • comparisons: the results of the comparisons between the different conditions. The results are stored in the bigwigs directories. (Note: Only for Capture-C and Tri-C)
+
+  • differential: the results of the differential analysis. (Note: Only for Capture-C and Tri-C)
+
+  • figures: plots generated by the pipeline for each viewpoint at the coordinates provided in the configuration file. This step also generates templates that can be used with the capcruncher plot make-plot command. See the CoolBox API documentation for more parameters that can be used to customize the plots. The current plotting system is a thin wrapper over this library and any parameter specified will be passed to these classes directly.
+
+  • <SAMPLE_NAME>: the results of the analysis for each sample. The directory contains the following files:
+
+    • bigwigs: the bigwig files of the sample, stored in the raw and norm directories. The raw directory contains the raw bigwig files and the norm directory contains the normalized bigwig files. (Note: Only for Capture-C and Tri-C)
+
+    • <SAMPLE_NAME>.bam: the alignment file of the sample.
+
+    • <SAMPLE_NAME>.bam.bai: the index of the alignment file of the sample.
+
+    • <SAMPLE_NAME>.hdf5: Cooler-formatted groups within an HDF5 file, one group per viewpoint. See the Cooler documentation for more information on the format.
+
+    • <SAMPLE_NAME>.parquet: all of the data from the sample in a tabular format that can be accessed by any software that reads parquet files, e.g. pandas, polars or Arrow for R (a minimal Python example of opening these files is shown after this section).
+
+Note
+
+Unlike regular cooler HDF5 files, there are multiple groups per file, one per viewpoint. You can use any cooler-supported tool but you will need to specify the group name. For example, to get the matrix for the viewpoint Slc25A37 you can use the following command:
+
+cooler dump -t pixels -o Slc25A37.tsv <SAMPLE_NAME>.hdf5::Slc25A37
+
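+The following is a minimal sketch (not part of the pipeline itself) of how these per-sample outputs could be opened in Python. It assumes pandas, pyarrow and cooler are installed, and it reuses the example sample SAMPLE-A_REP1 and viewpoint Slc25A37 from the tree above.
+
+```python
+import cooler
+import pandas as pd
+
+# The <SAMPLE_NAME>.parquet output is a directory of parquet part files;
+# pandas (with the pyarrow engine) reads the whole dataset in one call.
+slices = pd.read_parquet("capcruncher_output/results/SAMPLE-A_REP1/SAMPLE-A_REP1.parquet")
+print(slices.head())
+
+# The <SAMPLE_NAME>.hdf5 output stores one cooler group per viewpoint, so the
+# group name is appended to the file path with '::' when opening it.
+clr = cooler.Cooler("capcruncher_output/results/SAMPLE-A_REP1/SAMPLE-A_REP1.hdf5::Slc25A37")
+pixels = clr.pixels()[:]  # interaction counts as a pandas DataFrame
+bins = clr.bins()[:]      # genomic bins backing the pixel table
+print(pixels.head())
+```
+
+This is the Python counterpart of the cooler dump command above: the ::Slc25A37 suffix selects the viewpoint group inside the HDF5 file.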
+ + + + diff --git a/capcruncher_pipeline_alpha_test.qmd b/capcruncher_pipeline_alpha_test.qmd new file mode 100644 index 00000000..4189b6f7 --- /dev/null +++ b/capcruncher_pipeline_alpha_test.qmd @@ -0,0 +1,819 @@ +--- +title: "CapCruncher Pipeline Alpha Testing" +author: "Alastair Smith" +date: today +format: + html: + toc: true + theme: cosmo + embed-resources: true + code-block-background: true + highlight-style: github + code-fold: true +execute: + echo: true + eval: false + +--- + +# Current Status + +This is still in alpha testing so there are still some bugs and missing features. + +## Known issues + +* BUG: Out of jobs ready to be started, but not all files built yet. Please check https://github.com/snakemake/snakemake/issues/823 for more information - Best solution is to just re-run the pipeline. + +* BUG: The pipeline will fail if there are no reads left after filtering. This is a known issue and will be fixed in the next release. + + + + +# Installation - Alpha Testing + +There are two main ways currently to run the new pipeline: + +A) Install into a conda environment and run directly from this. + +B) Install into a very minimal conda env and then run the pipeline using a singularity container. + +## 0. Prerequisites + +### 0.1 Download the GitHub repo development branch + +```bash +git clone https://github.com/sims-lab/CapCruncher.git +cd CapCruncher +git checkout develop +``` + + +## A. Fully featured conda environment + +**Note**: I highly recommend using mamba for the env generation, it is substantially faster than conda. + +```bash +mamba env create -f environment.yml +conda activate cc +``` + +## B. Minimal conda environment + +```bash +mamba create -n cc pip "python==3.10" +conda activate cc +``` + +## Install CapCruncher + +**Note:** Whilst not essential I would install the stats, plotting and experimental dependecies as well as the core package. This is what I do for my own testing. + +```bash +pip install .[stats,plotting,experimental] +``` + +## (Optional) Set-up a Snakemake profile + +This is not essential but it will make running the pipeline much easier by submitting jobs to the cluster automatically and using pre-set parameters. + +**Note:** Cookiecutter is required for this step. This can be installed using `pip install cookiecutter`. + + +### For SLURM based clusters: + +```bash +# create config directory that snakemake searches for profiles (or use something else) +profile_dir="${HOME}/.config/snakemake" +mkdir -p "$profile_dir" +# use cookiecutter to create the profile in the config directory +template="gh:Snakemake-Profiles/slurm" +cookiecutter --output-dir "$profile_dir" "$template" +``` + +### For SGE based clusters: + +**Note:** I have no way of testing this so it may not work. 
+ +```bash +mkdir -p ~/.config/snakemake +cd ~/.config/snakemake +cookiecutter https://github.com/Snakemake-Profiles/sge.git +``` + +### My current profiles look like this: + +``` +/home/a/asmith/.config/snakemake/slurm/ +├── config.yaml +├── CookieCutter.py +├── __pycache__ +│ ├── CookieCutter.cpython-310.pyc +│ ├── CookieCutter.cpython-311.pyc +│ ├── slurm_utils.cpython-310.pyc +│ └── slurm_utils.cpython-311.pyc +├── settings.json +├── slurm-jobscript.sh +├── slurm-sidecar.py +├── slurm-status.py +├── slurm-submit.py +└── slurm_utils.py +``` + +`settings.json`: + +```json +{ + "SBATCH_DEFAULTS": "--partition=short --time=0-01:00:00 --mem=3G", + "CLUSTER_NAME": "", + "CLUSTER_CONFIG": "" +} +``` + +`config.yaml`: + +```yaml + +cluster-sidecar: "slurm-sidecar.py" +cluster-cancel: "scancel" +restart-times: "0" +jobscript: "slurm-jobscript.sh" +cluster: "slurm-submit.py" +cluster-status: "slurm-status.py" +max-jobs-per-second: "10" +max-status-checks-per-second: "10" +local-cores: 1 +latency-wait: "5" +use-conda: "True" +use-singularity: "False" +singularity-args: -B /ceph -B /databank -B $TMPDIR --cleanenv +jobs: "50" +printshellcmds: "True" +retries: 3 + +# Example resource configuration +# default-resources: +# - runtime=100 +# - mem_mb=6000 +# - disk_mb=1000000 +# # set-threads: map rule names to threads +# set-threads: +# - single_core_rule=1 +# - multi_core_rule=10 +# # set-resources: map rule names to resources in general +# set-resources: +# - high_memory_rule:mem_mb=12000 +# - long_running_rule:runtime=1200 + +``` + +**Note**: The singularity-args are required to mount the data directories into the container. e.g. + +```bash +singularity-args: -B /ceph -B /databank +``` + +Gives the container access to the `/ceph` and `/databank` directories on the cluster. The current working directory is also mounted into the container by default. You can add additional directories by adding more `-B` flags. Obviously this will be different for each cluster so you'll need your own defaults. The `$TMPDIR` is also mounted as this causes errors if it is not mounted. The `--cleanenv` flag is also required to prevent the container from inheriting the environment from the host. + +# Pipeline Configuration + + +## 1. Generate a config file + +The pipeline is configured using a YAML file. This file is passed to the pipeline using the `--config` flag. The pipeline will look for a file called `capcruncher_config.yml` in the current working directory by default. This can be overridden using the `--config` flag. + +I've added in a utility to generate a custom config via command line prompts. I would highly recommend using this as the testing is performed using this method and it will ensure that the config is valid. + +```bash +capcruncher pipeline-config +``` + +Just follow the prompts and it will generate a config file for you. Any errors can be corrected by editing the config file directly. + +This will generate a config file called `capcruncher_config.yml` in a folder with the current date, project name and assay type in the current working directory e.g. `2023-05-23-rb-tiled-c`. + +## (Optional) 2. Generate a design file + +**Note**: This is completely optional as the pipeline will do some basic sample name comparison to generate a basic design file. However, this will not be as accurate as a manually generated design file. + +**Note:** I will add an option to automatically generate a design file but this is not yet implemented. 
+ +A design file is optional and only used for comparisons between Capture-C and Tri-C data. It is a tab delimited file with the following columns: + +- `sample`: The name of the FASTQ file (without the _R1.fastq.gz or _2.fastq.gz suffix) +- `condition`: The Group that the sample belongs to. + +Provide the path to this file in the config file under the `design` key. + + +## 3. Ensure that the FASTQ files are in the working directory + +The pipeline will look for FASTQ files in the current working directory. It will look for files with the following suffixes: + +- `_R1.fastq.gz` +- `_2.fastq.gz` + +Eventually a design file will also be able to be used. This is not yet implemented. + +**Note**: Symlinks are also supported. I would recommend using the absolute path to the FASTQ files to avoid any issues. + + +# Running the pipeline + +The pipeline is entirely Snakemake based now so you can see their [documentation](https://snakemake.readthedocs.io/en/stable/) and add any additional flags that you want to use. + +## A. Running the pipeline locally (within the conda environment exclusively) + +```bash +capcruncher pipeline -c +``` + +## B. Running the pipeline on a cluster (within the conda environment exclusively) + +```bash +capcruncher pipeline -c --profile +``` + +Example: + +```bash +capcruncher pipeline -c 10 --profile slurm +``` + +## C. Running the pipeline using the singularity container + +Assuming the profile is set-up correctly this is as simple as: + +```bash +capcruncher pipeline -c --profile --use-singularity +``` + +You can also use the container locally by adding the `--use-singularity` flag to the command. e.g. + +**Note**: You need to add the custom singularity arguments to the command as well to bind the data directories to the container. + +```bash +capcruncher pipeline -c 10 --use-singularity --singularity-args " -B /ceph -B /databank " +``` + +## D. Running the pipeline using the singularity container (without the conda environment) + +I've not tested this yet but it should be possible to just run the pipeline using the singularity container without the conda environment. + +```bash + +singularity pull library://asmith151/capcruncher/capcruncher:latest + +singularity exec -B /ceph -B /databank --cleanenv capcruncher.sif capcruncher pipeline -c 10 --use-singularity --singularity-args " -B /ceph -B /databank " +``` + +## Possible Issues: + +### 1. Singularity container not found + +``` +WorkflowError: +Failed to pull singularity image from library://asmith151/capcruncher/capcruncher:latest +FATAL: Unable to get library client configuration: remote has no library client (see https://apptainer.org/docs/user/latest/endpoint.html#no-default-remote) +``` + +This is caused by the apptainer (the new name for singularity) remote not being set-up for your user. This can be fixed by running the following commands: + +```bash + +apptainer remote add --no-login SylabsCloud cloud.sylabs.io +apptainer remote use SylabsCloud + +``` + +### Memory issues + +Some of the steps in the pipeline require a lot of memory, and may fail. There is a built-in method to retry these steps with more memory and will double with every failed attempt. Change the number of retries in your profile to use more memory. I will try to add a more elegant solution to this in the future. + + + + +# Output + +## 1. Example output + +The pipeline will generate a folder called `capcruncher_output` in the current working directory. 
This will contain the following files: + +** Note **: This is subject to change as the pipeline is still in development. + +``` +── interim +│ ├── annotate +│ │ └── exclude.bed +│ ├── comparisons +│ │ ├── summaries_and_subtractions +│ │ │ ├── A-B.mean-subtraction.Slc25A37.bedgraph +│ │ │ ├── A.mean-summary.Slc25A37.bedgraph +│ │ │ ├── B-A.mean-subtraction.Slc25A37.bedgraph +│ │ │ └── B.mean-summary.Slc25A37.bedgraph +│ │ ├── summaries_and_subtractionsA-B.mean-subtraction.Slc25A37.bedgraph +│ │ ├── summaries_and_subtractionsA.mean-summary.Slc25A37.bedgraph +│ │ ├── summaries_and_subtractionsB-A.mean-subtraction.Slc25A37.bedgraph +│ │ └── summaries_and_subtractionsB.mean-summary.Slc25A37.bedgraph +│ ├── fastq +│ │ ├── deduplicated +│ │ │ ├── SAMPLE-A_REP1 +│ │ │ │ ├── SAMPLE-A_REP1_part0_1.fastq.gz +│ │ │ │ └── SAMPLE-A_REP1_part0_2.fastq.gz +│ │ │ ├── SAMPLE-A_REP2 +│ │ │ │ ├── SAMPLE-A_REP2_part0_1.fastq.gz +│ │ │ │ └── SAMPLE-A_REP2_part0_2.fastq.gz +│ │ │ ├── SAMPLE-B_REP1 +│ │ │ │ ├── SAMPLE-B_REP1_part0_1.fastq.gz +│ │ │ │ └── SAMPLE-B_REP1_part0_2.fastq.gz +│ │ │ └── SAMPLE-B_REP2 +│ │ │ ├── SAMPLE-B_REP2_part0_1.fastq.gz +│ │ │ └── SAMPLE-B_REP2_part0_2.fastq.gz +│ │ ├── flashed +│ │ │ ├── SAMPLE-A_REP1 +│ │ │ │ ├── SAMPLE-A_REP1_part0.extendedFrags.fastq.gz +│ │ │ │ ├── SAMPLE-A_REP1_part0.hist +│ │ │ │ ├── SAMPLE-A_REP1_part0.histogram +│ │ │ │ ├── SAMPLE-A_REP1_part0.notCombined_1.fastq.gz +│ │ │ │ └── SAMPLE-A_REP1_part0.notCombined_2.fastq.gz +│ │ │ ├── SAMPLE-A_REP2 +│ │ │ │ ├── SAMPLE-A_REP2_part0.extendedFrags.fastq.gz +│ │ │ │ ├── SAMPLE-A_REP2_part0.hist +│ │ │ │ ├── SAMPLE-A_REP2_part0.histogram +│ │ │ │ ├── SAMPLE-A_REP2_part0.notCombined_1.fastq.gz +│ │ │ │ └── SAMPLE-A_REP2_part0.notCombined_2.fastq.gz +│ │ │ ├── SAMPLE-B_REP1 +│ │ │ │ ├── SAMPLE-B_REP1_part0.extendedFrags.fastq.gz +│ │ │ │ ├── SAMPLE-B_REP1_part0.hist +│ │ │ │ ├── SAMPLE-B_REP1_part0.histogram +│ │ │ │ ├── SAMPLE-B_REP1_part0.notCombined_1.fastq.gz +│ │ │ │ └── SAMPLE-B_REP1_part0.notCombined_2.fastq.gz +│ │ │ └── SAMPLE-B_REP2 +│ │ │ ├── SAMPLE-B_REP2_part0.extendedFrags.fastq.gz +│ │ │ ├── SAMPLE-B_REP2_part0.hist +│ │ │ ├── SAMPLE-B_REP2_part0.histogram +│ │ │ ├── SAMPLE-B_REP2_part0.notCombined_1.fastq.gz +│ │ │ └── SAMPLE-B_REP2_part0.notCombined_2.fastq.gz +│ │ ├── rebalanced +│ │ │ ├── SAMPLE-A_REP1 +│ │ │ │ ├── flashed +│ │ │ │ │ └── SAMPLE-A_REP1_part0_flashed_1.fastq.gz +│ │ │ │ └── pe +│ │ │ │ ├── SAMPLE-A_REP1_part0_pe_1.fastq.gz +│ │ │ │ └── SAMPLE-A_REP1_part0_pe_2.fastq.gz +│ │ │ ├── SAMPLE-A_REP2 +│ │ │ │ ├── flashed +│ │ │ │ │ └── SAMPLE-A_REP2_part0_flashed_1.fastq.gz +│ │ │ │ └── pe +│ │ │ │ ├── SAMPLE-A_REP2_part0_pe_1.fastq.gz +│ │ │ │ └── SAMPLE-A_REP2_part0_pe_2.fastq.gz +│ │ │ ├── SAMPLE-B_REP1 +│ │ │ │ ├── flashed +│ │ │ │ │ └── SAMPLE-B_REP1_part0_flashed_1.fastq.gz +│ │ │ │ └── pe +│ │ │ │ ├── SAMPLE-B_REP1_part0_pe_1.fastq.gz +│ │ │ │ └── SAMPLE-B_REP1_part0_pe_2.fastq.gz +│ │ │ └── SAMPLE-B_REP2 +│ │ │ ├── flashed +│ │ │ │ └── SAMPLE-B_REP2_part0_flashed_1.fastq.gz +│ │ │ └── pe +│ │ │ ├── SAMPLE-B_REP2_part0_pe_1.fastq.gz +│ │ │ └── SAMPLE-B_REP2_part0_pe_2.fastq.gz +│ │ ├── split +│ │ │ ├── SAMPLE-A_REP1 +│ │ │ │ ├── SAMPLE-A_REP1_part0_1.fastq.gz +│ │ │ │ └── SAMPLE-A_REP1_part0_2.fastq.gz +│ │ │ ├── SAMPLE-A_REP2 +│ │ │ │ ├── SAMPLE-A_REP2_part0_1.fastq.gz +│ │ │ │ └── SAMPLE-A_REP2_part0_2.fastq.gz +│ │ │ ├── SAMPLE-B_REP1 +│ │ │ │ ├── SAMPLE-B_REP1_part0_1.fastq.gz +│ │ │ │ └── SAMPLE-B_REP1_part0_2.fastq.gz +│ │ │ └── SAMPLE-B_REP2 +│ │ │ ├── 
SAMPLE-B_REP2_part0_1.fastq.gz +│ │ │ └── SAMPLE-B_REP2_part0_2.fastq.gz +│ │ └── trimmed +│ │ ├── SAMPLE-A_REP1 +│ │ │ ├── SAMPLE-A_REP1_part0_1.fastq.gz_trimming_report.txt +│ │ │ └── SAMPLE-A_REP1_part0_2.fastq.gz_trimming_report.txt +│ │ ├── SAMPLE-A_REP2 +│ │ │ ├── SAMPLE-A_REP2_part0_1.fastq.gz_trimming_report.txt +│ │ │ └── SAMPLE-A_REP2_part0_2.fastq.gz_trimming_report.txt +│ │ ├── SAMPLE-B_REP1 +│ │ │ ├── SAMPLE-B_REP1_part0_1.fastq.gz_trimming_report.txt +│ │ │ └── SAMPLE-B_REP1_part0_2.fastq.gz_trimming_report.txt +│ │ └── SAMPLE-B_REP2 +│ │ ├── SAMPLE-B_REP2_part0_1.fastq.gz_trimming_report.txt +│ │ └── SAMPLE-B_REP2_part0_2.fastq.gz_trimming_report.txt +│ ├── filtering +│ │ └── repartitioned +│ │ ├── SAMPLE-A_REP1 +│ │ │ ├── flashed +│ │ │ │ └── SAMPLE-A_REP1_part0_flashed.slices.parquet +│ │ │ └── pe +│ │ │ └── SAMPLE-A_REP1_part0_pe.slices.parquet +│ │ ├── SAMPLE-A_REP2 +│ │ │ ├── flashed +│ │ │ │ └── SAMPLE-A_REP2_part0_flashed.slices.parquet +│ │ │ └── pe +│ │ │ └── SAMPLE-A_REP2_part0_pe.slices.parquet +│ │ ├── SAMPLE-B_REP1 +│ │ │ ├── flashed +│ │ │ │ └── SAMPLE-B_REP1_part0_flashed.slices.parquet +│ │ │ └── pe +│ │ │ └── SAMPLE-B_REP1_part0_pe.slices.parquet +│ │ └── SAMPLE-B_REP2 +│ │ ├── flashed +│ │ │ └── SAMPLE-B_REP2_part0_flashed.slices.parquet +│ │ └── pe +│ │ └── SAMPLE-B_REP2_part0_pe.slices.parquet +│ ├── pileups +│ │ └── bedgraphs +│ │ ├── SAMPLE-A_REP1 +│ │ │ ├── norm +│ │ │ │ └── SAMPLE-A_REP1_Slc25A37.bedgraph +│ │ │ └── raw +│ │ │ └── SAMPLE-A_REP1_Slc25A37.bedgraph +│ │ ├── SAMPLE-A_REP2 +│ │ │ ├── norm +│ │ │ │ └── SAMPLE-A_REP2_Slc25A37.bedgraph +│ │ │ └── raw +│ │ │ └── SAMPLE-A_REP2_Slc25A37.bedgraph +│ │ ├── SAMPLE-B_REP1 +│ │ │ ├── norm +│ │ │ │ └── SAMPLE-B_REP1_Slc25A37.bedgraph +│ │ │ └── raw +│ │ │ └── SAMPLE-B_REP1_Slc25A37.bedgraph +│ │ └── SAMPLE-B_REP2 +│ │ ├── norm +│ │ │ └── SAMPLE-B_REP2_Slc25A37.bedgraph +│ │ └── raw +│ │ └── SAMPLE-B_REP2_Slc25A37.bedgraph +│ ├── qc +│ │ ├── alignment_raw +│ │ │ ├── SAMPLE-A_REP1.txt +│ │ │ ├── SAMPLE-A_REP2.txt +│ │ │ ├── SAMPLE-B_REP1.txt +│ │ │ └── SAMPLE-B_REP2.txt +│ │ └── fastqc +│ │ ├── SAMPLE-A_REP1_1_fastqc.html +│ │ ├── SAMPLE-A_REP1_1_fastqc.zip +│ │ ├── SAMPLE-A_REP1_2_fastqc.html +│ │ ├── SAMPLE-A_REP1_2_fastqc.zip +│ │ ├── SAMPLE-A_REP2_1_fastqc.html +│ │ ├── SAMPLE-A_REP2_1_fastqc.zip +│ │ ├── SAMPLE-A_REP2_2_fastqc.html +│ │ ├── SAMPLE-A_REP2_2_fastqc.zip +│ │ ├── SAMPLE-B_REP1_1_fastqc.html +│ │ ├── SAMPLE-B_REP1_1_fastqc.zip +│ │ ├── SAMPLE-B_REP1_2_fastqc.html +│ │ ├── SAMPLE-B_REP1_2_fastqc.zip +│ │ ├── SAMPLE-B_REP2_1_fastqc.html +│ │ ├── SAMPLE-B_REP2_1_fastqc.zip +│ │ ├── SAMPLE-B_REP2_2_fastqc.html +│ │ └── SAMPLE-B_REP2_2_fastqc.zip +│ └── statistics +│ ├── cis_and_trans_reporters +│ │ ├── cis_and_trans_reporters.csv +│ │ └── data +│ │ ├── SAMPLE-A_REP1.reporter.stats.csv +│ │ ├── SAMPLE-A_REP2.reporter.stats.csv +│ │ ├── SAMPLE-B_REP1.reporter.stats.csv +│ │ └── SAMPLE-B_REP2.reporter.stats.csv +│ ├── deduplication +│ │ ├── data +│ │ │ ├── SAMPLE-A_REP1 +│ │ │ ├── SAMPLE-A_REP1.deduplication.csv +│ │ │ ├── SAMPLE-A_REP2 +│ │ │ ├── SAMPLE-A_REP2.deduplication.csv +│ │ │ ├── SAMPLE-B_REP1 +│ │ │ ├── SAMPLE-B_REP1.deduplication.csv +│ │ │ ├── SAMPLE-B_REP2 +│ │ │ └── SAMPLE-B_REP2.deduplication.csv +│ │ └── fastq_deduplication.csv +│ ├── deduplication_by_coordinate +│ │ ├── alignment_deduplication.csv +│ │ └── data +│ │ ├── SAMPLE-A_REP1_flashed.read.stats.csv +│ │ ├── SAMPLE-A_REP1_pe.read.stats.csv +│ │ ├── SAMPLE-A_REP2_flashed.read.stats.csv +│ │ ├── 
SAMPLE-A_REP2_pe.read.stats.csv +│ │ ├── SAMPLE-B_REP1_flashed.read.stats.csv +│ │ ├── SAMPLE-B_REP1_pe.read.stats.csv +│ │ ├── SAMPLE-B_REP2_flashed.read.stats.csv +│ │ └── SAMPLE-B_REP2_pe.read.stats.csv +│ ├── digest_genome +│ │ └── genome_digestion_statistics.txt +│ ├── digestion +│ │ ├── data +│ │ │ ├── SAMPLE-A_REP1_part0_flashed.digestion.filtered.histogram.csv +│ │ │ ├── SAMPLE-A_REP1_part0_flashed.digestion.read.summary.csv +│ │ │ ├── SAMPLE-A_REP1_part0_flashed.digestion.unfiltered.histogram.csv +│ │ │ ├── SAMPLE-A_REP1_part0_pe.digestion.filtered.histogram.csv +│ │ │ ├── SAMPLE-A_REP1_part0_pe.digestion.read.summary.csv +│ │ │ ├── SAMPLE-A_REP1_part0_pe.digestion.unfiltered.histogram.csv +│ │ │ ├── SAMPLE-A_REP2_part0_flashed.digestion.filtered.histogram.csv +│ │ │ ├── SAMPLE-A_REP2_part0_flashed.digestion.read.summary.csv +│ │ │ ├── SAMPLE-A_REP2_part0_flashed.digestion.unfiltered.histogram.csv +│ │ │ ├── SAMPLE-A_REP2_part0_pe.digestion.filtered.histogram.csv +│ │ │ ├── SAMPLE-A_REP2_part0_pe.digestion.read.summary.csv +│ │ │ ├── SAMPLE-A_REP2_part0_pe.digestion.unfiltered.histogram.csv +│ │ │ ├── SAMPLE-B_REP1_part0_flashed.digestion.filtered.histogram.csv +│ │ │ ├── SAMPLE-B_REP1_part0_flashed.digestion.read.summary.csv +│ │ │ ├── SAMPLE-B_REP1_part0_flashed.digestion.unfiltered.histogram.csv +│ │ │ ├── SAMPLE-B_REP1_part0_pe.digestion.filtered.histogram.csv +│ │ │ ├── SAMPLE-B_REP1_part0_pe.digestion.read.summary.csv +│ │ │ ├── SAMPLE-B_REP1_part0_pe.digestion.unfiltered.histogram.csv +│ │ │ ├── SAMPLE-B_REP2_part0_flashed.digestion.filtered.histogram.csv +│ │ │ ├── SAMPLE-B_REP2_part0_flashed.digestion.read.summary.csv +│ │ │ ├── SAMPLE-B_REP2_part0_flashed.digestion.unfiltered.histogram.csv +│ │ │ ├── SAMPLE-B_REP2_part0_pe.digestion.filtered.histogram.csv +│ │ │ ├── SAMPLE-B_REP2_part0_pe.digestion.read.summary.csv +│ │ │ └── SAMPLE-B_REP2_part0_pe.digestion.unfiltered.histogram.csv +│ │ ├── fastq_digestion.csv +│ │ └── fastq_digestion.histogram.csv +│ ├── filtering +│ │ ├── alignment_filtering.csv +│ │ ├── alignment_filtering_slice.csv +│ │ ├── data +│ │ │ ├── SAMPLE-A_REP1_part0_flashed.read.stats.csv +│ │ │ ├── SAMPLE-A_REP1_part0_flashed.slice.stats.csv +│ │ │ ├── SAMPLE-A_REP1_part0_pe.read.stats.csv +│ │ │ ├── SAMPLE-A_REP1_part0_pe.slice.stats.csv +│ │ │ ├── SAMPLE-A_REP2_part0_flashed.read.stats.csv +│ │ │ ├── SAMPLE-A_REP2_part0_flashed.slice.stats.csv +│ │ │ ├── SAMPLE-A_REP2_part0_pe.read.stats.csv +│ │ │ ├── SAMPLE-A_REP2_part0_pe.slice.stats.csv +│ │ │ ├── SAMPLE-B_REP1_part0_flashed.read.stats.csv +│ │ │ ├── SAMPLE-B_REP1_part0_flashed.slice.stats.csv +│ │ │ ├── SAMPLE-B_REP1_part0_pe.read.stats.csv +│ │ │ ├── SAMPLE-B_REP1_part0_pe.slice.stats.csv +│ │ │ ├── SAMPLE-B_REP2_part0_flashed.read.stats.csv +│ │ │ ├── SAMPLE-B_REP2_part0_flashed.slice.stats.csv +│ │ │ ├── SAMPLE-B_REP2_part0_pe.read.stats.csv +│ │ │ └── SAMPLE-B_REP2_part0_pe.slice.stats.csv +│ │ └── logs +│ │ ├── SAMPLE-A_REP1_part0_flashed.log +│ │ ├── SAMPLE-A_REP1_part0_pe.log +│ │ ├── SAMPLE-A_REP2_part0_flashed.log +│ │ ├── SAMPLE-A_REP2_part0_pe.log +│ │ ├── SAMPLE-B_REP1_part0_flashed.log +│ │ ├── SAMPLE-B_REP1_part0_pe.log +│ │ ├── SAMPLE-B_REP2_part0_flashed.log +│ │ └── SAMPLE-B_REP2_part0_pe.log +│ ├── filtering_and_alignment_deduplication.csv +│ └── run_statistics.csv +├── logs +│ ├── align +│ │ ├── SAMPLE-A_REP1_0_flashed.log +│ │ ├── SAMPLE-A_REP1_0_flashed_sort.log +│ │ ├── SAMPLE-A_REP1_0_pe.log +│ │ ├── SAMPLE-A_REP1_0_pe_sort.log +│ │ ├── SAMPLE-A_REP2_0_flashed.log +│ │ ├── 
SAMPLE-A_REP2_0_flashed_sort.log +│ │ ├── SAMPLE-A_REP2_0_pe.log +│ │ ├── SAMPLE-A_REP2_0_pe_sort.log +│ │ ├── SAMPLE-B_REP1_0_flashed.log +│ │ ├── SAMPLE-B_REP1_0_flashed_sort.log +│ │ ├── SAMPLE-B_REP1_0_pe.log +│ │ ├── SAMPLE-B_REP1_0_pe_sort.log +│ │ ├── SAMPLE-B_REP2_0_flashed.log +│ │ ├── SAMPLE-B_REP2_0_flashed_sort.log +│ │ ├── SAMPLE-B_REP2_0_pe.log +│ │ └── SAMPLE-B_REP2_0_pe_sort.log +│ ├── annotate +│ │ ├── SAMPLE-A_REP1 +│ │ │ ├── SAMPLE-A_REP1_part0_flashed.log +│ │ │ └── SAMPLE-A_REP1_part0_pe.log +│ │ ├── SAMPLE-A_REP2 +│ │ │ ├── SAMPLE-A_REP2_part0_flashed.log +│ │ │ └── SAMPLE-A_REP2_part0_pe.log +│ │ ├── SAMPLE-B_REP1 +│ │ │ ├── SAMPLE-B_REP1_part0_flashed.log +│ │ │ └── SAMPLE-B_REP1_part0_pe.log +│ │ └── SAMPLE-B_REP2 +│ │ ├── SAMPLE-B_REP2_part0_flashed.log +│ │ └── SAMPLE-B_REP2_part0_pe.log +│ ├── bedgraph_norm +│ │ ├── SAMPLE-A_REP1_Slc25A37.log +│ │ ├── SAMPLE-A_REP2_Slc25A37.log +│ │ ├── SAMPLE-B_REP1_Slc25A37.log +│ │ └── SAMPLE-B_REP2_Slc25A37.log +│ ├── bedgraph_raw +│ │ ├── SAMPLE-A_REP1_Slc25A37.log +│ │ ├── SAMPLE-A_REP2_Slc25A37.log +│ │ ├── SAMPLE-B_REP1_Slc25A37.log +│ │ └── SAMPLE-B_REP2_Slc25A37.log +│ ├── bedgraph_to_bigwig +│ │ ├── A-B.mean-subtraction.Slc25A37.log +│ │ ├── A.mean-summary.Slc25A37.log +│ │ ├── B-A.mean-subtraction.Slc25A37.log +│ │ ├── B.mean-summary.Slc25A37.log +│ │ ├── SAMPLE-A_REP1_norm_Slc25A37.log +│ │ ├── SAMPLE-A_REP1_raw_Slc25A37.log +│ │ ├── SAMPLE-A_REP2_norm_Slc25A37.log +│ │ ├── SAMPLE-A_REP2_raw_Slc25A37.log +│ │ ├── SAMPLE-B_REP1_norm_Slc25A37.log +│ │ ├── SAMPLE-B_REP1_raw_Slc25A37.log +│ │ ├── SAMPLE-B_REP2_norm_Slc25A37.log +│ │ └── SAMPLE-B_REP2_raw_Slc25A37.log +│ ├── cis_and_trans_stats +│ │ ├── SAMPLE-A_REP1.log +│ │ ├── SAMPLE-A_REP2.log +│ │ ├── SAMPLE-B_REP1.log +│ │ └── SAMPLE-B_REP2.log +│ ├── compare_interactions +│ │ └── Slc25A37.log +│ ├── counts +│ │ ├── SAMPLE-A_REP1.log +│ │ ├── SAMPLE-A_REP2.log +│ │ ├── SAMPLE-B_REP1.log +│ │ └── SAMPLE-B_REP2.log +│ ├── deduplication_fastq +│ │ ├── SAMPLE-A_REP1.log +│ │ ├── SAMPLE-A_REP2.log +│ │ ├── SAMPLE-B_REP1.log +│ │ └── SAMPLE-B_REP2.log +│ ├── differential_interactions +│ │ └── Slc25A37.log +│ ├── digestion +│ │ ├── SAMPLE-A_REP1_0.log +│ │ ├── SAMPLE-A_REP2_0.log +│ │ ├── SAMPLE-B_REP1_0.log +│ │ └── SAMPLE-B_REP2_0.log +│ ├── fastqc +│ │ ├── SAMPLE-A_REP1_1.log +│ │ ├── SAMPLE-A_REP1_2.log +│ │ ├── SAMPLE-A_REP2_1.log +│ │ ├── SAMPLE-A_REP2_2.log +│ │ ├── SAMPLE-B_REP1_1.log +│ │ ├── SAMPLE-B_REP1_2.log +│ │ ├── SAMPLE-B_REP2_1.log +│ │ └── SAMPLE-B_REP2_2.log +│ ├── flash +│ │ ├── SAMPLE-A_REP1_0.log +│ │ ├── SAMPLE-A_REP2_0.log +│ │ ├── SAMPLE-B_REP1_0.log +│ │ └── SAMPLE-B_REP2_0.log +│ ├── make_report.log +│ ├── merge_counts +│ │ ├── SAMPLE-A_REP1.log +│ │ ├── SAMPLE-A_REP2.log +│ │ ├── SAMPLE-B_REP1.log +│ │ └── SAMPLE-B_REP2.log +│ ├── merge_stats_filtering_and_alignment_deduplication.log +│ ├── multiqc.log +│ ├── rebalance_partitions +│ │ ├── SAMPLE-A_REP1_flashed.log +│ │ ├── SAMPLE-A_REP1_pe.log +│ │ ├── SAMPLE-A_REP2_flashed.log +│ │ ├── SAMPLE-A_REP2_pe.log +│ │ ├── SAMPLE-B_REP1_flashed.log +│ │ ├── SAMPLE-B_REP1_pe.log +│ │ ├── SAMPLE-B_REP2_flashed.log +│ │ └── SAMPLE-B_REP2_pe.log +│ ├── remove_duplicate_coordinates +│ │ ├── SAMPLE-A_REP1_flashed.log +│ │ ├── SAMPLE-A_REP1_pe.log +│ │ ├── SAMPLE-A_REP2_flashed.log +│ │ ├── SAMPLE-A_REP2_pe.log +│ │ ├── SAMPLE-B_REP1_flashed.log +│ │ ├── SAMPLE-B_REP1_pe.log +│ │ ├── SAMPLE-B_REP2_flashed.log +│ │ └── SAMPLE-B_REP2_pe.log +│ ├── split +│ │ ├── SAMPLE-A_REP1.log +│ │ ├── SAMPLE-A_REP2.log +│ 
│ ├── SAMPLE-B_REP1.log +│ │ └── SAMPLE-B_REP2.log +│ └── trimming +│ ├── SAMPLE-A_REP1_0.log +│ ├── SAMPLE-A_REP2_0.log +│ ├── SAMPLE-B_REP1_0.log +│ └── SAMPLE-B_REP2_0.log +├── resources +│ ├── restriction_fragments +│ │ ├── genome.digest.bed.gz +│ │ └── genome.digest.log +│ └── viewpoints +│ └── viewpoints.bigBed +└── results + ├── capcruncher_report.html + ├── comparisons + │ ├── bigwigs + │ │ ├── A-B.mean-subtraction.Slc25A37.bigWig + │ │ ├── A.mean-summary.Slc25A37.bigWig + │ │ ├── B-A.mean-subtraction.Slc25A37.bigWig + │ │ └── B.mean-summary.Slc25A37.bigWig + │ └── counts_per_viewpoint + │ └── norm + │ └── Slc25A37.tsv + ├── design_matrix.tsv + ├── differential + │ └── Slc25A37 + ├── figures + │ ├── Slc25A37.pdf + │ └── Slc25A37.toml + ├── full_qc_report_data + │ ├── multiqc_citations.txt + │ ├── multiqc_data.json + │ ├── multiqc_fastqc.txt + │ ├── multiqc_general_stats.txt + │ ├── multiqc.log + │ ├── multiqc_samtools_stats.txt + │ └── multiqc_sources.txt + ├── full_qc_report.html + ├── SAMPLE-A_REP1 + │ ├── bigwigs + │ │ ├── norm + │ │ │ └── SAMPLE-A_REP1_Slc25A37.bigWig + │ │ └── raw + │ │ └── SAMPLE-A_REP1_Slc25A37.bigWig + │ ├── SAMPLE-A_REP1.bam + │ ├── SAMPLE-A_REP1.bam.bai + │ ├── SAMPLE-A_REP1.hdf5 + │ └── SAMPLE-A_REP1.parquet + │ └── part-0.parquet + ├── SAMPLE-A_REP2 + │ ├── bigwigs + │ │ ├── norm + │ │ │ └── SAMPLE-A_REP2_Slc25A37.bigWig + │ │ └── raw + │ │ └── SAMPLE-A_REP2_Slc25A37.bigWig + │ ├── SAMPLE-A_REP2.bam + │ ├── SAMPLE-A_REP2.bam.bai + │ ├── SAMPLE-A_REP2.hdf5 + │ └── SAMPLE-A_REP2.parquet + │ └── part-0.parquet + ├── SAMPLE-B_REP1 + │ ├── bigwigs + │ │ ├── norm + │ │ │ └── SAMPLE-B_REP1_Slc25A37.bigWig + │ │ └── raw + │ │ └── SAMPLE-B_REP1_Slc25A37.bigWig + │ ├── SAMPLE-B_REP1.bam + │ ├── SAMPLE-B_REP1.bam.bai + │ ├── SAMPLE-B_REP1.hdf5 + │ └── SAMPLE-B_REP1.parquet + │ └── part-0.parquet + └── SAMPLE-B_REP2 + ├── bigwigs + │ ├── norm + │ │ └── SAMPLE-B_REP2_Slc25A37.bigWig + │ └── raw + │ └── SAMPLE-B_REP2_Slc25A37.bigWig + ├── SAMPLE-B_REP2.bam + ├── SAMPLE-B_REP2.bam.bai + ├── SAMPLE-B_REP2.hdf5 + └── SAMPLE-B_REP2.parquet + └── part-0.parquet + +142 directories, 324 files +``` + +## 2. Important files + +The `capcruncher_output/results` directory contains the following files: + +- `capcruncher_report.html`: the main report of the pipeline. It contains the + results of the analysis and the figures. + +- `full_qc_report.html`: the full QC report of the pipeline. It contains the + results of the QC analysis and the figures. + +- `design_matrix.tsv`: the design matrix used. + +- `comparisons`: the results of the comparisons between the different + conditions. The results are stored in the `bigwigs` + directories (**Note**: Only for Capture-C and Tri-C) + +- `differential`: the results of the differential analysis (**Note**: Only for + Capture-C and Tri-C) + +- `figures`: Plots generated by the pipeline. for each viewpoint at the coordinates provided in the configuration file. + This also generates templates that can be used with the `capcruncher plot make-plot` command. See [CoolBox API documentation](https://gangcaolab.github.io/CoolBox/api.html) for more parameters that can be used to customize the plots. The current plotting system is a thin wrapper over this library and any parameter specified will be passed to these classes directly. + +- ``: the results of the analysis for each sample. The directory contains the following files: + + - `bigwigs`: the bigwig files of the sample. The files are stored in the + `raw` and `norm` directories. 
The `raw` directory contains the raw + bigwig files. The `norm` directory contains the normalized bigwig files. + **Note**: Only for Capture-C and Tri-C + + - `.bam`: the alignment file of the sample. + + - `.bam.bai`: the index of the alignment file of the sample. + + - `.hdf5`: Cooler formated groups within the HDF5 file per viewpoint. See [Cooler documentation](https://cooler.readthedocs.io/en/latest/) for more information on the format. + + - `.parquet`: This contains all of the data from the sample in a tabular format that can be accessed by any software that reads parquet files e.g. `pandas`, `polars`, [Arrow for R](https://github.com/apache/arrow/tree/main/r). + + +::: {.callout-note} + Unlike regular cooler.hdf5 files, there are multiple groups per file, one per viewpoint. You can use any cooler supported tool but you will need to specify the group name. For example, to get the matrix for the viewpoint `Slc25A37` you can use the following command: + + ```bash + cooler dump -t pixels -o Slc25A37.tsv .hdf5::Slc25A37` + + ``` + +::: diff --git a/config.yml b/config.yml index 0063a724..ffc4d98c 100755 --- a/config.yml +++ b/config.yml @@ -3,49 +3,49 @@ ################################### analysis: - - # Method to use for data analysis, choose from capture | tri | tiled. + + # Method to use for data analysis, choose from capture | tri | tiled. # (Required) - method: ANALYSIS_METHOD - + method: ANALYSIS_METHOD + # Path to viewpoints used for analysis. # This is a bed file containing the coordinates of the captured restricion fragments. # The file must be in four column bed format with the name in the last column. - # (Required) + # (Required) viewpoints: PATH_TO_VIEWPOINTS - + # Restriction enzyme name or recognition site, *not* case sensitive # e.g. DpnII | dpnii | GATC # (Required) restriction_enzyme: dpnii - + # Number of basepairs to exclude around each capture probe to remove re-ligations # (Required) reporter_exclusion_zone: 1000 - # Path to design matrix describing the experimental design. + # Path to design matrix describing the experimental design. # This must have two columns: sample condition # e.g. sample condition # SAMPLE1 DMSO - # SAMPLE2 DMSO + # SAMPLE2 DMSO # If this is not provided, pattern matching will be used to determine the experimental design. # In this case ensure that your FASTQ file names follow the pattern: SAMPLE-NAME-WITH-CONDITION_REPLICATE_[12].fastq(.gz). - # (Optional) + # (Optional) design: PATH_TO_TSV_FORMATTED_DESIGN_MATRIX # Genomic window size(s) (use spaces to separate bin sizes) to use for binning restriction fragment interaction counts - # into even genomic windows. Only bin sizes that are present here will be allowed + # into even genomic windows. Only bin sizes that are present here will be allowed # to be used for heatmap generation. # (Optional) Leave blank to prevent binning bin_sizes: BIN_SIZES genome: - + # Name of genome. UCSC genome names are prefered. Custom names are accepted if chrom_sizes are provided # (Required) name: mm9 - - # Path to fasta file containing entire genome sequence separated by chromosome. + + # Path to fasta file containing entire genome sequence separated by chromosome. # (Required) fasta: PATH_TO_GENOME_FASTA @@ -55,7 +55,7 @@ genome: # (Required) aligner_index: PATH_TO_ALIGNER_INDICIES - # Path to chromosome sizes for genome. + # Path to chromosome sizes for genome. 
# If blank will be determined automatically from the genome (must be a UCSC genome) # This should be a two column tsv file with columns: chromosome_name size # FAI files can also be used. @@ -88,7 +88,7 @@ pipeline: # Name of queue to use. This will be cluster specific. # (Required if using in cluster mode) cluster_queue: batch - + # Maximum number of cores to use per task. (~ 4 works best) # (Required if using in cluster mode) n_cores: 4 @@ -102,7 +102,7 @@ pipeline: # A value of 0 indicates that compression will not be performed at all stages. # It is highly recomended to avoid compression if possible as intermediate files will be removed to reduce space # (Required) - compression: 0 + compression: 5 ################################### @@ -110,14 +110,14 @@ pipeline: ################################### align: - + # Aligner to use. Both bowtie and bowtie2 are supported but bowtie2 is prefered. # (Required) aligner: bowtie2 # Flag to specify index for the aligner. Leave blank if this is not present i.e. bowtie # (Required) - index_flag: -x + index_flag: -x # Aligner specific options. (Take care as this will ignore the number of cores specified earlier). # (Optional) @@ -132,11 +132,11 @@ analysis_optional: # Path to a YAML file containing the filter order to be applied to slices. # See https://github.com/sims-lab/CapCruncher/blob/master/capcruncher/data/test/ccslicefilter_test.yml - # for an example and the filter module of the documentation - # https://capcruncher.readthedocs.io/en/latest/capcruncher_module_documentation/capcruncher.tools.html#capcruncher-tools-filter-module + # for an example and the filter module of the documentation + # https://capcruncher.readthedocs.io/en/latest/capcruncher_module_documentation/capcruncher.tools.html#capcruncher-tools-filter-module # for further details. custom_filtering: PATH_TO_CUSTOM_FILTERING_CONFIG - + # Path to blacklisted regions bed file. Must be a four column named bed file. # Can supply any regions to be removed. # (Optional) @@ -157,7 +157,7 @@ analysis_optional: # Determines if trimmed fastq files are erased after read combining (FLASh). # (Debugging) keep_trimmed: False - + # Determines if digested fastq files are erased after alignment. # (Debugging) keep_digested: False @@ -181,60 +181,60 @@ analysis_optional: # Options: tsv, hdf5, parquet # Highly recomended to leave this as the default as other options may not be as robust # (Optional) - storage_format: parquet - + storage_format: parquet + deduplication: # Turns on initial removal of identical reads from fastq file # Dramatically enhances the speed of the pipeline after this step has completed. # (Optional) - pre-dedup: True + pre-dedup: True hub: - + # Determines if hub is created or not. # True|False - # (Required) + # (Required) create: True - + # Url/IP of server to host bigWigs # e.g. http://userweb.molbiol.ox.ac.uk/ # (Required for hub) url: URL - + # Location of publically accessible location on the server # The directory will be created if it does not exist # (Required for hub) - dir: PATH_TO_HUB_DIRECTORY - + dir: PATH_TO_HUB_DIRECTORY + # Name for the hub # (Required for hub) - name: HUB_NAME - + name: HUB_NAME + # Short hub name # (Required for hub) - short: SHORT_HUB_DESCRIPTION - + short: SHORT_HUB_DESCRIPTION + # Long hub name # (Required for hub) - long: LONGER_HUB_DESCRIPTION - + long: LONGER_HUB_DESCRIPTION + # Email address # (Required for hub) email: EMAIL_ADDRESS - + normalisation: - + # Scaling factor for normalisation. 
- # Current normalisation formula: + # Current normalisation formula: # (REPORTER_COUNT / N_CIS_REPORTERS) * SCALE_FACTOR - # (Required) + # (Required) scale_factor: 1000000 - + # Regions to use for normalising viewpoint pileups. # To use this method, provide the path to a bed file with the inverval(s) - # to be used for normalising *each* viewpoint. All viewpoints must have at least + # to be used for normalising *each* viewpoint. All viewpoints must have at least # one region specified. regions: REGIONS_FOR_NORM @@ -247,7 +247,7 @@ plot: coordinates: PATH_TO_PLOTTING_COORDINATES # Normalisation method to use for the plot. Leave blank for default based on analysis method. - # Choose from: + # Choose from: # * raw - No normalisation # * n_interactions - Normalised based on the number of cis interactions # * n_rf_n_interactions - Normalised based on the number of cis interations and the number of restriction fragments per bin @@ -264,14 +264,14 @@ trim: # Options passed to trim_galore # (Optional) - options: --length 21 + options: --length 21 split: # Fastq files are split for parallel processing. Defines number of reads per fastq file (lower = more files to process) # For Tiled-C or highly enriched assays it is advised to reduce this to 1x10^6 readpairs to speed up the analysis. # The pipeline will cope with a high number of reads but it will run slower. - # (Optional) + # (Optional) n_reads: 1000000 compare: diff --git a/conftest.py b/conftest.py new file mode 100644 index 00000000..a6411e63 --- /dev/null +++ b/conftest.py @@ -0,0 +1,10 @@ +import pytest + + +def pytest_addoption(parser): + parser.addoption("--cores") + + +@pytest.fixture(scope='session', autouse=True) +def cores(request): + return request.config.getoption("--cores") diff --git a/docs/_static/capcruncher_statistics.html b/docs/_static/capcruncher_statistics.html index 42320570..3e4b60b3 100644 --- a/docs/_static/capcruncher_statistics.html +++ b/docs/_static/capcruncher_statistics.html @@ -3,7 +3,7 @@ -capcruncher_statistics +capcruncher_output/statistics @@ -4853,7 +4853,7 @@ .bp3-running-text table th, table.bp3-html-table th{ color:#182026; font-weight:600; } - + .bp3-running-text table td, table.bp3-html-table td{ color:#182026; } @@ -13950,17 +13950,17 @@ }, displayAlign: 'center', CommonHTML: { - linebreaks: { - automatic: true + linebreaks: { + automatic: true } }, "HTML-CSS": { - linebreaks: { - automatic: true + linebreaks: { + automatic: true } } }); - + MathJax.Hub.Queue(["Typeset", MathJax.Hub]); } } @@ -14093,7 +14093,7 @@

Fastq duplication statistics