From 8d07c5e428a594f39f1d079af38eda183fbfd2b2 Mon Sep 17 00:00:00 2001 From: Ian Whaling <78115078+ian-whaling@users.noreply.github.com> Date: Wed, 8 Nov 2023 09:57:25 -0800 Subject: [PATCH] PIPE-112-MseI-regex (#186) --- docs/reference.md | 2 +- hic_pipeline/get_ligation_site_regex.py | 1 + scripts/make_input_json_from_portal.py | 48 +++++++++++++------ .../test_make_input_json_from_portal.py | 20 ++++---- 4 files changed, 46 insertions(+), 25 deletions(-) diff --git a/docs/reference.md b/docs/reference.md index e7c52f96..2639fa67 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -101,7 +101,7 @@ Runs the pipeline starting with a `.hic` file for producing annotations. ] ] ``` -* `restriction_enzymes` is an array of names containing the restriction enzyme(s) used to generate the Hi-C libraries. Currently only `MboI`, `HindIII`, `DpnII`, and `none` are supported. `none` is useful for libraries like DNAse produced using a non-specific cutter. +* `restriction_enzymes` is an array of names containing the restriction enzyme(s) used to generate the Hi-C libraries. Currently only `MboI`, `HindIII`, `DpnII`, `MseI`, and `none` are supported. `none` is useful for libraries like DNAse produced using a non-specific cutter. * `ligation_site_regex` is a custom regular expression for counting ligation sites. If specified then `restriction_sites` file must be specified in the pipeline input. This can be just a single site, e.g. `ATGC`, or several sites wrapped in parentheses and separated by pipes, e.g. `(ATGC|CTAG)` (uses `grep -E` extended regular expression syntax) * `restriction_sites` is a gzipped text file containing cut sites for the given restriction enzyme. For supported enzymes you can generate this using the [reference building entrypoint](#generating-restriction-site-files). Note that if you need to generate a sites file for a multiple digest or for an unsupported enzyme you will need to edit this script and run it yourself: https://github.com/aidenlab/juicer/blob/encode/misc/generate_site_positions.py * `chrsz` is a chromosome sizes file for the desired assembly. It is a gzipped and tab-separated text file whose rows take the form `[chromosome][TAB][size]`. You can find these on the ENCODE portal for some human and mouse assemblies, see [reference files](#reference-files) diff --git a/hic_pipeline/get_ligation_site_regex.py b/hic_pipeline/get_ligation_site_regex.py index 67670529..375c9e8a 100755 --- a/hic_pipeline/get_ligation_site_regex.py +++ b/hic_pipeline/get_ligation_site_regex.py @@ -5,6 +5,7 @@ "HindIII": "AAGCTAGCTT", "DpnII": "GATCGATC", "MboI": "GATCGATC", + "MseI": "TTATAA", "none": "XXXX", } diff --git a/scripts/make_input_json_from_portal.py b/scripts/make_input_json_from_portal.py index ac4061a5..a56b1274 100644 --- a/scripts/make_input_json_from_portal.py +++ b/scripts/make_input_json_from_portal.py @@ -19,6 +19,12 @@ "MboI": urljoin( PORTAL_URL, "/files/ENCFF132WAM/@@download/ENCFF132WAM.txt.gz" ), + "MseI": urljoin( + PORTAL_URL, "/files/ENCFF558CCI/@@download/ENCFF558CCI.txt.gz" + ), + "MboI+MseI": urljoin( + PORTAL_URL, "/files/ENCFF275YUI/@@download/ENCFF275YUI.txt.gz" + ), }, "bwa_index": urljoin( PORTAL_URL, "/files/ENCFF643CGH/@@download/ENCFF643CGH.tar.gz" @@ -36,6 +42,12 @@ "MboI": urljoin( PORTAL_URL, "/files/ENCFF930KBK/@@download/ENCFF930KBK.txt.gz" ), + "MseI": urljoin( + PORTAL_URL, "/files/ENCFF416DZA/@@download/ENCFF416DZA.txt.gz" + ), + "MboI+MseI": urljoin( + PORTAL_URL, "/files/ENCFF708TJX/@@download/ENCFF708TJX.txt.gz" + ), }, "bwa_index": urljoin( PORTAL_URL, "/files/ENCFF018NEO/@@download/ENCFF018NEO.tar.gz" @@ -46,7 +58,7 @@ ), }, } -ENZYMES = ("HindIII", "DpnII", "MboI", "none") +ENZYMES = ("HindIII", "DpnII", "MboI", "MseI", "none") _NO_ENZYME_FRAGMENTATION_METHODS = ( "chemical (micrococcal nuclease)", "chemical (DNaseI)", @@ -107,21 +119,27 @@ def get_enzymes_from_experiment(experiment, enzymes=ENZYMES): for replicate in experiment["replicates"]: fragmentation_methods.extend(replicate["library"]["fragmentation_methods"]) fragmentation_methods = list(set(fragmentation_methods)) - if len(fragmentation_methods) > 1: - raise ValueError( - "Currently only experiments with one fragmentation method are supported" - ) - if fragmentation_methods[0] in _NO_ENZYME_FRAGMENTATION_METHODS: - return ["none"] - for enzyme in enzymes: - if enzyme in fragmentation_methods[0]: - used_enzymes.append(enzyme) - break - if not used_enzymes: + for fragmentation_method in fragmentation_methods: + if fragmentation_method in _NO_ENZYME_FRAGMENTATION_METHODS: + used_enzymes.append("none") + continue + for enzyme in enzymes: + if enzyme in fragmentation_method: + used_enzymes.append(enzyme) + break + if not any( + [used_enzyme in fragmentation_method for used_enzyme in used_enzymes] + ): + raise ValueError( + "Unsupported fragmentation method: {}".format(fragmentation_method) + ) + if any([used_enzyme == "none" for used_enzyme in used_enzymes]) and any( + [used_enzyme != "none" for used_enzyme in used_enzymes] + ): raise ValueError( - "Unsupported fragmentation method: {}".format(fragmentation_methods[0]) + "Unsupported fragmentation methods: both specific and non-specific cutters used." ) - return used_enzymes + return sorted(used_enzymes) def get_fastqs_from_experiment(experiment, read_group_num_path_parts=1): @@ -198,7 +216,7 @@ def get_input_json( if enzymes != ["none"]: input_json["hic.restriction_sites"] = REFERENCE_FILES[assembly_name][ "restriction_sites" - ][enzymes[0]] + ]["+".join(enzymes)] if ligation_site_regex is not None: input_json["hic.ligation_site_regex"] = ligation_site_regex diff --git a/tests/python/test_make_input_json_from_portal.py b/tests/python/test_make_input_json_from_portal.py index 61f942c4..83b47c09 100644 --- a/tests/python/test_make_input_json_from_portal.py +++ b/tests/python/test_make_input_json_from_portal.py @@ -88,15 +88,17 @@ def test_get_enzymes_from_experiment_mnase_returns_none_enzyme(): assert result == ["none"] -def test_get_enzymes_from_experiment_multiple_fragmentation_methods_raises(): - experiment = { - "replicates": [ - {"library": {"fragmentation_methods": ["chemical (MboI restriction)"]}}, - {"library": {"fragmentation_methods": ["chemical (MseI restriction)"]}}, - ] - } - with pytest.raises(ValueError): - get_enzymes_from_experiment(experiment, enzymes=["MboI", "MseI"]) +def test_get_enzymes_from_experiment_multiple_fragmentation_methods(): + result = get_enzymes_from_experiment( + { + "replicates": [ + {"library": {"fragmentation_methods": ["chemical (MboI restriction)"]}}, + {"library": {"fragmentation_methods": ["chemical (MseI restriction)"]}}, + ] + }, + enzymes=["MboI", "MseI"], + ) + assert result == ["MboI", "MseI"] def test_get_enzymes_from_experiment_unknown_fragmentation_methods_raises():