From 8d07c5e428a594f39f1d079af38eda183fbfd2b2 Mon Sep 17 00:00:00 2001
From: Ian Whaling <78115078+ian-whaling@users.noreply.github.com>
Date: Wed, 8 Nov 2023 09:57:25 -0800
Subject: [PATCH] PIPE-112-MseI-regex (#186)

---
 docs/reference.md                             |  2 +-
 hic_pipeline/get_ligation_site_regex.py       |  1 +
 scripts/make_input_json_from_portal.py        | 48 +++++++++++++------
 .../test_make_input_json_from_portal.py       | 20 ++++----
 4 files changed, 46 insertions(+), 25 deletions(-)

diff --git a/docs/reference.md b/docs/reference.md
index e7c52f96..2639fa67 100644
--- a/docs/reference.md
+++ b/docs/reference.md
@@ -101,7 +101,7 @@ Runs the pipeline starting with a `.hic` file for producing annotations.
   ]
 ]
 ```
-* `restriction_enzymes` is an array of names containing the restriction enzyme(s) used to generate the Hi-C libraries. Currently only `MboI`, `HindIII`, `DpnII`, and `none` are supported. `none` is useful for libraries like DNAse produced using a non-specific cutter.
+* `restriction_enzymes` is an array containing the name(s) of the restriction enzyme(s) used to generate the Hi-C libraries. Currently only `MboI`, `HindIII`, `DpnII`, `MseI`, and `none` are supported. Use `none` for libraries produced with a non-specific cutter, such as DNase.
 * `ligation_site_regex` is a custom regular expression for counting ligation sites. If specified then `restriction_sites` file must be specified in the pipeline input. This can be just a single site, e.g. `ATGC`, or several sites wrapped in parentheses and separated by pipes, e.g. `(ATGC|CTAG)` (uses `grep -E` extended regular expression syntax)
 * `restriction_sites` is a gzipped text file containing cut sites for the given restriction enzyme. For supported enzymes you can generate this using the [reference building entrypoint](#generating-restriction-site-files). Note that if you need to generate a sites file for a multiple digest or for an unsupported enzyme you will need to edit this script and run it yourself: https://github.com/aidenlab/juicer/blob/encode/misc/generate_site_positions.py
 * `chrsz` is a chromosome sizes file for the desired assembly. It is a gzipped and tab-separated text file whose rows take the form `[chromosome][TAB][size]`. You can find these on the ENCODE portal for some human and mouse assemblies, see [reference files](#reference-files)
diff --git a/hic_pipeline/get_ligation_site_regex.py b/hic_pipeline/get_ligation_site_regex.py
index 67670529..375c9e8a 100755
--- a/hic_pipeline/get_ligation_site_regex.py
+++ b/hic_pipeline/get_ligation_site_regex.py
@@ -5,6 +5,7 @@
     "HindIII": "AAGCTAGCTT",
     "DpnII": "GATCGATC",
     "MboI": "GATCGATC",
+    "MseI": "TTATAA",
     "none": "XXXX",
 }
 
diff --git a/scripts/make_input_json_from_portal.py b/scripts/make_input_json_from_portal.py
index ac4061a5..a56b1274 100644
--- a/scripts/make_input_json_from_portal.py
+++ b/scripts/make_input_json_from_portal.py
@@ -19,6 +19,12 @@
             "MboI": urljoin(
                 PORTAL_URL, "/files/ENCFF132WAM/@@download/ENCFF132WAM.txt.gz"
             ),
+            "MseI": urljoin(
+                PORTAL_URL, "/files/ENCFF558CCI/@@download/ENCFF558CCI.txt.gz"
+            ),
+            "MboI+MseI": urljoin(
+                PORTAL_URL, "/files/ENCFF275YUI/@@download/ENCFF275YUI.txt.gz"
+            ),
         },
         "bwa_index": urljoin(
             PORTAL_URL, "/files/ENCFF643CGH/@@download/ENCFF643CGH.tar.gz"
@@ -36,6 +42,12 @@
             "MboI": urljoin(
                 PORTAL_URL, "/files/ENCFF930KBK/@@download/ENCFF930KBK.txt.gz"
             ),
+            "MseI": urljoin(
+                PORTAL_URL, "/files/ENCFF416DZA/@@download/ENCFF416DZA.txt.gz"
+            ),
+            "MboI+MseI": urljoin(
+                PORTAL_URL, "/files/ENCFF708TJX/@@download/ENCFF708TJX.txt.gz"
+            ),
         },
         "bwa_index": urljoin(
             PORTAL_URL, "/files/ENCFF018NEO/@@download/ENCFF018NEO.tar.gz"
@@ -46,7 +58,7 @@
         ),
     },
 }
-ENZYMES = ("HindIII", "DpnII", "MboI", "none")
+ENZYMES = ("HindIII", "DpnII", "MboI", "MseI", "none")
 _NO_ENZYME_FRAGMENTATION_METHODS = (
     "chemical (micrococcal nuclease)",
     "chemical (DNaseI)",
@@ -107,21 +119,27 @@ def get_enzymes_from_experiment(experiment, enzymes=ENZYMES):
     for replicate in experiment["replicates"]:
         fragmentation_methods.extend(replicate["library"]["fragmentation_methods"])
     fragmentation_methods = list(set(fragmentation_methods))
-    if len(fragmentation_methods) > 1:
-        raise ValueError(
-            "Currently only experiments with one fragmentation method are supported"
-        )
-    if fragmentation_methods[0] in _NO_ENZYME_FRAGMENTATION_METHODS:
-        return ["none"]
-    for enzyme in enzymes:
-        if enzyme in fragmentation_methods[0]:
-            used_enzymes.append(enzyme)
-            break
-    if not used_enzymes:
+    for fragmentation_method in fragmentation_methods:
+        if fragmentation_method in _NO_ENZYME_FRAGMENTATION_METHODS:
+            used_enzymes.append("none")
+            continue
+        for enzyme in enzymes:
+            if enzyme in fragmentation_method:
+                used_enzymes.append(enzyme)
+                break
+        if not any(
+            [used_enzyme in fragmentation_method for used_enzyme in used_enzymes]
+        ):
+            raise ValueError(
+                "Unsupported fragmentation method: {}".format(fragmentation_method)
+            )
+    if any([used_enzyme == "none" for used_enzyme in used_enzymes]) and any(
+        [used_enzyme != "none" for used_enzyme in used_enzymes]
+    ):
         raise ValueError(
-            "Unsupported fragmentation method: {}".format(fragmentation_methods[0])
+            "Unsupported fragmentation methods: both specific and non-specific cutters used."
         )
-    return used_enzymes
+    return sorted(used_enzymes)
 
 
 def get_fastqs_from_experiment(experiment, read_group_num_path_parts=1):
@@ -198,7 +216,7 @@ def get_input_json(
         if enzymes != ["none"]:
             input_json["hic.restriction_sites"] = REFERENCE_FILES[assembly_name][
                 "restriction_sites"
-            ][enzymes[0]]
+            ]["+".join(enzymes)]
 
     if ligation_site_regex is not None:
         input_json["hic.ligation_site_regex"] = ligation_site_regex
diff --git a/tests/python/test_make_input_json_from_portal.py b/tests/python/test_make_input_json_from_portal.py
index 61f942c4..83b47c09 100644
--- a/tests/python/test_make_input_json_from_portal.py
+++ b/tests/python/test_make_input_json_from_portal.py
@@ -88,15 +88,17 @@ def test_get_enzymes_from_experiment_mnase_returns_none_enzyme():
     assert result == ["none"]
 
 
-def test_get_enzymes_from_experiment_multiple_fragmentation_methods_raises():
-    experiment = {
-        "replicates": [
-            {"library": {"fragmentation_methods": ["chemical (MboI restriction)"]}},
-            {"library": {"fragmentation_methods": ["chemical (MseI restriction)"]}},
-        ]
-    }
-    with pytest.raises(ValueError):
-        get_enzymes_from_experiment(experiment, enzymes=["MboI", "MseI"])
+def test_get_enzymes_from_experiment_multiple_fragmentation_methods():
+    result = get_enzymes_from_experiment(
+        {
+            "replicates": [
+                {"library": {"fragmentation_methods": ["chemical (MboI restriction)"]}},
+                {"library": {"fragmentation_methods": ["chemical (MseI restriction)"]}},
+            ]
+        },
+        enzymes=["MboI", "MseI"],
+    )
+    assert result == ["MboI", "MseI"]
 
 
 def test_get_enzymes_from_experiment_unknown_fragmentation_methods_raises():