Skip to content

Commit

Permalink
PIPE-112-MseI-regex (#186)
Browse files Browse the repository at this point in the history
  • Loading branch information
ian-whaling authored Nov 8, 2023
1 parent 33d33af commit 8d07c5e
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 25 deletions.
2 changes: 1 addition & 1 deletion docs/reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ Runs the pipeline starting with a `.hic` file for producing annotations.
]
]
```
* `restriction_enzymes` is an array of names containing the restriction enzyme(s) used to generate the Hi-C libraries. Currently only `MboI`, `HindIII`, `DpnII`, and `none` are supported. `none` is useful for libraries produced using a non-specific cutter, such as DNase.
* `restriction_enzymes` is an array of names containing the restriction enzyme(s) used to generate the Hi-C libraries. Currently only `MboI`, `HindIII`, `DpnII`, `MseI`, and `none` are supported. `none` is useful for libraries produced using a non-specific cutter, such as DNase.
* `ligation_site_regex` is a custom regular expression for counting ligation sites. If it is specified, then the `restriction_sites` file must also be specified in the pipeline input. The expression can be just a single site, e.g. `ATGC`, or several sites wrapped in parentheses and separated by pipes, e.g. `(ATGC|CTAG)` (uses `grep -E` extended regular expression syntax).
* `restriction_sites` is a gzipped text file containing cut sites for the given restriction enzyme. For supported enzymes you can generate this using the [reference building entrypoint](#generating-restriction-site-files). Note that if you need to generate a sites file for a multiple digest or for an unsupported enzyme you will need to edit this script and run it yourself: https://github.com/aidenlab/juicer/blob/encode/misc/generate_site_positions.py
* `chrsz` is a chromosome sizes file for the desired assembly. It is a gzipped and tab-separated text file whose rows take the form `[chromosome][TAB][size]`. You can find these on the ENCODE portal for some human and mouse assemblies, see [reference files](#reference-files)
Expand Down
1 change: 1 addition & 0 deletions hic_pipeline/get_ligation_site_regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"HindIII": "AAGCTAGCTT",
"DpnII": "GATCGATC",
"MboI": "GATCGATC",
"MseI": "TTATAA",
"none": "XXXX",
}

Expand Down
48 changes: 33 additions & 15 deletions scripts/make_input_json_from_portal.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@
"MboI": urljoin(
PORTAL_URL, "/files/ENCFF132WAM/@@download/ENCFF132WAM.txt.gz"
),
"MseI": urljoin(
PORTAL_URL, "/files/ENCFF558CCI/@@download/ENCFF558CCI.txt.gz"
),
"MboI+MseI": urljoin(
PORTAL_URL, "/files/ENCFF275YUI/@@download/ENCFF275YUI.txt.gz"
),
},
"bwa_index": urljoin(
PORTAL_URL, "/files/ENCFF643CGH/@@download/ENCFF643CGH.tar.gz"
Expand All @@ -36,6 +42,12 @@
"MboI": urljoin(
PORTAL_URL, "/files/ENCFF930KBK/@@download/ENCFF930KBK.txt.gz"
),
"MseI": urljoin(
PORTAL_URL, "/files/ENCFF416DZA/@@download/ENCFF416DZA.txt.gz"
),
"MboI+MseI": urljoin(
PORTAL_URL, "/files/ENCFF708TJX/@@download/ENCFF708TJX.txt.gz"
),
},
"bwa_index": urljoin(
PORTAL_URL, "/files/ENCFF018NEO/@@download/ENCFF018NEO.tar.gz"
Expand All @@ -46,7 +58,7 @@
),
},
}
ENZYMES = ("HindIII", "DpnII", "MboI", "none")
ENZYMES = ("HindIII", "DpnII", "MboI", "MseI", "none")
_NO_ENZYME_FRAGMENTATION_METHODS = (
"chemical (micrococcal nuclease)",
"chemical (DNaseI)",
Expand Down Expand Up @@ -107,21 +119,27 @@ def get_enzymes_from_experiment(experiment, enzymes=ENZYMES):
for replicate in experiment["replicates"]:
fragmentation_methods.extend(replicate["library"]["fragmentation_methods"])
fragmentation_methods = list(set(fragmentation_methods))
if len(fragmentation_methods) > 1:
raise ValueError(
"Currently only experiments with one fragmentation method are supported"
)
if fragmentation_methods[0] in _NO_ENZYME_FRAGMENTATION_METHODS:
return ["none"]
for enzyme in enzymes:
if enzyme in fragmentation_methods[0]:
used_enzymes.append(enzyme)
break
if not used_enzymes:
for fragmentation_method in fragmentation_methods:
if fragmentation_method in _NO_ENZYME_FRAGMENTATION_METHODS:
used_enzymes.append("none")
continue
for enzyme in enzymes:
if enzyme in fragmentation_method:
used_enzymes.append(enzyme)
break
if not any(
[used_enzyme in fragmentation_method for used_enzyme in used_enzymes]
):
raise ValueError(
"Unsupported fragmentation method: {}".format(fragmentation_method)
)
if any([used_enzyme == "none" for used_enzyme in used_enzymes]) and any(
[used_enzyme != "none" for used_enzyme in used_enzymes]
):
raise ValueError(
"Unsupported fragmentation method: {}".format(fragmentation_methods[0])
"Unsupported fragmentation methods: both specific and non-specific cutters used."
)
return used_enzymes
return sorted(used_enzymes)


def get_fastqs_from_experiment(experiment, read_group_num_path_parts=1):
Expand Down Expand Up @@ -198,7 +216,7 @@ def get_input_json(
if enzymes != ["none"]:
input_json["hic.restriction_sites"] = REFERENCE_FILES[assembly_name][
"restriction_sites"
][enzymes[0]]
]["+".join(enzymes)]

if ligation_site_regex is not None:
input_json["hic.ligation_site_regex"] = ligation_site_regex
Expand Down
20 changes: 11 additions & 9 deletions tests/python/test_make_input_json_from_portal.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,15 +88,17 @@ def test_get_enzymes_from_experiment_mnase_returns_none_enzyme():
assert result == ["none"]


def test_get_enzymes_from_experiment_multiple_fragmentation_methods_raises():
    """Expect a ValueError when replicates list different fragmentation methods."""
    replicates = [
        {"library": {"fragmentation_methods": ["chemical (MboI restriction)"]}},
        {"library": {"fragmentation_methods": ["chemical (MseI restriction)"]}},
    ]
    supported_enzymes = ["MboI", "MseI"]
    with pytest.raises(ValueError):
        get_enzymes_from_experiment(
            {"replicates": replicates}, enzymes=supported_enzymes
        )
def test_get_enzymes_from_experiment_multiple_fragmentation_methods():
    """Two replicates digested with different enzymes should yield both enzymes."""
    mboi_replicate = {
        "library": {"fragmentation_methods": ["chemical (MboI restriction)"]}
    }
    msei_replicate = {
        "library": {"fragmentation_methods": ["chemical (MseI restriction)"]}
    }
    experiment = {"replicates": [mboi_replicate, msei_replicate]}

    result = get_enzymes_from_experiment(experiment, enzymes=["MboI", "MseI"])

    assert result == ["MboI", "MseI"]


def test_get_enzymes_from_experiment_unknown_fragmentation_methods_raises():
Expand Down

0 comments on commit 8d07c5e

Please sign in to comment.