PIP-1538-use-assembly-name-for-pre (#97)

ENCODE-DCC · Apr 29, 2021 · 4940e88 · 4940e88
1 parent 5b28d2f
commit 4940e88
Show file tree

Hide file tree

Showing 13 changed files with 278 additions and 51 deletions.
diff --git a/docs/reference.md b/docs/reference.md
@@ -78,14 +78,16 @@ Use the WDL `make_restriction_site_locations.wdl` to generate the restriction si
 * `restriction_sites` is a text file containing cut sites for the given restriction enzyme. For supported enzymes you can generate this using the [reference building entrypoint](#generating-restriction-site-files). Note that if you need to generate a sites file for a multiple digest or for an unsupported enzyme you will need to edit this script and run it yourself: https://github.com/aidenlab/juicer/blob/encode/misc/generate_site_positions.py
 * `chrsz` is a chromosome sizes file for the desired assembly. It is a tab-separated text file whose rows take the form `[chromosome][TAB][size]`. You can find these on the ENCODE portal for some human and mouse assemblies, see [reference files](#reference-files)
 * `reference_index` is a pre-generated BWA index for the desired assembly. Depending on your assembly you may also be able to find these on the ENCODE portal, see [reference files](#reference-files)
+* `input_pairs` is a text file containing the paired fragments to use to generate the .hic contact maps, a detailed format description can be found here: https://github.com/aidenlab/juicer/wiki/Pre#long-format
+* `input_pairs_index` is an index of the `input_pairs` file as generated with `index_by_chr.awk` in task `bam_to_pre`
 * `input_hic` is an input `.hic` file which will be used to call loops and domains
 * `normalization_methods` is an array of normalization methods to use for `.hic` file generation as per Juicer Tools `pre`. If not specified then will use `pre` defaults of `VC`, `VC_SQRT`, `KR`, and `SCALE`. Valid methods are `VC`, `VC_SQRT`, `KR`, `SCALE`, `GW_KR`, `GW_SCALE`, `GW_VC`, `INTER_KR`, `INTER_SCALE`, and `INTER_VC`.
 * `reference_fasta` is FASTA file for the genome of interest to be used for generating restriction site locations. For the output locations file to have a descriptive filename it is also recommended to specify the `assembly_name`
 * `no_bam2pairs` is a boolean which if `true` results in skipping generating `.pairs` files, defaults to `false`
 * `no_call_loops` is a boolean which if `true` results in skipping calling loops, defaults to `false`. Since the loop calling requires GPUs it is recommended to set to `true` if you do not have GPUs available.
 * `no_call_tads` is a boolean which if `true` skips calling domains with arrowhead, defaults to `false`
-* `cpu` is number of threads to use for `bwa` alignment, it is recommended to leave at the default value.
-* `assembly_name` is name of assembly to insert into hic file header, recommended to specify for reproducibility otherwise the resulting `.hic.` file may have variable data in the header (the matrix contents will still be the same).
+* `align_num_cpus` is number of threads to use for `bwa` alignment, it is recommended to leave at the default value.
+* `assembly_name` is the name of the assembly, defaults to "undefined". If the assembly is supported by Juicer Tools `pre` then `.hic` file creation will use Juicer Tools' internal chrom sizes instead of the inputted `chrsz`, see [`Pre` documentation](https://github.com/aidenlab/juicer/wiki/Pre#usage) for list of supported values. The pipeline does some normalization of this value internally, for instance `GRCh38` will be converted into the Juicer Tools-supported `hg38`.
 
 ### Reference files
 

diff --git a/hic.wdl b/hic.wdl
@@ -13,9 +13,9 @@ struct BamAndLigationCount {
 
 workflow hic {
     meta {
-        version: "0.3.0"
-        caper_docker: "encodedcc/hic-pipeline:0.3.0"
-        caper_singularity: "docker://encodedcc/hic-pipeline:0.3.0"
+        version: "0.4.0"
+        caper_docker: "encodedcc/hic-pipeline:0.4.0"
+        caper_singularity: "docker://encodedcc/hic-pipeline:0.4.0"
         croo_out_def: "https://raw.githubusercontent.com/ENCODE-DCC/hic-pipeline/dev/croo_out_def.json"
     }
 
@@ -40,25 +40,7 @@ workflow hic {
         Boolean no_call_loops = false
         Boolean no_call_tads = false
         Int align_num_cpus = 32
-        String? assembly_name
-    }
-
-    parameter_meta {
-        fastq: "Twice nested array of input `FastqPair`s, takes form of [lib_id][fastq_id]"
-        restriction_enzymes: "An array of names containing the restriction enzyme(s) used to generate the Hi-C libraries"
-        restriction_sites: "A text file containing cut sites for the given restriction enzyme. You should generate this file using this script: https://github.com/aidenlab/juicer/blob/encode/misc/generate_site_positions.py"
-        ligation_site_regex: "A custom regex to use for counting ligation site, if specified then restriction_sites file must be manually specified. Can be just a single site, e.g. ATGC, or several sites wrapped in parentheses and separated by pipes, e.g. `(ATGC|CTAG)`"
-        chrsz: "A chromosome sizes file for the desired assembly, this is a tab-separated text file whose rows take the form [chromosome] [size]"
-        reference_index: "A pregenerated BWA index for the desired assembly"
-        normalization_methods: "An array of normalization methods to use for .hic file generation as per Juicer Tools `pre`, if not specified then will use `pre` defaults of VC, VC_SQRT, KR, and SCALE. Valid methods are VC, VC_SQRT, KR, SCALE, GW_KR, GW_SCALE, GW_VC, INTER_KR, INTER_SCALE, and INTER_VC."
-        input_pairs: "A text file containing the paired fragments to use to generate the .hic contact maps, a detailed format description can be found here: https://github.com/aidenlab/juicer/wiki/Pre#long-format"
-        input_pairs_index: "Index of input_pairs as generated with index_by_chr.awk in task bam_to_pre"
-        input_hic: "An input .hic file for which to call loops and domains"
-        no_bam2pairs: "If set to `true`, avoid generating .pairs files, defaults to false"
-        no_call_loops: "If set to `true`, avoid calling loops with hiccups, defaults to false"
-        no_call_tads: "If set to `true`, avoid calling domains with arrowhead, defaults to false"
-        align_num_cpus: "Number of threads to use for bwa alignment"
-        assembly_name: "Name of assembly to insert into hic file header, recommended to specify for reproducbility otherwise hic file will be nondeterministic"
+        String assembly_name = "undefined"
     }
 
     # Default MAPQ thresholds for generating .hic contact maps
@@ -78,6 +60,10 @@ workflow hic {
         }
     }
 
+    call normalize_assembly_name { input:
+        assembly_name = assembly_name
+    }
+
     scatter(i in range(length(fastq))) {
         Array[FastqPair] replicate = fastq[i]
         scatter(fastq_pair in replicate) {
@@ -179,33 +165,48 @@ workflow hic {
             quality = qualities[i],
         }
 
-        call create_hic { input:
-            pre = select_first([input_pairs, bam_to_pre.pre]),
-            pre_index = select_first([input_pairs_index, bam_to_pre.index]),
-            chrsz = select_first([chrsz]),
-            restriction_sites = restriction_sites,
-            quality = qualities[i],
-            stats = calculate_stats.stats,
-            stats_hists = calculate_stats.stats_hists,
-            assembly_name = assembly_name,
-            normalization_methods = normalization_methods,
-        }
-    }
-
-    if ((defined(input_hic) || defined(create_hic.output_hic))) {
-        File hic_file = if defined(input_hic) then select_first([input_hic]) else create_hic.output_hic[1]
-        if (!no_call_tads) {
-            call arrowhead { input:
-                hic_file = hic_file
+        # If Juicer Tools doesn't support the assembly then need to pass chrom sizes
+        if (!normalize_assembly_name.assembly_is_supported) {
+            call create_hic as create_hic_with_chrom_sizes { input:
+                pre = select_first([input_pairs, bam_to_pre.pre]),
+                pre_index = select_first([input_pairs_index, bam_to_pre.index]),
+                chrsz = select_first([chrsz]),
+                restriction_sites = restriction_sites,
+                quality = qualities[i],
+                stats = calculate_stats.stats,
+                stats_hists = calculate_stats.stats_hists,
+                assembly_name = assembly_name,
+                normalization_methods = normalization_methods,
             }
         }
-        if (!no_call_loops) {
-            call hiccups { input:
-                hic_file = hic_file
+
+        if (normalize_assembly_name.assembly_is_supported) {
+            call create_hic { input:
+                pre = select_first([input_pairs, bam_to_pre.pre]),
+                pre_index = select_first([input_pairs_index, bam_to_pre.index]),
+                restriction_sites = restriction_sites,
+                quality = qualities[i],
+                stats = calculate_stats.stats,
+                stats_hists = calculate_stats.stats_hists,
+                assembly_name = assembly_name,
+                normalization_methods = normalization_methods,
             }
         }
     }
 
+    File hic_file = select_first(
+        [input_hic, create_hic.output_hic[1], create_hic_with_chrom_sizes.output_hic[1]]
+    )
+    if (!no_call_tads) {
+        call arrowhead { input:
+            hic_file = hic_file
+        }
+    }
+    if (!no_call_loops) {
+        call hiccups { input:
+            hic_file = hic_file
+        }
+    }
 }
 
 task get_ligation_site_regex {
@@ -234,6 +235,32 @@ task get_ligation_site_regex {
     }
 }
 
+# Runs `normalize_assembly_name.py` on the given assembly name, then reads back the
+# normalized name and the supported/unsupported flag from the two files it writes.
+task normalize_assembly_name {
+    input {
+        String assembly_name
+        String normalized_assembly_name_output_path = "normalized_assembly_name.txt"
+        String assembly_is_supported_output_path = "is_supported.txt"
+    }
+
+    command <<<
+        set -euo pipefail
+        # Quote every interpolated value so that names or paths containing whitespace
+        # or shell metacharacters reach the script as single, literal arguments
+        python3 "$(which normalize_assembly_name.py)" \
+            '~{assembly_name}' \
+            '~{normalized_assembly_name_output_path}' \
+            '~{assembly_is_supported_output_path}'
+    >>>
+
+    output {
+        String normalized_assembly_name = read_string("~{normalized_assembly_name_output_path}")
+        Boolean assembly_is_supported = read_boolean("~{assembly_is_supported_output_path}")
+    }
+
+    runtime {
+        cpu : "1"
+        memory: "500 MB"
+    }
+}
+
 task align {
     input {
         FastqPair fastq_pair
@@ -642,7 +669,7 @@ task hiccups {
         cpu : "1"
         bootDiskSizeGb: "20"
         disks: "local-disk 100 SSD"
-        docker: "encodedcc/hic-pipeline:0.3.0_hiccups"
+        docker: "encodedcc/hic-pipeline:0.4.0_hiccups"
         gpuType: "nvidia-tesla-p100"
         gpuCount: 1
         memory: "8 GB"

diff --git a/hic_pipeline/__init__.py b/hic_pipeline/__init__.py
@@ -1,5 +1,5 @@
 __title__ = "hic-pipeline"
-__version__ = "0.3.0"
+__version__ = "0.4.0"
 __description__ = "ENCODE Hi-C uniform processing pipeline."
 __url__ = "https://github.com/ENCODE-DCC/hic-pipeline"
 __uri__ = __url__

diff --git a/hic_pipeline/normalize_assembly_name.py b/hic_pipeline/normalize_assembly_name.py
@@ -0,0 +1,89 @@
+import argparse
+
+# Assembly names compared case-insensitively against the requested assembly in
+# `normalize_assembly_name`; a match is reported as a supported assembly. Entries keep
+# their canonical capitalization because the matching entry is returned verbatim.
+VALID_ASSEMBLIES = (
+    "hg18",
+    "hg19",
+    "hg38",
+    "dMel",
+    "mm9",
+    "mm10",
+    "anasPlat1",
+    "bTaurus3",
+    "canFam3",
+    "equCab2",
+    "galGal4",
+    "Pf3D7",
+    "sacCer3",
+    "sCerS288c",
+    "susScr3",
+    "TAIR10",
+)
+
+
+def main():
+    """
+    Command line entrypoint: normalize the assembly name given on the command line and
+    write the normalized name and the WDL-style supported flag to their output files.
+    """
+    args = get_parser().parse_args()
+    result = normalize_assembly_name(args.assembly_name)
+    normalized_name, assembly_is_supported = result
+    supported_flag = get_wdl_boolean_string(assembly_is_supported)
+    write_string_to_file(normalized_name, args.normalized_name_outfile)
+    write_string_to_file(supported_flag, args.assembly_is_supported_outfile)
+
+
+def normalize_assembly_name(assembly_name):
+    """
+    Match the assembly name case-insensitively against `VALID_ASSEMBLIES`, first
+    converting names containing `grch` with `normalize_grch_name`.
+
+    Returns a `(name, is_supported)` tuple. On a match `name` is the canonically
+    capitalized entry from `VALID_ASSEMBLIES`, otherwise it is the lowercased (and
+    possibly `grch`-converted) input.
+    """
+    lowered = assembly_name.lower()
+    if "grch" in lowered:
+        lowered = normalize_grch_name(lowered)
+    matches = (name for name in VALID_ASSEMBLIES if name.lower() == lowered)
+    canonical = next(matches, None)
+    if canonical is None:
+        return lowered, False
+    return canonical, True
+
+
+def normalize_grch_name(assembly_name):
+    """
+    Convert `GRCh` names to `hg` ones, GRCh38 = hg38, GRCh37 = hg19, and
+    GRCh36 = hg18.
+
+    The `grch` prefix is matched case-insensitively. Versions below 38 are shifted
+    down by 18 to reach the `hg` numbering, versions from 38 onwards keep their
+    number. If what remains after removing `grch` is not an integer (for example the
+    patch suffix in `GRCh38.p13`) the name is returned unchanged instead of raising a
+    `ValueError`, so callers can treat it as an unrecognized assembly.
+    """
+    version_text = assembly_name.lower().replace("grch", "")
+    try:
+        assembly_version = int(version_text)
+    except ValueError:
+        # Not a plain `GRCh<number>` name, leave it for the caller to classify
+        return assembly_name
+    if assembly_version < 38:
+        hg_assembly_version = assembly_version - 18
+    else:
+        hg_assembly_version = assembly_version
+    return "hg" + str(hg_assembly_version)
+
+
+def get_wdl_boolean_string(boolean):
+    """
+    Render a value as lowercase text. WDL's `read_boolean` expects `true` or `false`,
+    whereas Python's `str` produces the capitalized `True` and `False`.
+    """
+    python_text = str(boolean)
+    return python_text.lower()
+
+
+def write_string_to_file(data, filename):
+    """
+    Write the string `data` to `filename`, opening the file in `w` mode so any
+    existing contents are replaced.
+    """
+    with open(filename, mode="w") as output_file:
+        output_file.write(data)
+
+
+def get_parser():
+    """
+    Build the command line parser, which takes three positional arguments: the
+    assembly name followed by the two output file paths.
+    """
+    positional_arguments = (
+        ("assembly_name", "Assembly name"),
+        ("normalized_name_outfile", "Name of file to write normalized name"),
+        (
+            "assembly_is_supported_outfile",
+            "Name for file to write boolean indicating if the assembly is supported by "
+            "Juicer",
+        ),
+    )
+    parser = argparse.ArgumentParser()
+    for argument_name, help_text in positional_arguments:
+        parser.add_argument(argument_name, help=help_text)
+    return parser
+
+
+# Run the command line entrypoint only when executed as a script, not on import
+if __name__ == "__main__":
+    main()
diff --git a/make_restriction_site_locations.wdl b/make_restriction_site_locations.wdl
@@ -2,9 +2,9 @@ version 1.0
 
 workflow make_restriction_site_locations {
     meta {
-        version: "0.3.0"
-        caper_docker: "encodedcc/hic-pipeline:0.3.0"
-        caper_singularity: "docker://encodedcc/hic-pipeline:0.3.0"
+        version: "0.4.0"
+        caper_docker: "encodedcc/hic-pipeline:0.4.0"
+        caper_singularity: "docker://encodedcc/hic-pipeline:0.4.0"
     }
 
     parameter_meta {

diff --git a/tests/functional/test_multiple_libraries.py b/tests/functional/test_multiple_libraries.py
@@ -6,7 +6,9 @@
 @pytest.mark.workflow("test_multiple_libraries")
 def test_multiple_libraries_hic_match(workflow_dir):
     hic_path = next(
-        workflow_dir.glob("hic/*/call-create_hic/shard-1/execution/inter_30.hic")
+        workflow_dir.glob(
+            "hic/*/call-create_hic_with_chrom_sizes/shard-1/execution/inter_30.hic"
+        )
     )
     hic_md5sum = hashlib.md5(hic_path.read_bytes()).hexdigest()
     assert hic_md5sum == "0a65447aecd388044978367ce5e5ae97"
diff --git a/tests/integration/json/test_normalize_assembly_name.json b/tests/integration/json/test_normalize_assembly_name.json
@@ -0,0 +1,5 @@
+{
+  "test_normalize_assembly_name.assembly_is_supported_output_path": "is_supported.txt",
+  "test_normalize_assembly_name.assembly_name": "GRCh38",
+  "test_normalize_assembly_name.normalized_assembly_name_output_path": "normalized.txt"
+}
diff --git a/tests/integration/test_normalize_assembly_name.yaml b/tests/integration/test_normalize_assembly_name.yaml
@@ -0,0 +1,13 @@
+---
+  - name: test_normalize_assembly_name
+    tags:
+      - integration
+    command: >-
+      tests/caper_run.sh
+      tests/integration/wdl/test_normalize_assembly_name.wdl
+      tests/integration/json/test_normalize_assembly_name.json
+    # Cannot easily check the file contents, since not copied to test-output
+    stdout:
+      contains:
+        - '"test_normalize_assembly_name.normalize_assembly_name.normalized_assembly_name": "hg38"'
+        - '"test_normalize_assembly_name.normalize_assembly_name.assembly_is_supported": true'
diff --git a/tests/integration/wdl/test_normalize_assembly_name.wdl b/tests/integration/wdl/test_normalize_assembly_name.wdl
@@ -0,0 +1,17 @@
+version 1.0
+
+import "../../../hic.wdl" as hic
+
+# Test harness workflow: forwards its three inputs unchanged to the
+# `normalize_assembly_name` task imported from `hic.wdl` so the task can be run alone.
+workflow test_normalize_assembly_name {
+    input {
+        String assembly_name
+        String normalized_assembly_name_output_path
+        String assembly_is_supported_output_path
+    }
+
+    call hic.normalize_assembly_name { input:
+        assembly_name = assembly_name,
+        normalized_assembly_name_output_path = normalized_assembly_name_output_path,
+        assembly_is_supported_output_path = assembly_is_supported_output_path,
+    }
+}
diff --git a/tests/python/test_normalize_assembly_name.py b/tests/python/test_normalize_assembly_name.py
@@ -0,0 +1,37 @@
+import pytest
+
+from hic_pipeline.normalize_assembly_name import (
+    get_wdl_boolean_string,
+    normalize_assembly_name,
+    normalize_grch_name,
+)
+
+
+@pytest.mark.parametrize(
+    "assembly_name,expected",
+    [
+        ("GRCh38", ("hg38", True)),
+        ("hg19", ("hg19", True)),
+        ("GRCh37", ("hg19", True)),
+        ("grch36", ("hg18", True)),
+        ("unknown", ("unknown", False)),
+    ],
+)
+def test_normalize_assembly_name(assembly_name, expected):
+    """
+    Covers GRCh-to-hg conversion in mixed and lower case, a name that is already
+    canonical, and a name that is not in the supported list.
+    """
+    result = normalize_assembly_name(assembly_name)
+    assert result == expected
+
+
+@pytest.mark.parametrize(
+    "assembly_name,expected",
+    [("GRCh38", "hg38"), ("GRCh37", "hg19"), ("GRCh36", "hg18")],
+)
+def test_normalize_grch_name_name(assembly_name, expected):
+    """
+    Checks the GRCh-to-hg mapping for versions 38, 37 and 36.
+    """
+    result = normalize_grch_name(assembly_name)
+    assert result == expected
+
+
+@pytest.mark.parametrize("boolean,expected", [(True, "true"), (False, "false")])
+def test_get_wdl_boolean_string(boolean, expected):
+    """
+    Checks that Python booleans are rendered as lowercase `true` and `false`.
+    """
+    result = get_wdl_boolean_string(boolean)
+    assert result == expected
diff --git a/tests/unit/json/test_normalize_assembly_name.json b/tests/unit/json/test_normalize_assembly_name.json
@@ -0,0 +1,5 @@
+{
+  "test_normalize_assembly_name.assembly_is_supported_output_path": "bar.txt",
+  "test_normalize_assembly_name.assembly_name": "hg38",
+  "test_normalize_assembly_name.normalized_assembly_name_output_path": "foo.txt"
+}
diff --git a/tests/unit/test_normalize_assembly_name.yaml b/tests/unit/test_normalize_assembly_name.yaml
@@ -0,0 +1,13 @@
+---
+  - name: test_normalize_assembly_name_unit
+    tags:
+      - unit
+    command: >-
+      tests/caper_run.sh
+      tests/unit/wdl/test_normalize_assembly_name.wdl
+      tests/unit/json/test_normalize_assembly_name.json
+    stdout:
+      contains:
+        - "hg38"
+        - "foo.txt"
+        - "bar.txt"