Use standard genepred file in ROSE

nf-core committed on Apr 25, 2024 · cd1cb9a
1 parent 7c63762
commit cd1cb9a
Show file tree

Hide file tree

Showing 9 changed files with 154 additions and 35 deletions.
diff --git a/conf/modules.config b/conf/modules.config
@@ -62,6 +62,10 @@ process {
         ext.prefix = {"${meta.id}_control"}
     }
 
+    withName: UCSC_GTFTOGENEPRED {
+        ext.args = "-genePredExt"
+    }
+
     withName: ".*DYNAMITE:FILTER" {
         ext.args = {"'BEGIN{OFS=\"\\t\"} NR==1 || (\$2 >= ${params.dynamite_min_regression} || \$2 <= -${params.dynamite_min_regression} )'"}
         ext.prefix = {"${meta.id}.filtered"}

diff --git a/modules.json b/modules.json
@@ -49,6 +49,11 @@
                         "branch": "master",
                         "git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62",
                         "installed_by": ["modules"]
+                    },
+                    "ucsc/gtftogenepred": {
+                        "branch": "master",
+                        "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
+                        "installed_by": ["modules"]
                     }
                 }
             },

diff --git a/modules/local/rose/main.nf b/modules/local/rose/main.nf
@@ -9,7 +9,7 @@ process ROSE {
 
     input:
     tuple val(meta), path(bed)
-    path ucsc_file
+    tuple val(meta2), path(genepred)
 
     output:
     tuple val(meta), path("${meta.id}.rose.bed"), emit: stitched

diff --git a/modules/local/rose/templates/rose.py b/modules/local/rose/templates/rose.py
@@ -166,33 +166,31 @@ def format_folder(folder_name, create=False):
 # ==================================================================
 
 
-def make_start_dict(annot_file, gene_list=[]):
+def make_start_dict(annot_file):
     """
     makes a dictionary keyed by refseq ID that contains information about
     chrom/start/stop/strand/common name
     """
 
-    if type(gene_list) == str:
-        gene_list = parse_table(gene_list, '\\t')
-        gene_list = [line[0] for line in gene_list]
-
-    if annot_file.upper().count('REFSEQ') == 1:
-        refseq_table, refseq_dict = import_refseq(annot_file)
-        if len(gene_list) == 0:
-            gene_list = list(refseq_dict.keys())
-        start_dict = {}
-        for gene in gene_list:
-            if gene not in refseq_dict:
-                continue
-            start_dict[gene] = {}
-            start_dict[gene]['sense'] = refseq_table[refseq_dict[gene][0]][3]
-            start_dict[gene]['chr'] = refseq_table[refseq_dict[gene][0]][2]
-            start_dict[gene]['start'] = get_tsss([gene], refseq_table, refseq_dict)
-            if start_dict[gene]['sense'] == '+':
-                start_dict[gene]['end'] = [int(refseq_table[refseq_dict[gene][0]][5])]
-            else:
-                start_dict[gene]['end'] = [int(refseq_table[refseq_dict[gene][0]][4])]
-            start_dict[gene]['name'] = refseq_table[refseq_dict[gene][0]][12]
+    transcripts = []
+
+    refseq_table, refseq_dict = import_refseq(annot_file)
+    if len(transcripts) == 0:
+        transcripts = list(refseq_dict.keys())
+    start_dict = {}
+    for transcript in transcripts:
+        if transcript not in refseq_dict:
+            continue
+        start_dict[transcript] = {}
+        start_dict[transcript]['sense'] = refseq_table[refseq_dict[transcript][0]][2]
+        start_dict[transcript]['chr'] = refseq_table[refseq_dict[transcript][0]][1]
+        start_dict[transcript]['start'] = get_tsss([transcript], refseq_table, refseq_dict)
+        if start_dict[transcript]['sense'] == '+':
+            start_dict[transcript]['end'] = [int(refseq_table[refseq_dict[transcript][0]][4])]
+        else:
+            start_dict[transcript]['end'] = [int(refseq_table[refseq_dict[transcript][0]][3])]
+        start_dict[transcript]['name'] = refseq_table[refseq_dict[transcript][0]][11]
+
     return start_dict
 
 
@@ -204,10 +202,10 @@ def get_tsss(gene_list, refseq_table, refseq_dict):
         refseq = refseq_from_key(gene_list, refseq_dict, refseq_table)
     tss = []
     for line in refseq:
-        if line[3] == '+':
+        if line[2] == '+':
+            tss.append(line[3])
+        if line[2] == '-':
             tss.append(line[4])
-        if line[3] == '-':
-            tss.append(line[5])
     tss = list(map(int, tss))
 
     return tss
@@ -234,12 +232,13 @@ def import_refseq(refseq_file, return_multiples=False):
     """
     refseq_table = parse_table(refseq_file, '\\t')
     refseq_dict = {}
-    ticker = 1
-    for line in refseq_table[1:]:
-        if line[1] in refseq_dict:
-            refseq_dict[line[1]].append(ticker)
+    ticker = 0
+    for line in refseq_table:
+        transcript = line[0]
+        if transcript in refseq_dict:
+            refseq_dict[transcript].append(ticker)
         else:
-            refseq_dict[line[1]] = [ticker]
+            refseq_dict[transcript] = [ticker]
         ticker = ticker + 1
 
     multiples = []
@@ -616,7 +615,7 @@ def idfun(x): return x
     return result
 
 
-start_dict = make_start_dict("$ucsc_file")
+start_dict = make_start_dict("$genepred")
 locus_collection = bed_to_locus_collection("$bed")
 stitched_collection = region_stitching(locus_collection, int("$stitch"), int("$tss_dist"), start_dict)
 stitched = locus_collection_to_bed(stitched_collection)

diff --git a/modules/nf-core/ucsc/gtftogenepred/environment.yml b/modules/nf-core/ucsc/gtftogenepred/environment.yml
diff --git a/modules/nf-core/ucsc/gtftogenepred/main.nf b/modules/nf-core/ucsc/gtftogenepred/main.nf
diff --git a/modules/nf-core/ucsc/gtftogenepred/meta.yml b/modules/nf-core/ucsc/gtftogenepred/meta.yml
diff --git a/subworkflows/local/peaks.nf b/subworkflows/local/peaks.nf
@@ -64,7 +64,7 @@ workflow PEAKS {
     }
 
     CHROMHMM(ch_samplesheet_bam, chrom_sizes, chromhmm_states, chromhmm_threshold, chromhmm_marks)
-    ROSE(CHROMHMM.out.enhancers, rose_ucsc)
+    ROSE(CHROMHMM.out.enhancers, gtf.map{gtf -> [[id: "gtf"], gtf]})
 
     ch_versions = ch_versions.mix(CHROMHMM.out.versions)
     ch_versions = ch_versions.mix(ROSE.out.versions)

diff --git a/subworkflows/local/rose.nf b/subworkflows/local/rose.nf
@@ -1,17 +1,20 @@
 include { ROSE as RUN_ROSE           } from "../../modules/local/rose"
+include { UCSC_GTFTOGENEPRED              } from "../../modules/nf-core/ucsc/gtftogenepred"
 
 workflow ROSE {
     take:
     ch_bed
-    ucsc_file
+    ch_gtf
 
     main:
 
     ch_versions = Channel.empty()
 
-    RUN_ROSE(ch_bed, ucsc_file)
+    UCSC_GTFTOGENEPRED(ch_gtf)
+    RUN_ROSE(ch_bed, UCSC_GTFTOGENEPRED.out.genepred)
 
     ch_versions = ch_versions.mix(RUN_ROSE.out.versions)
+    ch_versions = ch_versions.mix(UCSC_GTFTOGENEPRED.out.versions)
 
     emit:
     enhancers = RUN_ROSE.out.stitched