From 3db7bd5cf2a39cbfac67f64e220a52e397192c72 Mon Sep 17 00:00:00 2001
From: Leon Hafner
Date: Tue, 11 Jun 2024 11:07:49 +0200
Subject: [PATCH] Replaced ROSE with bedtools workflow

---
 conf/modules.config                  | 51 ++-
 modules/local/rose/main.nf           | 27 --
 modules/local/rose/templates/rose.py | 621 ---------------------------
 nextflow.config                      | 2 +-
 nextflow_schema.json                 | 2 +-
 subworkflows/local/peaks.nf          | 6 +-
 subworkflows/local/rose.nf           | 93 +++-
 workflows/tfactivity.nf              | 2 +-
 8 files changed, 139 insertions(+), 665 deletions(-)
 delete mode 100644 modules/local/rose/main.nf
 delete mode 100755 modules/local/rose/templates/rose.py

diff --git a/conf/modules.config b/conf/modules.config
index 97b7d44..38d5ee7 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -28,6 +28,53 @@ process {
         ext.suffix = "bed"
     }
 
+    withName: FILTER_CONVERT_GTF {
+        ext.args = {"'BEGIN {OFS = \"\\t\"} \$3 == \"transcript\" {print \$1, \$4-1, \$4, \$1 \":\" \$4-1 \"-\" \$4, \$6, \$7}'"}
+        ext.suffix = "bed"
+    }
+
+    withName: SORT_BED {
+        ext.args = "-k1,1 -k2,2n"
+        ext.prefix = {"${meta.id}_sorted"}
+        ext.suffix = "bed"
+    }
+
+    withName: CONSTRUCT_TSS {
+        ext.args = "-b 2500"
+        ext.prefix = "tss"
+    }
+
+    withName: FILTER_PREDICTIONS {
+        ext.args = "-A -f 1"
+        ext.prefix = {"${meta.id}_filtered"}
+    }
+
+    withName: STITCHING {
+        ext.args = "-d 12500"
+        ext.prefix = {"${meta.id}_stitched"}
+    }
+
+    withName: TSS_OVERLAP {
+        ext.args = "-c"
+        ext.prefix = {"${meta.id}_tss-overlap-counts"}
+    }
+
+    withName: FILTER_OVERLAPS {
+        ext.args = {"'BEGIN {OFS = \"\\t\"} \$NF >= 2 {print \$1, \$2, \$3}'"}
+        ext.prefix = {"${meta.id}_overlap"}
+    }
+
+    withName: UNSTITCHED_REGIONS {
+        ext.args = "-F 1"
+        ext.prefix = {"${meta.id}_original_regions"}
+    }
+
+    withName: CONCAT_AND_SORT {
+        ext.args = "-k1,1 -k2,2n"
+        ext.prefix = {"${meta.id}_stitched"}
+        ext.suffix = "bed"
+    }
+
     withName: BEDTOOLS_SORT {
         ext.prefix = {"${meta.id}.sorted"}
     }
@@ -62,10 +109,6 @@ process {
         ext.prefix = {"${meta.id}_control"}
     }
 
-    withName: UCSC_GTFTOGENEPRED {
-        ext.args = "-genePredExt"
-    }
-
     withName: ".*DYNAMITE:FILTER" {
         ext.args = {"'BEGIN{OFS=\"\\t\"} NR==1 || (\$2 >= ${params.dynamite_min_regression} || \$2 <= -${params.dynamite_min_regression} )'"}
         ext.prefix = {"${meta.id}.filtered"}
diff --git a/modules/local/rose/main.nf b/modules/local/rose/main.nf
deleted file mode 100644
index e9c99f3..0000000
--- a/modules/local/rose/main.nf
+++ /dev/null
@@ -1,27 +0,0 @@
-process ROSE {
-    tag "$meta.id"
-    label 'process_single'
-
-    conda "conda-forge::mulled-v2-2076f4a3fb468a04063c9e6b7747a630abb457f6==fccb0c41a243c639e11dd1be7b74f563e624fcca-0"
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
- 'https://depot.galaxyproject.org/singularity/mulled-v2-2076f4a3fb468a04063c9e6b7747a630abb457f6:fccb0c41a243c639e11dd1be7b74f563e624fcca-0': - 'biocontainers/mulled-v2-2076f4a3fb468a04063c9e6b7747a630abb457f6:fccb0c41a243c639e11dd1be7b74f563e624fcca-0' }" - - input: - tuple val(meta), path(bed) - tuple val(meta2), path(genepred) - - output: - tuple val(meta), path("${meta.id}.rose.bed"), emit: stitched - path("versions.yml") , emit: versions - - script: - stitch = 12500 - tss_dist = 2500 - template "rose.py" - - stub: - """ - touch "${meta.id}.rose.bed" - """ -} diff --git a/modules/local/rose/templates/rose.py b/modules/local/rose/templates/rose.py deleted file mode 100755 index 407f825..0000000 --- a/modules/local/rose/templates/rose.py +++ /dev/null @@ -1,621 +0,0 @@ -#!/usr/bin/env python3 - -import os -import platform - -def format_yaml_like(data: dict, indent: int = 0) -> str: - """Formats a dictionary to a YAML-like string. - - Args: - data (dict): The dictionary to format. - indent (int): The current indentation level. - - Returns: - str: A string formatted as YAML. - """ - yaml_str = "" - for key, value in data.items(): - spaces = " " * indent - if isinstance(value, dict): - yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" - else: - yaml_str += f"{spaces}{key}: {value}\\n" - return yaml_str - -def region_stitching(bound_collection, stitch_window, tss_window, start_dict): - print('Performing region stitching...') - - remove_tss = tss_window != 0 - - # filter out all bound regions that overlap the TSS of an ACTIVE GENE - if remove_tss: - # now makeTSS loci for active genes - remove_ticker = 0 - # this loop makes a locus centered around +/- tss_window of transcribed genes - # then adds it to the list tss_loci - tss_loci = [] - for gene_id in list(start_dict.keys()): - tss_loci.append(make_tss_locus(gene_id, start_dict, tss_window, tss_window)) - - # this turns the tss_loci list into a LocusCollection - # 50 is the internal parameter for LocusCollection and doesn't really matter - tss_collection = LocusCollection(tss_loci, 50) - - # gives all the loci in bound_collection - bound_loci = bound_collection.get_loci() - - # this loop will check if each bound region is contained by the TSS exclusion zone - # this will drop out a lot of the promoter only regions that are tiny - # typical exclusion window is around 2kb - for locus in bound_loci: - if len(tss_collection.get_containers(locus, 'both')) > 0: - # if true, the bound locus overlaps an active gene - bound_collection.remove(locus) - remove_ticker += 1 - print(f'Removed {remove_ticker} loci because they were contained by a TSS') - - # bound_collection is now all enriched region loci that don't overlap an active TSS - stitched_collection = bound_collection.stitch_collection(stitch_window, 'both') - - if remove_tss: - # now replace any stitched region that overlap 2 distinct genes - # with the original loci that were there - fixed_loci = [] - tss_loci = [] - for gene_id in list(start_dict.keys()): - tss_loci.append(make_tss_locus(gene_id, start_dict, 50, 50)) - - # this turns the tss_loci list into a LocusCollection - # 50 is the internal parameter for LocusCollection and doesn't really matter - tss_collection = LocusCollection(tss_loci, 50) - remove_ticker = 0 - original_ticker = 0 - for stitched_locus in stitched_collection.get_loci(): - overlapping_tss_loci = tss_collection.get_overlap(stitched_locus, 'both') - tss_names = [start_dict[tssLocus.id()]['name'] for tssLocus in overlapping_tss_loci] - tss_names = 
uniquify(tss_names) - if len(tss_names) > 2: - original_loci = bound_collection.get_overlap(stitched_locus, 'both') - original_ticker += len(original_loci) - fixed_loci += original_loci - remove_ticker += 1 - else: - fixed_loci.append(stitched_locus) - - print(f'Removed {remove_ticker} stitched loci because they overlapped multiple TSSs') - print(f'Added back {original_ticker} original loci') - fixed_collection = LocusCollection(fixed_loci, 50) - return fixed_collection - else: - return stitched_collection - - -# ================================================================== -# ==========================I/O FUNCTIONS=========================== -# ================================================================== - -# unparse_table 4/14/08 -# takes in a table generated by parse_table and writes it to an output file -# takes as parameters (table, output, sep), where sep is how the file is delimited -# example call unparse_table(table, 'table.txt', '\t') for a tab del file - -def unparse_table(table, output, sep): - fh_out = open(output, 'w') - if len(sep) == 0: - for i in table: - fh_out.write(str(i)) - fh_out.write('\\n') - else: - for line in table: - line = [str(x) for x in line] - line = sep.join(line) - - fh_out.write(line) - fh_out.write('\\n') - - fh_out.close() - - -# parse_table 4/14/08 -# takes in a table where columns are separated by a given symbol and outputs -# a nested list such that list[row][col] -# example call: -# table = parse_table('file.txt','\t') -def parse_table(fn, sep, header=False, excel=False): - fh = open(fn) - lines = fh.readlines() - fh.close() - if excel: - lines = lines[0].split('\\r') - if lines[0].count('\\r') > 0: - lines = lines[0].split('\\r') - table = [] - if header: - lines = lines[1:] - for i in lines: - table.append(i[:-1].split(sep)) - - return table - - -def format_folder(folder_name, create=False): - """ - makes sure a folder exists and if not makes it - returns a bool for folder - """ - - if folder_name[-1] != '/': - folder_name += '/' - - try: - foo = os.listdir(folder_name) - return folder_name - except OSError: - print(f'folder {folder_name} does not exist') - if create: - os.system(f'mkdir {folder_name}') - return folder_name - else: - - return False - - # ================================================================== - - -# ===================ANNOTATION FUNCTIONS=========================== -# ================================================================== - - -def make_start_dict(annot_file): - transcripts = [] - - genepred_table, genepred_dict = import_genepred(annot_file) - if len(transcripts) == 0: - transcripts = list(genepred_dict.keys()) - start_dict = {} - for transcript in transcripts: - if transcript not in genepred_dict: - continue - start_dict[transcript] = {} - start_dict[transcript]['sense'] = genepred_table[genepred_dict[transcript][0]][2] - start_dict[transcript]['chr'] = genepred_table[genepred_dict[transcript][0]][1] - start_dict[transcript]['start'] = get_tsss([transcript], genepred_table, genepred_dict) - if start_dict[transcript]['sense'] == '+': - start_dict[transcript]['end'] = [int(genepred_table[genepred_dict[transcript][0]][4])] - else: - start_dict[transcript]['end'] = [int(genepred_table[genepred_dict[transcript][0]][3])] - start_dict[transcript]['name'] = genepred_table[genepred_dict[transcript][0]][11] - - return start_dict - - -# generic function to get the TSS of any gene -def get_tsss(gene_list, genepred_table, genepred_dict): - if len(gene_list) == 0: - genepred = genepred_table - else: - genepred 
= genepred_from_key(gene_list, genepred_dict, genepred_table) - tss = [] - for line in genepred: - if line[2] == '+': - tss.append(line[3]) - if line[2] == '-': - tss.append(line[4]) - tss = list(map(int, tss)) - - return tss - - -# 12/29/08 -# genepred_from_key(genepredKeyList,genepred_dict,genepred_table) -# function that grabs genepred lines from genepred IDs -def genepred_from_key(genepred_key_list, genepred_dict, genepred_table): - type_genepred = [] - for name in genepred_key_list: - if name in genepred_dict: - type_genepred.append(genepred_table[genepred_dict[name][0]]) - return type_genepred - - - -def import_genepred(genepred_file, return_multiples=False): - genepred_table = parse_table(genepred_file, '\\t') - genepred_dict = {} - ticker = 0 - for line in genepred_table: - transcript = line[0] - if transcript in genepred_dict: - genepred_dict[transcript].append(ticker) - else: - genepred_dict[transcript] = [ticker] - ticker = ticker + 1 - - multiples = [] - for i in genepred_dict: - if len(genepred_dict[i]) > 1: - multiples.append(i) - - if return_multiples: - return genepred_table, genepred_dict, multiples - else: - return genepred_table, genepred_dict - - -# ================================================================== -# ========================LOCUS INSTANCE============================ -# ================================================================== - -# Locus and LocusCollection instances courtesy of Graham Ruby - - -class Locus: - # this may save some space by reducing the number of chromosome strings - # that are associated with Locus instances (see __init__). - __chrDict = dict() - __senseDict = {'+': '+', '-': '-', '.': '.'} - - # chr = chromosome name (string) - # sense = '+' or '-' (or '.' for an ambidextrous locus) - # start,end = ints of the start and end coords of the locus - # end coord is the coord of the last nucleotide. - def __init__(self, chr, start, end, sense, id='', score=0): - coords = [int(start), int(end)] - coords.sort() - # this method for assigning chromosome should help avoid storage of - # redundant strings. - if chr not in self.__chrDict: - self.__chrDict[chr] = chr - self._chr = self.__chrDict[chr] - self._sense = self.__senseDict[sense] - self._start = int(coords[0]) - self._end = int(coords[1]) - self._id = id - self._score = score - - def id(self): - return self._id - - def chr(self): - return self._chr - - def start(self): - return self._start # returns the smallest coordinate - - def end(self): - return self._end # returns the biggest coordinate - - def len(self): - return self._end - self._start + 1 - - def get_antisense_locus(self): - if self._sense == '.': - return self - else: - switch = {'+': '-', '-': '+'} - return Locus(self._chr, self._start, self._end, switch[self._sense]) - - def coords(self): - return [self._start, self._end] # returns a sorted list of the coordinates - - def sense(self): - return self._sense - - def score(self): - return self._score - - # returns boolean; True if two loci share any coordinates in common - def overlaps(self, other_locus): - if self.chr() != other_locus.chr(): - return False - elif not (self._sense == '.' or other_locus.sense() == '.' 
or self.sense() == other_locus.sense()): - return False - elif self.start() > other_locus.end() or other_locus.start() > self.end(): - return False - else: - return True - - # returns boolean; True if all the nucleotides of the given locus overlap - # with the self locus - def contains(self, other_locus): - if self.chr() != other_locus.chr(): - return False - elif not (self._sense == '.' or other_locus.sense() == '.' or self.sense() == other_locus.sense()): - return False - elif self.start() > other_locus.start() or other_locus.end() > self.end(): - return False - else: - return True - - # same as overlaps, but considers the opposite strand - def overlaps_antisense(self, other_locus): - return self.get_antisense_locus().overlaps(other_locus) - - # same as contains, but considers the opposite strand - def contains_antisense(self, other_locus): - return self.get_antisense_locus().contains(other_locus) - - def __hash__(self): - return self._start + self._end - - def __eq__(self, other): - if self.__class__ != other.__class__: - return False - if self.chr() != other.chr(): - return False - if self.start() != other.start(): - return False - if self.end() != other.end(): - return False - if self.sense() != other.sense(): - return False - return True - - def __ne__(self, other): - return not (self.__eq__(other)) - - def __str__(self): - return self.chr() + '(' + self.sense() + '):' + '-'.join(map(str, self.coords())) - - def check_rep(self): - pass - - -class LocusCollection: - def __init__(self, loci, window_size): - self.__chr_to_coord_to_loci = dict() - self.__loci = dict() - self.__win_size = window_size - for lcs in loci: - self.__add_locus(lcs) - - def __add_locus(self, lcs): - if lcs not in self.__loci: - self.__loci[lcs] = None - if lcs.sense() == '.': - chr_key_list = [lcs.chr() + '+', lcs.chr() + '-'] - else: - chr_key_list = [lcs.chr() + lcs.sense()] - for chr_key in chr_key_list: - if chr_key not in self.__chr_to_coord_to_loci: - self.__chr_to_coord_to_loci[chr_key] = dict() - for n in self.__get_key_range(lcs): - if n not in self.__chr_to_coord_to_loci[chr_key]: - self.__chr_to_coord_to_loci[chr_key][n] = [] - self.__chr_to_coord_to_loci[chr_key][n].append(lcs) - - def __get_key_range(self, locus): - start = locus.start() // self.__win_size - # add 1 because of the range - end = locus.end() // self.__win_size + 1 - return range(start, end) - - def __len__(self): - return len(self.__loci) - - def append(self, new): - self.__add_locus(new) - - def extend(self, new_list): - for lcs in new_list: - self.__add_locus(lcs) - - def has_locus(self, locus): - return locus in self.__loci - - def remove(self, old): - if old not in self.__loci: - raise ValueError("requested locus isn't in collection") - del self.__loci[old] - if old.sense() == '.': - sense_list = ['+', '-'] - else: - sense_list = [old.sense()] - for k in self.__get_key_range(old): - for sense in sense_list: - self.__chr_to_coord_to_loci[old.chr() + sense][k].remove(old) - - def get_window_size(self): - return self.__win_size - - def get_loci(self): - return list(self.__loci.keys()) - - def get_chr_list(self): - # i need to remove the strand info from the chromosome keys and make - # them non-redundant. 
- temp_keys = dict() - for k in list(self.__chr_to_coord_to_loci.keys()): - temp_keys[k[:-1]] = None - return list(temp_keys.keys()) - - def __subset_helper(self, locus, sense): - sense = sense.lower() - if ['sense', 'antisense', 'both'].count(sense) != 1: - raise ValueError("sense command invalid: '" + sense + "'.") - matches = dict() - senses = ['+', '-'] - if locus.sense() == '.' or sense == 'both': - lamb = lambda s: True - elif sense == 'sense': - lamb = lambda s: s == locus.sense() - elif sense == 'antisense': - lamb = lambda s: s != locus.sense() - else: - raise ValueError("sense value was inappropriate: '" + sense + "'.") - for s in filter(lamb, senses): - chr_key = locus.chr() + s - if chr_key in self.__chr_to_coord_to_loci: - for n in self.__get_key_range(locus): - if n in self.__chr_to_coord_to_loci[chr_key]: - for lcs in self.__chr_to_coord_to_loci[chr_key][n]: - matches[lcs] = None - return list(matches.keys()) - - # sense can be 'sense' (default), 'antisense', or 'both' - # returns all members of the collection that overlap the locus - def get_overlap(self, locus, sense='sense'): - matches = self.__subset_helper(locus, sense) - # now, get rid of the ones that don't really overlap - real_matches = dict() - if sense == 'sense' or sense == 'both': - for i in [lcs for lcs in matches if lcs.overlaps(locus)]: - real_matches[i] = None - if sense == 'antisense' or sense == 'both': - for i in [lcs for lcs in matches if lcs.overlaps_antisense(locus)]: - real_matches[i] = None - return list(real_matches.keys()) - - # sense can be 'sense' (default), 'antisense', or 'both' - # returns all members of the collection that are contained by the locus - def get_contained(self, locus, sense='sense'): - matches = self.__subset_helper(locus, sense) - # now, get rid of the ones that don't really overlap - real_matches = dict() - if sense == 'sense' or sense == 'both': - for i in [lcs for lcs in matches if locus.contains(lcs)]: - real_matches[i] = None - if sense == 'antisense' or sense == 'both': - for i in [lcs for lcs in matches if locus.contains_antisense(lcs)]: - real_matches[i] = None - return list(real_matches.keys()) - - # sense can be 'sense' (default), 'antisense', or 'both' - # returns all members of the collection that contain the locus - def get_containers(self, locus, sense='sense'): - matches = self.__subset_helper(locus, sense) - # now, get rid of the ones that don't really overlap - real_matches = dict() - if sense == 'sense' or sense == 'both': - for i in [lcs for lcs in matches if lcs.contains(locus)]: - real_matches[i] = None - if sense == 'antisense' or sense == 'both': - for i in [lcs for lcs in matches if lcs.contains_antisense(locus)]: - real_matches[i] = None - return list(real_matches.keys()) - - def stitch_collection(self, stitch_window=1, sense='both'): - - """ - reduces the collection by stitching together overlapping loci - returns a new collection - """ - - # initializing stitch_window to 1 - # this helps collect directly adjacent loci - - locus_list = self.get_loci() - old_collection = LocusCollection(locus_list, 500) - - stitched_collection = LocusCollection([], 500) - - for locus in locus_list: - if old_collection.has_locus(locus): - old_collection.remove(locus) - overlapping_loci = old_collection.get_overlap( - Locus(locus.chr(), locus.start() - stitch_window, locus.end() + stitch_window, locus.sense(), - locus.id()), sense) - - stitch_ticker = 1 - while len(overlapping_loci) > 0: - stitch_ticker += len(overlapping_loci) - overlap_coords = locus.coords() - - for 
overlapping_locus in overlapping_loci:
-                        overlap_coords += overlapping_locus.coords()
-                        old_collection.remove(overlapping_locus)
-                    if sense == 'both':
-                        locus = Locus(locus.chr(), min(overlap_coords), max(overlap_coords), '.', locus.id())
-                    else:
-                        locus = Locus(locus.chr(), min(overlap_coords), max(overlap_coords), locus.sense(), locus.id())
-                    overlapping_loci = old_collection.get_overlap(
-                        Locus(locus.chr(), locus.start() - stitch_window, locus.end() + stitch_window, locus.sense()),
-                        sense)
-                locus._id = f'{stitch_ticker}_{locus.id()}_lociStitched'
-
-                stitched_collection.append(locus)
-
-            else:
-                continue
-        return stitched_collection
-
-
-# ==================================================================
-# ========================LOCUS FUNCTIONS===========================
-# ==================================================================
-# 06/11/09
-# turns a locusCollection into a bed
-# does not write to disk though
-def locus_collection_to_bed(locus_collection):
-    loci_list = locus_collection.get_loci()
-    bed = []
-    for locus in loci_list:
-        new_line = [locus.chr(), locus.coords()[0], locus.coords()[1], locus.id(), locus.score(), locus.sense()]
-        bed.append(new_line)
-    return bed
-
-
-def bed_to_locus_collection(bed, window=500):
-    """
-    opens up a bed file and turns it into a LocusCollection instance
-    """
-
-    loci_list = [Locus(line[0], line[1], line[2], line[5], line[3])
-                 for line in parse_table(bed, '\\t')]
-
-    return LocusCollection(loci_list, window)
-
-
-def make_tss_locus(gene, start_dict, upstream, downstream):
-    """
-    given a start_dict, make a locus for any gene's TSS w/ upstream and downstream windows
-    """
-
-    start = start_dict[gene]['start'][0]
-    if start_dict[gene]['sense'] == '-':
-        return Locus(start_dict[gene]['chr'], start - downstream, start + upstream, '-', gene)
-    else:
-        return Locus(start_dict[gene]['chr'], start - upstream, start + downstream, '+', gene)
-
-
-# ==================================================================
-# ========================MISC FUNCTIONS============================
-# ==================================================================
-
-
-# uniquify function
-# by Peter Bengtsson
-# Used under a creative commons license
-# sourced from here: http://www.peterbe.com/plog/uniqifiers-benchmark
-
-def uniquify(seq, idfun=None):
-    # order preserving
-    if idfun is None:
-        def idfun(x): return x
-    seen = {}
-    result = []
-    for item in seq:
-        marker = idfun(item)
-        # in old Python versions:
-        # if seen.has_key(marker)
-        # but in new ones:
-        if marker in seen: continue
-        seen[marker] = 1
-        result.append(item)
-    return result
-
-
-start_dict = make_start_dict("$genepred")
-locus_collection = bed_to_locus_collection("$bed")
-stitched_collection = region_stitching(locus_collection, int("$stitch"), int("$tss_dist"), start_dict)
-stitched = locus_collection_to_bed(stitched_collection)
-unparse_table(stitched, "${meta.id}.rose.bed", '\\t')
-
-# Create version file
-versions = {
-    "${task.process}" : {
-        "python": platform.python_version()
-    }
-}
-
-with open("versions.yml", "w") as f:
-    f.write(format_yaml_like(versions))
diff --git a/nextflow.config b/nextflow.config
index 2601f5d..cc01f1d 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -27,7 +27,7 @@ params {
     expression_aggregation = 'mean'
     affinity_aggregation = 'max'
     chromhmm_states = 10
-    chromhmm_threshold = 0.9
+    chromhmm_threshold = 0.75
     chromhmm_enhancer_marks = 'H3K27ac,H3K4me1'
     chromhmm_promoter_marks = 'H3K4me3'
 
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 65d6a74..a23b893 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -131,7 +131,7 @@
         },
         "chromhmm_threshold": {
             "type": "number",
-            "default": 0.9,
+            "default": 0.75,
            "description": "Threshold for ChromHMM enhancer detection.",
             "fa_icon": "fas fa-compress-arrows-alt",
-            "help_text": "Threshold for ChromHMM enhancer detection. The default value is 0.9."
+            "help_text": "Threshold for ChromHMM enhancer detection. The default value is 0.75."
diff --git a/subworkflows/local/peaks.nf b/subworkflows/local/peaks.nf
index e00482b..07abc6c 100644
--- a/subworkflows/local/peaks.nf
+++ b/subworkflows/local/peaks.nf
@@ -62,12 +62,12 @@ workflow PEAKS {
     }
 
     CHROMHMM(ch_samplesheet_bam, chrom_sizes, chromhmm_states, chromhmm_threshold, chromhmm_enhancer_marks, chromhmm_promoter_marks)
-    ROSE(CHROMHMM.out.enhancers, gtf)
+    ROSE(CHROMHMM.out.enhancers.mix(CHROMHMM.out.promoters), gtf, chrom_sizes)
 
     ch_versions = ch_versions.mix(CHROMHMM.out.versions)
     ch_versions = ch_versions.mix(ROSE.out.versions)
 
-    ch_peaks = ch_peaks .mix(ROSE.out.enhancers)
+    ch_peaks = ch_peaks .mix(ROSE.out.stitched)
         .map { meta, peaks -> [[
             id: meta.id,
             condition: meta.condition,
@@ -138,7 +138,7 @@ workflow PEAKS {
     emit:
     affinity_ratio = AFFINITY_RATIO.out.combined
     affinity_sum = AFFINITY_SUM.out.combined
-    enhancers = ROSE.out.enhancers
+    tf_bindings = ROSE.out.stitched
 
     versions = ch_versions // channel: [ versions.yml ]
 }
diff --git a/subworkflows/local/rose.nf b/subworkflows/local/rose.nf
index b7af511..c6aa8f8 100644
--- a/subworkflows/local/rose.nf
+++ b/subworkflows/local/rose.nf
@@ -1,23 +1,102 @@
-include { ROSE as RUN_ROSE } from "../../modules/local/rose"
-include { UCSC_GTFTOGENEPRED } from "../../modules/nf-core/ucsc/gtftogenepred"
+include { GAWK as FILTER_CONVERT_GTF } from '../../modules/nf-core/gawk'
+include { GNU_SORT as SORT_BED } from '../../modules/nf-core/gnu/sort'
+include { BEDTOOLS_SLOP as CONSTRUCT_TSS } from '../../modules/nf-core/bedtools/slop'
+include { BEDTOOLS_SUBTRACT as FILTER_PREDICTIONS } from '../../modules/nf-core/bedtools/subtract'
+include { BEDTOOLS_COMPLEMENT as INVERT_TSS } from '../../modules/nf-core/bedtools/complement'
+include { BEDTOOLS_MERGE as STITCHING } from '../../modules/nf-core/bedtools/merge'
+include { BEDTOOLS_INTERSECT as TSS_OVERLAP } from '../../modules/nf-core/bedtools/intersect'
+include { GAWK as FILTER_OVERLAPS } from '../../modules/nf-core/gawk'
+include { BEDTOOLS_SUBTRACT as SUBTRACT_OVERLAPS } from '../../modules/nf-core/bedtools/subtract'
+include { BEDTOOLS_INTERSECT as UNSTITCHED_REGIONS } from '../../modules/nf-core/bedtools/intersect'
+include { GNU_SORT as CONCAT_AND_SORT } from '../../modules/nf-core/gnu/sort'
 
 workflow ROSE {
     take:
     ch_bed
     ch_gtf
+    chrom_sizes
 
     main:
     ch_versions = Channel.empty()
 
-    UCSC_GTFTOGENEPRED(ch_gtf)
-    RUN_ROSE(ch_bed, UCSC_GTFTOGENEPRED.out.genepred)
+    // Convert GTF transcripts to BED, collapsing each region to a single base pair at its start position
+    FILTER_CONVERT_GTF(ch_gtf, [])
 
-    ch_versions = ch_versions.mix(RUN_ROSE.out.versions)
-    ch_versions = ch_versions.mix(UCSC_GTFTOGENEPRED.out.versions)
+    // Downstream methods require sorted inputs
+    SORT_BED(FILTER_CONVERT_GTF.out.output)
+
+    // Construct a 5000 bp window (2500 bp on each side) around every transcription start site (TSS)
+    CONSTRUCT_TSS(SORT_BED.out.sorted, chrom_sizes.map{meta, file -> file})
+
+    INVERT_TSS(CONSTRUCT_TSS.out.bed, chrom_sizes.map{meta, file -> file})
+
+    ch_enhancers = ch_bed
+        .filter{meta, bed -> meta.assay.contains("enhancers")}
+        .combine(CONSTRUCT_TSS.out.bed)
+        .map{meta1, pred, meta2, tss -> [meta1, pred, tss]}
+
+    ch_promoters = ch_bed
+        .filter{meta, bed -> meta.assay.contains("promoters")}
+        .combine(INVERT_TSS.out.bed)
+        .map{meta1, pred, meta2, non_tss -> [meta1, pred, non_tss]}
+
+    // Remove predictions fully contained within a TSS window
+    FILTER_PREDICTIONS(ch_enhancers.mix(ch_promoters))
+
+    // Stitch together regions within 12500 bp of each other
+    STITCHING(FILTER_PREDICTIONS.out.bed)
+
+    // Count how many TSS windows each stitched region overlaps
+    ch_tss_overlap = STITCHING.out.bed
+        .combine(CONSTRUCT_TSS.out.bed)
+        .map{meta1, stitched, meta2, tss -> [meta1, stitched, tss]}
+
+    TSS_OVERLAP(ch_tss_overlap, [[], []])
+
+    // Keep only the regions that overlap at least two TSSs
+    FILTER_OVERLAPS(TSS_OVERLAP.out.intersect, [])
+
+    // Remove regions that overlap at least two TSSs from the stitched regions
+    ch_subtract_overlaps = STITCHING.out.bed
+        .combine(FILTER_OVERLAPS.out.output)
+        .filter{meta1, stitched, meta2, overlaps -> meta1.id == meta2.id}
+        .map{meta1, stitched, meta2, overlaps -> [meta1, stitched, overlaps]}
+
+    SUBTRACT_OVERLAPS(ch_subtract_overlaps)
+
+    // Recover the original (pre-stitching) regions of stitched regions that overlap at least two TSSs
+    ch_unstitched_regions = FILTER_OVERLAPS.out.output
+        .combine(ch_bed)
+        .filter{meta1, overlaps, meta2, pred -> meta1.id == meta2.id}
+        .map{meta1, overlaps, meta2, pred -> [meta1, overlaps, pred]}
+
+    UNSTITCHED_REGIONS(ch_unstitched_regions, [[], []])
+
+    // Combine the correctly stitched regions (overlapping fewer than two TSSs) with the recovered original regions and sort
+    ch_concat_and_sort = SUBTRACT_OVERLAPS.out.bed
+        .combine(UNSTITCHED_REGIONS.out.intersect)
+        .filter{meta1, stitched, meta2, unstitched -> meta1.id == meta2.id}
+        .map{meta1, stitched, meta2, unstitched -> [meta1, [stitched, unstitched]]}
+
+    CONCAT_AND_SORT(ch_concat_and_sort)
+
+    ch_versions = ch_versions.mix(
+        FILTER_CONVERT_GTF.out.versions,
+        SORT_BED.out.versions,
+        CONSTRUCT_TSS.out.versions,
+        INVERT_TSS.out.versions,
+        FILTER_PREDICTIONS.out.versions,
+        STITCHING.out.versions,
+        TSS_OVERLAP.out.versions,
+        FILTER_OVERLAPS.out.versions,
+        SUBTRACT_OVERLAPS.out.versions,
+        UNSTITCHED_REGIONS.out.versions,
+        CONCAT_AND_SORT.out.versions,
+    )
 
     emit:
-    enhancers = RUN_ROSE.out.stitched
+    stitched = CONCAT_AND_SORT.out.sorted
 
     versions = ch_versions
 }
diff --git a/workflows/tfactivity.nf b/workflows/tfactivity.nf
index 1b8b389..ee7d8c4 100644
--- a/workflows/tfactivity.nf
+++ b/workflows/tfactivity.nf
@@ -141,7 +141,7 @@ workflow TFACTIVITY {
     FIMO(
         fasta,
         RANKING.out.tf_total_ranking,
-        PEAKS.out.enhancers,
+        PEAKS.out.tf_bindings,
         MOTIFS.out.meme,
     )
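
For reviewers: a minimal shell sketch of the logic the new subworkflow chains together, not part of the patch itself. It shows only the enhancer branch (promoter predictions are instead filtered against the complement of the TSS windows via INVERT_TSS); the file names (genes.gtf, peaks.bed, chrom.sizes) are hypothetical, and the flags mirror the ext.args set in conf/modules.config.

# Hypothetical inputs: genes.gtf, peaks.bed (sorted enhancer predictions), chrom.sizes
# 1) Collapse each GTF transcript to a 1 bp anchor at its start and sort (FILTER_CONVERT_GTF + SORT_BED)
gawk 'BEGIN {OFS = "\t"} $3 == "transcript" {print $1, $4-1, $4, $1 ":" $4-1 "-" $4, $6, $7}' genes.gtf \
    | sort -k1,1 -k2,2n > tss_1bp.bed

# 2) Extend each anchor by 2500 bp per side into a 5000 bp TSS window (CONSTRUCT_TSS)
bedtools slop -i tss_1bp.bed -g chrom.sizes -b 2500 > tss.bed

# 3) Drop predictions fully contained in a TSS window (FILTER_PREDICTIONS),
#    then stitch regions within 12500 bp of each other (STITCHING)
bedtools subtract -a peaks.bed -b tss.bed -f 1 -A > filtered.bed
bedtools merge -i filtered.bed -d 12500 > stitched.bed

# 4) Flag stitched regions overlapping >= 2 TSS windows (TSS_OVERLAP + FILTER_OVERLAPS),
#    drop them from the stitched set (SUBTRACT_OVERLAPS), and recover their original,
#    unstitched regions instead (UNSTITCHED_REGIONS; -F 1 reports peaks fully inside a flagged region)
bedtools intersect -a stitched.bed -b tss.bed -c \
    | gawk 'BEGIN {OFS = "\t"} $NF >= 2 {print $1, $2, $3}' > multi_tss.bed
bedtools subtract -a stitched.bed -b multi_tss.bed > kept.bed
bedtools intersect -a multi_tss.bed -b peaks.bed -F 1 > originals.bed

# 5) Concatenate and sort (CONCAT_AND_SORT)
cat kept.bed originals.bed | sort -k1,1 -k2,2n > rose_stitched.bed

One behavioural nuance worth flagging: the awk filter unstitches regions overlapping at least two TSS windows, whereas the removed rose.py template only split stitched loci overlapping more than two distinct gene TSSs (len(tss_names) > 2), so the two implementations may produce slightly different region sets.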