From 3db7bd5cf2a39cbfac67f64e220a52e397192c72 Mon Sep 17 00:00:00 2001
From: Leon Hafner
Date: Tue, 11 Jun 2024 11:07:49 +0200
Subject: [PATCH] Replaced ROSE with bedtools workflow

---
 conf/modules.config                  | 51 ++-
 modules/local/rose/main.nf           | 27 --
 modules/local/rose/templates/rose.py | 621 ---------------------------
 nextflow.config                      | 2 +-
 nextflow_schema.json                 | 2 +-
 subworkflows/local/peaks.nf          | 6 +-
 subworkflows/local/rose.nf           | 93 +++-
 workflows/tfactivity.nf              | 2 +-
 8 files changed, 139 insertions(+), 665 deletions(-)
 delete mode 100644 modules/local/rose/main.nf
 delete mode 100755 modules/local/rose/templates/rose.py

diff --git a/conf/modules.config b/conf/modules.config
index 97b7d44..38d5ee7 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -28,6 +28,53 @@ process {
         ext.suffix = "bed"
     }
 
+    withName: FILTER_CONVERT_GTF {
+        ext.args = {"'BEGIN {OFS = \"\\t\"} \$3 == \"transcript\" {print \$1, \$4-1, \$4, \$1 \":\" \$4-1 \"-\" \$4, \$6, \$7}'"}
+        ext.suffix = "bed"
+    }
+
+    withName: SORT_BED {
+        ext.args = "-k1,1 -k2,2n"
+        ext.prefix = {"${meta.id}_sorted"}
+        ext.suffix = "bed"
+    }
+
+    withName: CONSTRUCT_TSS {
+        ext.args = "-b 2500"
+        ext.prefix = "tss"
+    }
+
+    withName: FILTER_PREDICTIONS {
+        ext.args = "-A -f 1"
+        ext.prefix = {"${meta.id}_filtered"}
+    }
+
+    withName: STITCHING {
+        ext.args = "-d 12500"
+        ext.prefix = {"${meta.id}_stitched"}
+    }
+
+    withName: TSS_OVERLAP {
+        ext.args = "-c"
+        ext.prefix = {"${meta.id}_tss-overlap-counts"}
+    }
+
+    withName: FILTER_OVERLAPS {
+        ext.args = {"'BEGIN {OFS = \"\\t\"} \$NF >= 2 {print \$1, \$2, \$3}'"}
+        ext.prefix = {"${meta.id}_overlap"}
+    }
+
+    withName: UNSTITCHED_REGIONS {
+        ext.args = "-F 1"
+        ext.prefix = {"${meta.id}_original_regions"}
+    }
+
+    withName: CONCAT_AND_SORT {
+        ext.args = "-k1,1 -k2,2n"
+        ext.prefix = {"${meta.id}_stitched"}
+        ext.suffix = "bed"
+    }
+
     withName: BEDTOOLS_SORT {
         ext.prefix = {"${meta.id}.sorted"}
     }
@@ -62,10 +109,6 @@ process {
         ext.prefix = {"${meta.id}_control"}
     }
 
-    withName: UCSC_GTFTOGENEPRED {
-        ext.args = "-genePredExt"
-    }
-
     withName: ".*DYNAMITE:FILTER" {
         ext.args = {"'BEGIN{OFS=\"\\t\"} NR==1 || (\$2 >= ${params.dynamite_min_regression} || \$2 <= -${params.dynamite_min_regression} )'"}
         ext.prefix = {"${meta.id}.filtered"}
diff --git a/modules/local/rose/main.nf b/modules/local/rose/main.nf
deleted file mode 100644
index e9c99f3..0000000
--- a/modules/local/rose/main.nf
+++ /dev/null
@@ -1,27 +0,0 @@
-process ROSE {
-    tag "$meta.id"
-    label 'process_single'
-
-    conda "conda-forge::mulled-v2-2076f4a3fb468a04063c9e6b7747a630abb457f6==fccb0c41a243c639e11dd1be7b74f563e624fcca-0"
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
- 'https://depot.galaxyproject.org/singularity/mulled-v2-2076f4a3fb468a04063c9e6b7747a630abb457f6:fccb0c41a243c639e11dd1be7b74f563e624fcca-0': - 'biocontainers/mulled-v2-2076f4a3fb468a04063c9e6b7747a630abb457f6:fccb0c41a243c639e11dd1be7b74f563e624fcca-0' }" - - input: - tuple val(meta), path(bed) - tuple val(meta2), path(genepred) - - output: - tuple val(meta), path("${meta.id}.rose.bed"), emit: stitched - path("versions.yml") , emit: versions - - script: - stitch = 12500 - tss_dist = 2500 - template "rose.py" - - stub: - """ - touch "${meta.id}.rose.bed" - """ -} diff --git a/modules/local/rose/templates/rose.py b/modules/local/rose/templates/rose.py deleted file mode 100755 index 407f825..0000000 --- a/modules/local/rose/templates/rose.py +++ /dev/null @@ -1,621 +0,0 @@ -#!/usr/bin/env python3 - -import os -import platform - -def format_yaml_like(data: dict, indent: int = 0) -> str: - """Formats a dictionary to a YAML-like string. - - Args: - data (dict): The dictionary to format. - indent (int): The current indentation level. - - Returns: - str: A string formatted as YAML. - """ - yaml_str = "" - for key, value in data.items(): - spaces = " " * indent - if isinstance(value, dict): - yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" - else: - yaml_str += f"{spaces}{key}: {value}\\n" - return yaml_str - -def region_stitching(bound_collection, stitch_window, tss_window, start_dict): - print('Performing region stitching...') - - remove_tss = tss_window != 0 - - # filter out all bound regions that overlap the TSS of an ACTIVE GENE - if remove_tss: - # now makeTSS loci for active genes - remove_ticker = 0 - # this loop makes a locus centered around +/- tss_window of transcribed genes - # then adds it to the list tss_loci - tss_loci = [] - for gene_id in list(start_dict.keys()): - tss_loci.append(make_tss_locus(gene_id, start_dict, tss_window, tss_window)) - - # this turns the tss_loci list into a LocusCollection - # 50 is the internal parameter for LocusCollection and doesn't really matter - tss_collection = LocusCollection(tss_loci, 50) - - # gives all the loci in bound_collection - bound_loci = bound_collection.get_loci() - - # this loop will check if each bound region is contained by the TSS exclusion zone - # this will drop out a lot of the promoter only regions that are tiny - # typical exclusion window is around 2kb - for locus in bound_loci: - if len(tss_collection.get_containers(locus, 'both')) > 0: - # if true, the bound locus overlaps an active gene - bound_collection.remove(locus) - remove_ticker += 1 - print(f'Removed {remove_ticker} loci because they were contained by a TSS') - - # bound_collection is now all enriched region loci that don't overlap an active TSS - stitched_collection = bound_collection.stitch_collection(stitch_window, 'both') - - if remove_tss: - # now replace any stitched region that overlap 2 distinct genes - # with the original loci that were there - fixed_loci = [] - tss_loci = [] - for gene_id in list(start_dict.keys()): - tss_loci.append(make_tss_locus(gene_id, start_dict, 50, 50)) - - # this turns the tss_loci list into a LocusCollection - # 50 is the internal parameter for LocusCollection and doesn't really matter - tss_collection = LocusCollection(tss_loci, 50) - remove_ticker = 0 - original_ticker = 0 - for stitched_locus in stitched_collection.get_loci(): - overlapping_tss_loci = tss_collection.get_overlap(stitched_locus, 'both') - tss_names = [start_dict[tssLocus.id()]['name'] for tssLocus in overlapping_tss_loci] - tss_names = 
uniquify(tss_names) - if len(tss_names) > 2: - original_loci = bound_collection.get_overlap(stitched_locus, 'both') - original_ticker += len(original_loci) - fixed_loci += original_loci - remove_ticker += 1 - else: - fixed_loci.append(stitched_locus) - - print(f'Removed {remove_ticker} stitched loci because they overlapped multiple TSSs') - print(f'Added back {original_ticker} original loci') - fixed_collection = LocusCollection(fixed_loci, 50) - return fixed_collection - else: - return stitched_collection - - -# ================================================================== -# ==========================I/O FUNCTIONS=========================== -# ================================================================== - -# unparse_table 4/14/08 -# takes in a table generated by parse_table and writes it to an output file -# takes as parameters (table, output, sep), where sep is how the file is delimited -# example call unparse_table(table, 'table.txt', '\t') for a tab del file - -def unparse_table(table, output, sep): - fh_out = open(output, 'w') - if len(sep) == 0: - for i in table: - fh_out.write(str(i)) - fh_out.write('\\n') - else: - for line in table: - line = [str(x) for x in line] - line = sep.join(line) - - fh_out.write(line) - fh_out.write('\\n') - - fh_out.close() - - -# parse_table 4/14/08 -# takes in a table where columns are separated by a given symbol and outputs -# a nested list such that list[row][col] -# example call: -# table = parse_table('file.txt','\t') -def parse_table(fn, sep, header=False, excel=False): - fh = open(fn) - lines = fh.readlines() - fh.close() - if excel: - lines = lines[0].split('\\r') - if lines[0].count('\\r') > 0: - lines = lines[0].split('\\r') - table = [] - if header: - lines = lines[1:] - for i in lines: - table.append(i[:-1].split(sep)) - - return table - - -def format_folder(folder_name, create=False): - """ - makes sure a folder exists and if not makes it - returns a bool for folder - """ - - if folder_name[-1] != '/': - folder_name += '/' - - try: - foo = os.listdir(folder_name) - return folder_name - except OSError: - print(f'folder {folder_name} does not exist') - if create: - os.system(f'mkdir {folder_name}') - return folder_name - else: - - return False - - # ================================================================== - - -# ===================ANNOTATION FUNCTIONS=========================== -# ================================================================== - - -def make_start_dict(annot_file): - transcripts = [] - - genepred_table, genepred_dict = import_genepred(annot_file) - if len(transcripts) == 0: - transcripts = list(genepred_dict.keys()) - start_dict = {} - for transcript in transcripts: - if transcript not in genepred_dict: - continue - start_dict[transcript] = {} - start_dict[transcript]['sense'] = genepred_table[genepred_dict[transcript][0]][2] - start_dict[transcript]['chr'] = genepred_table[genepred_dict[transcript][0]][1] - start_dict[transcript]['start'] = get_tsss([transcript], genepred_table, genepred_dict) - if start_dict[transcript]['sense'] == '+': - start_dict[transcript]['end'] = [int(genepred_table[genepred_dict[transcript][0]][4])] - else: - start_dict[transcript]['end'] = [int(genepred_table[genepred_dict[transcript][0]][3])] - start_dict[transcript]['name'] = genepred_table[genepred_dict[transcript][0]][11] - - return start_dict - - -# generic function to get the TSS of any gene -def get_tsss(gene_list, genepred_table, genepred_dict): - if len(gene_list) == 0: - genepred = genepred_table - else: - genepred 
= genepred_from_key(gene_list, genepred_dict, genepred_table) - tss = [] - for line in genepred: - if line[2] == '+': - tss.append(line[3]) - if line[2] == '-': - tss.append(line[4]) - tss = list(map(int, tss)) - - return tss - - -# 12/29/08 -# genepred_from_key(genepredKeyList,genepred_dict,genepred_table) -# function that grabs genepred lines from genepred IDs -def genepred_from_key(genepred_key_list, genepred_dict, genepred_table): - type_genepred = [] - for name in genepred_key_list: - if name in genepred_dict: - type_genepred.append(genepred_table[genepred_dict[name][0]]) - return type_genepred - - - -def import_genepred(genepred_file, return_multiples=False): - genepred_table = parse_table(genepred_file, '\\t') - genepred_dict = {} - ticker = 0 - for line in genepred_table: - transcript = line[0] - if transcript in genepred_dict: - genepred_dict[transcript].append(ticker) - else: - genepred_dict[transcript] = [ticker] - ticker = ticker + 1 - - multiples = [] - for i in genepred_dict: - if len(genepred_dict[i]) > 1: - multiples.append(i) - - if return_multiples: - return genepred_table, genepred_dict, multiples - else: - return genepred_table, genepred_dict - - -# ================================================================== -# ========================LOCUS INSTANCE============================ -# ================================================================== - -# Locus and LocusCollection instances courtesy of Graham Ruby - - -class Locus: - # this may save some space by reducing the number of chromosome strings - # that are associated with Locus instances (see __init__). - __chrDict = dict() - __senseDict = {'+': '+', '-': '-', '.': '.'} - - # chr = chromosome name (string) - # sense = '+' or '-' (or '.' for an ambidextrous locus) - # start,end = ints of the start and end coords of the locus - # end coord is the coord of the last nucleotide. - def __init__(self, chr, start, end, sense, id='', score=0): - coords = [int(start), int(end)] - coords.sort() - # this method for assigning chromosome should help avoid storage of - # redundant strings. - if chr not in self.__chrDict: - self.__chrDict[chr] = chr - self._chr = self.__chrDict[chr] - self._sense = self.__senseDict[sense] - self._start = int(coords[0]) - self._end = int(coords[1]) - self._id = id - self._score = score - - def id(self): - return self._id - - def chr(self): - return self._chr - - def start(self): - return self._start # returns the smallest coordinate - - def end(self): - return self._end # returns the biggest coordinate - - def len(self): - return self._end - self._start + 1 - - def get_antisense_locus(self): - if self._sense == '.': - return self - else: - switch = {'+': '-', '-': '+'} - return Locus(self._chr, self._start, self._end, switch[self._sense]) - - def coords(self): - return [self._start, self._end] # returns a sorted list of the coordinates - - def sense(self): - return self._sense - - def score(self): - return self._score - - # returns boolean; True if two loci share any coordinates in common - def overlaps(self, other_locus): - if self.chr() != other_locus.chr(): - return False - elif not (self._sense == '.' or other_locus.sense() == '.' 
or self.sense() == other_locus.sense()): - return False - elif self.start() > other_locus.end() or other_locus.start() > self.end(): - return False - else: - return True - - # returns boolean; True if all the nucleotides of the given locus overlap - # with the self locus - def contains(self, other_locus): - if self.chr() != other_locus.chr(): - return False - elif not (self._sense == '.' or other_locus.sense() == '.' or self.sense() == other_locus.sense()): - return False - elif self.start() > other_locus.start() or other_locus.end() > self.end(): - return False - else: - return True - - # same as overlaps, but considers the opposite strand - def overlaps_antisense(self, other_locus): - return self.get_antisense_locus().overlaps(other_locus) - - # same as contains, but considers the opposite strand - def contains_antisense(self, other_locus): - return self.get_antisense_locus().contains(other_locus) - - def __hash__(self): - return self._start + self._end - - def __eq__(self, other): - if self.__class__ != other.__class__: - return False - if self.chr() != other.chr(): - return False - if self.start() != other.start(): - return False - if self.end() != other.end(): - return False - if self.sense() != other.sense(): - return False - return True - - def __ne__(self, other): - return not (self.__eq__(other)) - - def __str__(self): - return self.chr() + '(' + self.sense() + '):' + '-'.join(map(str, self.coords())) - - def check_rep(self): - pass - - -class LocusCollection: - def __init__(self, loci, window_size): - self.__chr_to_coord_to_loci = dict() - self.__loci = dict() - self.__win_size = window_size - for lcs in loci: - self.__add_locus(lcs) - - def __add_locus(self, lcs): - if lcs not in self.__loci: - self.__loci[lcs] = None - if lcs.sense() == '.': - chr_key_list = [lcs.chr() + '+', lcs.chr() + '-'] - else: - chr_key_list = [lcs.chr() + lcs.sense()] - for chr_key in chr_key_list: - if chr_key not in self.__chr_to_coord_to_loci: - self.__chr_to_coord_to_loci[chr_key] = dict() - for n in self.__get_key_range(lcs): - if n not in self.__chr_to_coord_to_loci[chr_key]: - self.__chr_to_coord_to_loci[chr_key][n] = [] - self.__chr_to_coord_to_loci[chr_key][n].append(lcs) - - def __get_key_range(self, locus): - start = locus.start() // self.__win_size - # add 1 because of the range - end = locus.end() // self.__win_size + 1 - return range(start, end) - - def __len__(self): - return len(self.__loci) - - def append(self, new): - self.__add_locus(new) - - def extend(self, new_list): - for lcs in new_list: - self.__add_locus(lcs) - - def has_locus(self, locus): - return locus in self.__loci - - def remove(self, old): - if old not in self.__loci: - raise ValueError("requested locus isn't in collection") - del self.__loci[old] - if old.sense() == '.': - sense_list = ['+', '-'] - else: - sense_list = [old.sense()] - for k in self.__get_key_range(old): - for sense in sense_list: - self.__chr_to_coord_to_loci[old.chr() + sense][k].remove(old) - - def get_window_size(self): - return self.__win_size - - def get_loci(self): - return list(self.__loci.keys()) - - def get_chr_list(self): - # i need to remove the strand info from the chromosome keys and make - # them non-redundant. 
- temp_keys = dict() - for k in list(self.__chr_to_coord_to_loci.keys()): - temp_keys[k[:-1]] = None - return list(temp_keys.keys()) - - def __subset_helper(self, locus, sense): - sense = sense.lower() - if ['sense', 'antisense', 'both'].count(sense) != 1: - raise ValueError("sense command invalid: '" + sense + "'.") - matches = dict() - senses = ['+', '-'] - if locus.sense() == '.' or sense == 'both': - lamb = lambda s: True - elif sense == 'sense': - lamb = lambda s: s == locus.sense() - elif sense == 'antisense': - lamb = lambda s: s != locus.sense() - else: - raise ValueError("sense value was inappropriate: '" + sense + "'.") - for s in filter(lamb, senses): - chr_key = locus.chr() + s - if chr_key in self.__chr_to_coord_to_loci: - for n in self.__get_key_range(locus): - if n in self.__chr_to_coord_to_loci[chr_key]: - for lcs in self.__chr_to_coord_to_loci[chr_key][n]: - matches[lcs] = None - return list(matches.keys()) - - # sense can be 'sense' (default), 'antisense', or 'both' - # returns all members of the collection that overlap the locus - def get_overlap(self, locus, sense='sense'): - matches = self.__subset_helper(locus, sense) - # now, get rid of the ones that don't really overlap - real_matches = dict() - if sense == 'sense' or sense == 'both': - for i in [lcs for lcs in matches if lcs.overlaps(locus)]: - real_matches[i] = None - if sense == 'antisense' or sense == 'both': - for i in [lcs for lcs in matches if lcs.overlaps_antisense(locus)]: - real_matches[i] = None - return list(real_matches.keys()) - - # sense can be 'sense' (default), 'antisense', or 'both' - # returns all members of the collection that are contained by the locus - def get_contained(self, locus, sense='sense'): - matches = self.__subset_helper(locus, sense) - # now, get rid of the ones that don't really overlap - real_matches = dict() - if sense == 'sense' or sense == 'both': - for i in [lcs for lcs in matches if locus.contains(lcs)]: - real_matches[i] = None - if sense == 'antisense' or sense == 'both': - for i in [lcs for lcs in matches if locus.contains_antisense(lcs)]: - real_matches[i] = None - return list(real_matches.keys()) - - # sense can be 'sense' (default), 'antisense', or 'both' - # returns all members of the collection that contain the locus - def get_containers(self, locus, sense='sense'): - matches = self.__subset_helper(locus, sense) - # now, get rid of the ones that don't really overlap - real_matches = dict() - if sense == 'sense' or sense == 'both': - for i in [lcs for lcs in matches if lcs.contains(locus)]: - real_matches[i] = None - if sense == 'antisense' or sense == 'both': - for i in [lcs for lcs in matches if lcs.contains_antisense(locus)]: - real_matches[i] = None - return list(real_matches.keys()) - - def stitch_collection(self, stitch_window=1, sense='both'): - - """ - reduces the collection by stitching together overlapping loci - returns a new collection - """ - - # initializing stitch_window to 1 - # this helps collect directly adjacent loci - - locus_list = self.get_loci() - old_collection = LocusCollection(locus_list, 500) - - stitched_collection = LocusCollection([], 500) - - for locus in locus_list: - if old_collection.has_locus(locus): - old_collection.remove(locus) - overlapping_loci = old_collection.get_overlap( - Locus(locus.chr(), locus.start() - stitch_window, locus.end() + stitch_window, locus.sense(), - locus.id()), sense) - - stitch_ticker = 1 - while len(overlapping_loci) > 0: - stitch_ticker += len(overlapping_loci) - overlap_coords = locus.coords() - - for 
overlapping_locus in overlapping_loci:
-                        overlap_coords += overlapping_locus.coords()
-                        old_collection.remove(overlapping_locus)
-                    if sense == 'both':
-                        locus = Locus(locus.chr(), min(overlap_coords), max(overlap_coords), '.', locus.id())
-                    else:
-                        locus = Locus(locus.chr(), min(overlap_coords), max(overlap_coords), locus.sense(), locus.id())
-                    overlapping_loci = old_collection.get_overlap(
-                        Locus(locus.chr(), locus.start() - stitch_window, locus.end() + stitch_window, locus.sense()),
-                        sense)
-                locus._id = f'{stitch_ticker}_{locus.id()}_lociStitched'
-
-                stitched_collection.append(locus)
-
-            else:
-                continue
-        return stitched_collection
-
-
-# ==================================================================
-# ========================LOCUS FUNCTIONS===========================
-# ==================================================================
-# 06/11/09
-# turns a locusCollection into a bed
-# does not write to disk though
-def locus_collection_to_bed(locus_collection):
-    loci_list = locus_collection.get_loci()
-    bed = []
-    for locus in loci_list:
-        new_line = [locus.chr(), locus.coords()[0], locus.coords()[1], locus.id(), locus.score(), locus.sense()]
-        bed.append(new_line)
-    return bed
-
-
-def bed_to_locus_collection(bed, window=500):
-    """
-    opens up a bed file and turns it into a LocusCollection instance
-    """
-
-    loci_list = [Locus(line[0], line[1], line[2], line[5], line[3])
-                 for line in parse_table(bed, '\\t')]
-
-    return LocusCollection(loci_list, window)
-
-
-def make_tss_locus(gene, start_dict, upstream, downstream):
-    """
-    given a start_dict, make a locus for any gene's TSS w/ upstream and downstream windows
-    """
-
-    start = start_dict[gene]['start'][0]
-    if start_dict[gene]['sense'] == '-':
-        return Locus(start_dict[gene]['chr'], start - downstream, start + upstream, '-', gene)
-    else:
-        return Locus(start_dict[gene]['chr'], start - upstream, start + downstream, '+', gene)
-
-
-# ==================================================================
-# ========================MISC FUNCTIONS============================
-# ==================================================================
-
-
-# uniquify function
-# by Peter Bengtsson
-# Used under a creative commons license
-# sourced from here: http://www.peterbe.com/plog/uniqifiers-benchmark
-
-def uniquify(seq, idfun=None):
-    # order preserving
-    if idfun is None:
-        def idfun(x): return x
-    seen = {}
-    result = []
-    for item in seq:
-        marker = idfun(item)
-        # in old Python versions:
-        # if seen.has_key(marker)
-        # but in new ones:
-        if marker in seen: continue
-        seen[marker] = 1
-        result.append(item)
-    return result
-
-
-start_dict = make_start_dict("$genepred")
-locus_collection = bed_to_locus_collection("$bed")
-stitched_collection = region_stitching(locus_collection, int("$stitch"), int("$tss_dist"), start_dict)
-stitched = locus_collection_to_bed(stitched_collection)
-unparse_table(stitched, "${meta.id}.rose.bed", '\\t')
-
-# Create version file
-versions = {
-    "${task.process}" : {
-        "python": platform.python_version()
-    }
-}
-
-with open("versions.yml", "w") as f:
-    f.write(format_yaml_like(versions))
diff --git a/nextflow.config b/nextflow.config
index 2601f5d..cc01f1d 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -27,7 +27,7 @@ params {
     expression_aggregation = 'mean'
     affinity_aggregation = 'max'
     chromhmm_states = 10
-    chromhmm_threshold = 0.9
+    chromhmm_threshold = 0.75
     chromhmm_enhancer_marks = 'H3K27ac,H3K4me1'
     chromhmm_promoter_marks = 'H3K4me3'
 
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 65d6a74..a23b893 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -131,7 +131,7 @@
         },
         "chromhmm_threshold": {
             "type": "number",
-            "default": 0.9,
+            "default": 0.75,
            "description": "Threshold for ChromHMM enhancer detection.",
             "fa_icon": "fas fa-compress-arrows-alt",
-            "help_text": "Threshold for ChromHMM enhancer detection. The default value is 0.9."
+            "help_text": "Threshold for ChromHMM enhancer detection. The default value is 0.75."
diff --git a/subworkflows/local/peaks.nf b/subworkflows/local/peaks.nf
index e00482b..07abc6c 100644
--- a/subworkflows/local/peaks.nf
+++ b/subworkflows/local/peaks.nf
@@ -62,12 +62,12 @@ workflow PEAKS {
     }
 
     CHROMHMM(ch_samplesheet_bam, chrom_sizes, chromhmm_states, chromhmm_threshold, chromhmm_enhancer_marks, chromhmm_promoter_marks)
-    ROSE(CHROMHMM.out.enhancers, gtf)
+    ROSE(CHROMHMM.out.enhancers.mix(CHROMHMM.out.promoters), gtf, chrom_sizes)
 
     ch_versions = ch_versions.mix(CHROMHMM.out.versions)
     ch_versions = ch_versions.mix(ROSE.out.versions)
 
-    ch_peaks = ch_peaks .mix(ROSE.out.enhancers)
+    ch_peaks = ch_peaks .mix(ROSE.out.stitched)
         .map { meta, peaks -> [[
             id: meta.id,
             condition: meta.condition,
@@ -138,7 +138,7 @@ workflow PEAKS {
     emit:
     affinity_ratio = AFFINITY_RATIO.out.combined
     affinity_sum = AFFINITY_SUM.out.combined
-    enhancers = ROSE.out.enhancers
+    tf_bindings = ROSE.out.stitched
 
     versions = ch_versions // channel: [ versions.yml ]
 }
diff --git a/subworkflows/local/rose.nf b/subworkflows/local/rose.nf
index b7af511..c6aa8f8 100644
--- a/subworkflows/local/rose.nf
+++ b/subworkflows/local/rose.nf
@@ -1,23 +1,102 @@
-include { ROSE as RUN_ROSE } from "../../modules/local/rose"
-include { UCSC_GTFTOGENEPRED } from "../../modules/nf-core/ucsc/gtftogenepred"
+include { GAWK as FILTER_CONVERT_GTF } from '../../modules/nf-core/gawk'
+include { GNU_SORT as SORT_BED } from '../../modules/nf-core/gnu/sort'
+include { BEDTOOLS_SLOP as CONSTRUCT_TSS } from '../../modules/nf-core/bedtools/slop'
+include { BEDTOOLS_SUBTRACT as FILTER_PREDICTIONS } from '../../modules/nf-core/bedtools/subtract'
+include { BEDTOOLS_COMPLEMENT as INVERT_TSS } from '../../modules/nf-core/bedtools/complement'
+include { BEDTOOLS_MERGE as STITCHING } from '../../modules/nf-core/bedtools/merge'
+include { BEDTOOLS_INTERSECT as TSS_OVERLAP } from '../../modules/nf-core/bedtools/intersect'
+include { GAWK as FILTER_OVERLAPS } from '../../modules/nf-core/gawk'
+include { BEDTOOLS_SUBTRACT as SUBTRACT_OVERLAPS } from '../../modules/nf-core/bedtools/subtract'
+include { BEDTOOLS_INTERSECT as UNSTITCHED_REGIONS } from '../../modules/nf-core/bedtools/intersect'
+include { GNU_SORT as CONCAT_AND_SORT } from '../../modules/nf-core/gnu/sort'
 
 workflow ROSE {
     take:
     ch_bed
     ch_gtf
+    chrom_sizes
 
     main:
     ch_versions = Channel.empty()
 
-    UCSC_GTFTOGENEPRED(ch_gtf)
-    RUN_ROSE(ch_bed, UCSC_GTFTOGENEPRED.out.genepred)
+    // Convert GTF transcripts to BED, collapsing each region to a single base pair at its start position
+    FILTER_CONVERT_GTF(ch_gtf, [])
 
-    ch_versions = ch_versions.mix(RUN_ROSE.out.versions)
-    ch_versions = ch_versions.mix(UCSC_GTFTOGENEPRED.out.versions)
+    // Downstream methods require sorted inputs
+    SORT_BED(FILTER_CONVERT_GTF.out.output)
+
+    // Construct a 5000 bp window (2500 bp on each side) around every transcription start site (TSS)
+    CONSTRUCT_TSS(SORT_BED.out.sorted, chrom_sizes.map{meta, file -> file})
+
+    INVERT_TSS(CONSTRUCT_TSS.out.bed, chrom_sizes.map{meta, file -> file})
+
+    ch_enhancers = ch_bed
+        .filter{meta, bed -> meta.assay.contains("enhancers")}
+        .combine(CONSTRUCT_TSS.out.bed)
+        .map{meta1, pred, meta2, tss -> [meta1, pred, tss]}
+
+    ch_promoters = ch_bed
+        .filter{meta, bed -> meta.assay.contains("promoters")}
+        .combine(INVERT_TSS.out.bed)
+        .map{meta1, pred, meta2, non_tss -> [meta1, pred, non_tss]}
+
+    // Remove predictions fully contained within a TSS window
+    FILTER_PREDICTIONS(ch_enhancers.mix(ch_promoters))
+
+    // Stitch together regions within 12500 bp of each other
+    STITCHING(FILTER_PREDICTIONS.out.bed)
+
+    // Count how many TSS windows each stitched region overlaps
+    ch_tss_overlap = STITCHING.out.bed
+        .combine(CONSTRUCT_TSS.out.bed)
+        .map{meta1, stitched, meta2, tss -> [meta1, stitched, tss]}
+
+    TSS_OVERLAP(ch_tss_overlap, [[], []])
+
+    // Keep only the regions that overlap at least two TSSs
+    FILTER_OVERLAPS(TSS_OVERLAP.out.intersect, [])
+
+    // Remove regions that overlap at least two TSSs from the stitched regions
+    ch_subtract_overlaps = STITCHING.out.bed
+        .combine(FILTER_OVERLAPS.out.output)
+        .filter{meta1, stitched, meta2, overlaps -> meta1.id == meta2.id}
+        .map{meta1, stitched, meta2, overlaps -> [meta1, stitched, overlaps]}
+
+    SUBTRACT_OVERLAPS(ch_subtract_overlaps)
+
+    // Recover the original (pre-stitching) regions of stitched regions that overlap at least two TSSs
+    ch_unstitched_regions = FILTER_OVERLAPS.out.output
+        .combine(ch_bed)
+        .filter{meta1, overlaps, meta2, pred -> meta1.id == meta2.id}
+        .map{meta1, overlaps, meta2, pred -> [meta1, overlaps, pred]}
+
+    UNSTITCHED_REGIONS(ch_unstitched_regions, [[], []])
+
+    // Combine the correctly stitched regions (overlapping fewer than two TSSs) with the recovered original regions and sort
+    ch_concat_and_sort = SUBTRACT_OVERLAPS.out.bed
+        .combine(UNSTITCHED_REGIONS.out.intersect)
+        .filter{meta1, stitched, meta2, unstitched -> meta1.id == meta2.id}
+        .map{meta1, stitched, meta2, unstitched -> [meta1, [stitched, unstitched]]}
+
+    CONCAT_AND_SORT(ch_concat_and_sort)
+
+    ch_versions = ch_versions.mix(
+        FILTER_CONVERT_GTF.out.versions,
+        SORT_BED.out.versions,
+        CONSTRUCT_TSS.out.versions,
+        INVERT_TSS.out.versions,
+        FILTER_PREDICTIONS.out.versions,
+        STITCHING.out.versions,
+        TSS_OVERLAP.out.versions,
+        FILTER_OVERLAPS.out.versions,
+        SUBTRACT_OVERLAPS.out.versions,
+        UNSTITCHED_REGIONS.out.versions,
+        CONCAT_AND_SORT.out.versions,
+    )
 
     emit:
-    enhancers = RUN_ROSE.out.stitched
+    stitched = CONCAT_AND_SORT.out.sorted
 
     versions = ch_versions
 }
diff --git a/workflows/tfactivity.nf b/workflows/tfactivity.nf
index 1b8b389..ee7d8c4 100644
--- a/workflows/tfactivity.nf
+++ b/workflows/tfactivity.nf
@@ -141,7 +141,7 @@ workflow TFACTIVITY {
     FIMO(
         fasta,
         RANKING.out.tf_total_ranking,
-        PEAKS.out.enhancers,
+        PEAKS.out.tf_bindings,
         MOTIFS.out.meme,
     )
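
For reviewers: a minimal shell sketch of the logic the new subworkflow chains together, not part of the patch itself. It shows only the enhancer branch (promoter predictions are instead filtered against the complement of the TSS windows via INVERT_TSS); the file names (genes.gtf, peaks.bed, chrom.sizes) are hypothetical, and the flags mirror the ext.args set in conf/modules.config.

# Hypothetical inputs: genes.gtf, peaks.bed (sorted enhancer predictions), chrom.sizes
# 1) Collapse each GTF transcript to a 1 bp anchor at its start and sort (FILTER_CONVERT_GTF + SORT_BED)
gawk 'BEGIN {OFS = "\t"} $3 == "transcript" {print $1, $4-1, $4, $1 ":" $4-1 "-" $4, $6, $7}' genes.gtf \
    | sort -k1,1 -k2,2n > tss_1bp.bed

# 2) Extend each anchor by 2500 bp per side into a 5000 bp TSS window (CONSTRUCT_TSS)
bedtools slop -i tss_1bp.bed -g chrom.sizes -b 2500 > tss.bed

# 3) Drop predictions fully contained in a TSS window (FILTER_PREDICTIONS),
#    then stitch regions within 12500 bp of each other (STITCHING)
bedtools subtract -a peaks.bed -b tss.bed -f 1 -A > filtered.bed
bedtools merge -i filtered.bed -d 12500 > stitched.bed

# 4) Flag stitched regions overlapping >= 2 TSS windows (TSS_OVERLAP + FILTER_OVERLAPS),
#    drop them from the stitched set (SUBTRACT_OVERLAPS), and recover their original,
#    unstitched regions instead (UNSTITCHED_REGIONS; -F 1 reports peaks fully inside a flagged region)
bedtools intersect -a stitched.bed -b tss.bed -c \
    | gawk 'BEGIN {OFS = "\t"} $NF >= 2 {print $1, $2, $3}' > multi_tss.bed
bedtools subtract -a stitched.bed -b multi_tss.bed > kept.bed
bedtools intersect -a multi_tss.bed -b peaks.bed -F 1 > originals.bed

# 5) Concatenate and sort (CONCAT_AND_SORT)
cat kept.bed originals.bed | sort -k1,1 -k2,2n > rose_stitched.bed

One behavioural nuance worth flagging: the awk filter unstitches regions overlapping at least two TSS windows, whereas the removed rose.py template only split stitched loci overlapping more than two distinct gene TSSs (len(tss_names) > 2), so the two implementations may produce slightly different region sets.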