From 2e5837c1420da880d725f422f66966f6d5efd1f2 Mon Sep 17 00:00:00 2001 From: Tao Liu Date: Thu, 10 Oct 2024 17:17:23 -0400 Subject: [PATCH] rewrite PeakIO.py, PairedEndTrack.py; --- MACS3/IO/{PeakIO.pyx => PeakIO.py} | 1010 +++++++++++++++++----------- MACS3/Signal/CallPeakUnit.pyx | 4 +- MACS3/Signal/PairedEndTrack.py | 730 ++++++++++++++++++++ MACS3/Signal/PairedEndTrack.pyx | 584 ---------------- MACS3/Signal/ScoreTrack.pyx | 4 +- setup.py | 4 +- 6 files changed, 1350 insertions(+), 986 deletions(-) rename MACS3/IO/{PeakIO.pyx => PeakIO.py} (60%) create mode 100644 MACS3/Signal/PairedEndTrack.py delete mode 100644 MACS3/Signal/PairedEndTrack.pyx diff --git a/MACS3/IO/PeakIO.pyx b/MACS3/IO/PeakIO.py similarity index 60% rename from MACS3/IO/PeakIO.pyx rename to MACS3/IO/PeakIO.py index e959db25..433dafbf 100644 --- a/MACS3/IO/PeakIO.pyx +++ b/MACS3/IO/PeakIO.py @@ -1,6 +1,6 @@ # cython: language_level=3 # cython: profile=True -# Time-stamp: <2024-09-06 14:56:51 Tao Liu> +# Time-stamp: <2024-10-10 17:00:18 Tao Liu> """Module for PeakIO IO classes. 
@@ -22,25 +22,25 @@ # MACS3 modules # ------------------------------------ -from MACS3.Utilities.Constants import * +# from MACS3.Utilities.Constants import * # ------------------------------------ # Other modules # ------------------------------------ - -from cpython cimport bool +import cython +from cython.cimports.cpython import bool # ------------------------------------ # constants # ------------------------------------ -__version__ = "PeakIO $Revision$" -__author__ = "Tao Liu " -__doc__ = "PeakIO class" # ------------------------------------ # Misc functions # ------------------------------------ -cdef str subpeak_letters( int i): + + +@cython.cfunc +def subpeak_letters(i: cython.int) -> str: if i < 26: return chr(97+i) else: @@ -50,24 +50,32 @@ # Classes # ------------------------------------ -cdef class PeakContent: - cdef: - bytes chrom - int start - int end - int length - int summit - float score - float pileup - float pscore - float fc - float qscore - bytes name - - def __init__ ( self, bytes chrom, int start, int end, int summit, - float peak_score, float pileup, - float pscore, float fold_change, float qscore, - bytes name= b"NA" ): + +@cython.cclass +class PeakContent: + chrom: bytes + start: cython.int + end: cython.int + length: cython.int + summit: cython.int + score: cython.float + pileup: cython.float + pscore: cython.float + fc: cython.float + qscore: cython.float + name: bytes + + def __init__(self, + chrom: bytes, + start: cython.int, + end: cython.int, + summit: cython.int, + peak_score: cython.float, + pileup: cython.float, + pscore: cython.float, + fold_change: cython.float, + qscore: cython.float, + name: bytes = b"NA"): self.chrom = chrom self.start = start self.end = end @@ -80,7 +88,7 @@ def __init__ ( self, bytes chrom, int start, int end, int summit, self.qscore = qscore self.name = name - def __getitem__ ( self, a ): + def __getitem__(self, a: str): if a == "chrom": return self.chrom elif a == "start": @@ -104,7 +112,7 @@ def 
__getitem__ ( self, a ): elif a == "name": return self.name - def __setitem__ ( self, a, v ): + def __setitem__(self, a: str, v): if a == "chrom": self.chrom = v elif a == "start": @@ -128,27 +136,42 @@ def __setitem__ ( self, a, v ): elif a == "name": self.name = v - def __str__ (self): - return "chrom:%s;start:%d;end:%d;score:%f" % ( self.chrom, self.start, self.end, self.score ) + def __str__(self): + return "chrom:%s;start:%d;end:%d;score:%f" % (self.chrom, + self.start, + self.end, + self.score) + -cdef class PeakIO: +@cython.cclass +class PeakIO: """IO for peak information. """ - cdef: - public dict peaks # dictionary storing peak contents - public bool CO_sorted # whether peaks have been sorted by coordinations - public long total # total number of peaks - - def __init__ (self): + # dictionary storing peak contents + peaks = cython.declare(dict, visibility="public") + # whether peaks have been sorted by coordinations + CO_sorted = cython.declare(bool, visibility="public") + # total number of peaks + total = cython.declare(cython.long, visibility="public") + + def __init__(self): self.peaks = {} self.CO_sorted = False self.total = 0 - cpdef add (self, bytes chromosome, int start, int end, int summit = 0, - float peak_score = 0, float pileup = 0, - float pscore = 0, float fold_change = 0, float qscore = 0, - bytes name = b"NA"): + @cython.ccall + def add(self, + chromosome: bytes, + start: cython.int, + end: cython.int, + summit: cython.int = 0, + peak_score: cython.float = 0, + pileup: cython.float = 0, + pscore: cython.float = 0, + fold_change: cython.float = 0, + qscore: cython.float = 0, + name: bytes = b"NA"): """items: start:start end:end, @@ -161,154 +184,165 @@ def __init__ (self): qscore:qscore """ if not self.peaks.has_key(chromosome): - self.peaks[chromosome]=[] - self.peaks[chromosome].append(PeakContent( chromosome, start, end, summit, peak_score, pileup, pscore, fold_change, qscore, name)) + self.peaks[chromosome] = [] + 
self.peaks[chromosome].append(PeakContent(chromosome, + start, + end, + summit, + peak_score, + pileup, + pscore, + fold_change, + qscore, + name)) self.total += 1 self.CO_sorted = False - cpdef add_PeakContent ( self, bytes chromosome, object peakcontent ): + @cython.ccall + def add_PeakContent(self, + chromosome: bytes, + peakcontent: PeakContent): if not self.peaks.has_key(chromosome): - self.peaks[chromosome]=[] + self.peaks[chromosome] = [] self.peaks[chromosome].append(peakcontent) self.total += 1 self.CO_sorted = False - cpdef list get_data_from_chrom (self, bytes chrom): - if not self.peaks.has_key( chrom ): - self.peaks[chrom]= [] + @cython.ccall + def get_data_from_chrom(self, chrom: bytes) -> list: + if not self.peaks.has_key(chrom): + self.peaks[chrom] = [] return self.peaks[chrom] - cpdef set get_chr_names (self): + def get_chr_names(self) -> set: return set(sorted(self.peaks.keys())) - def sort ( self ): - cdef: - list chrs - bytes chrom + def sort(self): + chrs: list + chrom: bytes + # sort by position if self.CO_sorted: # if already sorted, quit return chrs = sorted(list(self.peaks.keys())) for chrom in sorted(chrs): - self.peaks[chrom].sort(key=lambda x:x['start']) + self.peaks[chrom].sort(key=lambda x: x['start']) self.CO_sorted = True return - cpdef object randomly_pick ( self, int n, int seed = 12345 ): + @cython.ccall + def randomly_pick(self, n: cython.int, seed: cython.int = 12345): """Shuffle the peaks and get n peaks out of it. Return a new PeakIO object. 
""" - cdef: - list all_pc - list chrs - bytes chrom - object ret_peakio, p + all_pc: list + chrs: list + chrom: bytes + ret_peakio: PeakIO + p: PeakContent + assert n > 0 chrs = sorted(list(self.peaks.keys())) all_pc = [] for chrom in sorted(chrs): all_pc.extend(self.peaks[chrom]) - random.seed( seed ) - random.shuffle( all_pc ) + random.seed(seed) + random.shuffle(all_pc) all_pc = all_pc[:n] ret_peakio = PeakIO() for p in all_pc: - ret_peakio.add_PeakContent ( p["chrom"], p ) + ret_peakio.add_PeakContent(p["chrom"], p) return ret_peakio - - cpdef void filter_pscore (self, double pscore_cut ): - cdef: - bytes chrom - dict new_peaks - list chrs - object p + + @cython.ccall + def filter_pscore(self, pscore_cut: cython.double): + chrom: bytes + new_peaks: dict + chrs: list + new_peaks = {} chrs = sorted(list(self.peaks.keys())) self.total = 0 for chrom in sorted(chrs): - new_peaks[chrom]=[p for p in self.peaks[chrom] if p['pscore'] >= pscore_cut] - self.total += len( new_peaks[chrom] ) + new_peaks[chrom] = [p for p in self.peaks[chrom] if p['pscore'] >= pscore_cut] + self.total += len(new_peaks[chrom]) self.peaks = new_peaks self.CO_sorted = True self.sort() - cpdef void filter_qscore (self, double qscore_cut ): - cdef: - bytes chrom - dict new_peaks - list chrs - object p + @cython.ccall + def filter_qscore(self, qscore_cut: cython.double): + chrom: bytes + new_peaks: dict + chrs: list new_peaks = {} chrs = sorted(list(self.peaks.keys())) self.total = 0 for chrom in sorted(chrs): - new_peaks[chrom]=[p for p in self.peaks[chrom] if p['qscore'] >= qscore_cut] - self.total += len( new_peaks[chrom] ) + new_peaks[chrom] = [p for p in self.peaks[chrom] if p['qscore'] >= qscore_cut] + self.total += len(new_peaks[chrom]) self.peaks = new_peaks self.CO_sorted = True self.sort() - cpdef void filter_fc (self, float fc_low, float fc_up = 0 ): + @cython.ccall + def filter_fc(self, fc_low: cython.float, fc_up: cython.float = 0): """Filter peaks in a given fc range. 
If fc_low and fc_up is assigned, the peaks with fc in [fc_low,fc_up) """ - cdef: - bytes chrom - dict new_peaks - list chrs - object p + chrom: bytes + new_peaks: dict + chrs: list new_peaks = {} chrs = list(self.peaks.keys()) self.total = 0 if fc_up > 0 and fc_up > fc_low: for chrom in sorted(chrs): - new_peaks[chrom]=[p for p in self.peaks[chrom] if p['fc'] >= fc_low and p['fc']= fc_low and p['fc'] < fc_up] + self.total += len(new_peaks[chrom]) else: for chrom in sorted(chrs): - new_peaks[chrom]=[p for p in self.peaks[chrom] if p['fc'] >= fc_low] - self.total += len( new_peaks[chrom] ) + new_peaks[chrom] = [p for p in self.peaks[chrom] if p['fc'] >= fc_low] + self.total += len(new_peaks[chrom]) self.peaks = new_peaks self.CO_sorted = True self.sort() - cpdef void filter_score (self, float lower_score, float upper_score = 0 ): + def filter_score(self, lower_score: cython.float, upper_score: cython.float = 0): """Filter peaks in a given score range. """ - cdef: - bytes chrom - dict new_peaks - list chrs - object p + chrom: bytes + new_peaks: dict + chrs: list new_peaks = {} chrs = list(self.peaks.keys()) self.total = 0 if upper_score > 0 and upper_score > lower_score: for chrom in sorted(chrs): - new_peaks[chrom]=[p for p in self.peaks[chrom] if p['score'] >= lower_score and p['score']= lower_score and p['score'] < upper_score] + self.total += len(new_peaks[chrom]) else: for chrom in sorted(chrs): - new_peaks[chrom]=[p for p in self.peaks[chrom] if p['score'] >= lower_score] - self.total += len( new_peaks[chrom] ) + new_peaks[chrom] = [p for p in self.peaks[chrom] if p['score'] >= lower_score] + self.total += len(new_peaks[chrom]) self.peaks = new_peaks self.CO_sorted = True self.sort() - def __str__ (self): + def __str__(self): """convert to text -- for debug """ - cdef: - list chrs - int n_peak - str ret + chrs: list + n_peak: cython.int + ret: str + ret = "" chrs = list(self.peaks.keys()) n_peak = 0 @@ -318,38 +352,44 @@ def __str__ (self): peaks = list(group) 
if len(peaks) > 1: for i, peak in enumerate(peaks): - ret += "chrom:%s\tstart:%d\tend:%d\tname:peak_%d%s\tscore:%.6g\tsummit:%d\n" % (chrom.decode(),peak['start'],peak['end'],n_peak,subpeak_letters(i),peak["score"],peak["summit"]) + ret += "chrom:%s\tstart:%d\tend:%d\tname:peak_%d%s\tscore:%.6g\tsummit:%d\n" % (chrom.decode(), peak['start'], peak['end'], n_peak, subpeak_letters(i), peak["score"], peak["summit"]) else: peak = peaks[0] - ret += "chrom:%s\tstart:%d\tend:%d\tname:peak_%d\tscore:%.6g\tsummit:%d\n" % (chrom.decode(),peak['start'],peak['end'],n_peak,peak["score"],peak["summit"]) - + ret += "chrom:%s\tstart:%d\tend:%d\tname:peak_%d\tscore:%.6g\tsummit:%d\n" % (chrom.decode(), peak['start'], peak['end'], n_peak, peak["score"], peak["summit"]) return ret - cdef void _to_bed(self, bytes name_prefix=b"%s_peak_", bytes name=b"MACS", - bytes description=b"%s", str score_column="score", - bool trackline=False, print_func=sys.stdout.write): + @cython.cfunc + def _to_bed(self, + name_prefix: bytes = b"%s_peak_", + name: bytes = b"MACS", + description: bytes = b"%s", + score_column: str = "score", + trackline: bool = False, + print_func=sys.stdout.write): """ generalization of tobed and write_to_bed """ - cdef: - list chrs - int n_peak - bytes peakprefix, desc + chrs: list + n_peak: cython.int + peakprefix: bytes + desc: bytes + chrs = list(self.peaks.keys()) n_peak = 0 try: peakprefix = name_prefix % name - except: + except Exception: peakprefix = name_prefix try: desc = description % name - except: + except Exception: desc = description + if trackline: try: - print_func('track name="%s (peaks)" description="%s" visibility=1\n' % ( name.replace(b"\"", b"\\\"").decode(), - desc.replace(b"\"", b"\\\"").decode() ) ) - except: + print_func('track name="%s (peaks)" description="%s" visibility=1\n' % (name.replace(b"\"", b"\\\"").decode(), + desc.replace(b"\"", b"\\\"").decode())) + except Exception: print_func('track name=MACS description=Unknown\n') for chrom in 
sorted(chrs): for end, group in groupby(self.peaks[chrom], key=itemgetter("end")): @@ -357,27 +397,43 @@ def __str__ (self): peaks = list(group) if len(peaks) > 1: for i, peak in enumerate(peaks): - print_func("%s\t%d\t%d\t%s%d%s\t%.6g\n" % (chrom.decode(),peak['start'],peak['end'],peakprefix.decode(),n_peak,subpeak_letters(i),peak[score_column])) + print_func("%s\t%d\t%d\t%s%d%s\t%.6g\n" % (chrom.decode(), peak['start'], peak['end'], peakprefix.decode(), n_peak, subpeak_letters(i), peak[score_column])) else: peak = peaks[0] - print_func("%s\t%d\t%d\t%s%d\t%.6g\n" % (chrom.decode(),peak['start'],peak['end'],peakprefix.decode(),n_peak,peak[score_column])) - - cdef _to_summits_bed(self, bytes name_prefix=b"%s_peak_", bytes name=b"MACS", - bytes description = b"%s", str score_column="score", - bool trackline=False, print_func=sys.stdout.write): + print_func("%s\t%d\t%d\t%s%d\t%.6g\n" % (chrom.decode(), peak['start'], peak['end'], peakprefix.decode(), n_peak, peak[score_column])) + + @cython.cfunc + def _to_summits_bed(self, + name_prefix: bytes = b"%s_peak_", + name: bytes = b"MACS", + description: bytes = b"%s", + score_column: str = "score", + trackline: bool = False, + print_func=sys.stdout.write): """ generalization of to_summits_bed and write_to_summit_bed """ + chrs: list + n_peak: cython.int + peakprefix: bytes + desc: bytes + chrs = list(self.peaks.keys()) n_peak = 0 - try: peakprefix = name_prefix % name - except: peakprefix = name_prefix - try: desc = description % name - except: desc = description + try: + peakprefix = name_prefix % name + except Exception: + peakprefix = name_prefix + try: + desc = description % name + except Exception: + desc = description if trackline: - try: print_func('track name="%s (summits)" description="%s" visibility=1\n' % ( name.replace(b"\"", b"\\\"").decode(),\ - desc.replace(b"\"", b"\\\"").decode() ) ) - except: print_func('track name=MACS description=Unknown') + try: + print_func('track name="%s (summits)" description="%s" 
visibility=1\n' % (name.replace(b"\"", b"\\\"").decode(), + desc.replace(b"\"", b"\\\"").decode())) + except Exception: + print_func('track name=MACS description=Unknown') for chrom in sorted(chrs): for end, group in groupby(self.peaks[chrom], key=itemgetter("end")): n_peak += 1 @@ -385,13 +441,13 @@ def __str__ (self): if len(peaks) > 1: for i, peak in enumerate(peaks): summit_p = peak['summit'] - print_func("%s\t%d\t%d\t%s%d%s\t%.6g\n" % (chrom.decode(),summit_p,summit_p+1,peakprefix.decode(),n_peak,subpeak_letters(i),peak[score_column])) + print_func("%s\t%d\t%d\t%s%d%s\t%.6g\n" % (chrom.decode(), summit_p, summit_p+1, peakprefix.decode(), n_peak, subpeak_letters(i), peak[score_column])) else: peak = peaks[0] summit_p = peak['summit'] - print_func("%s\t%d\t%d\t%s%d\t%.6g\n" % (chrom.decode(),summit_p,summit_p+1,peakprefix.decode(),n_peak,peak[score_column])) + print_func("%s\t%d\t%d\t%s%d\t%.6g\n" % (chrom.decode(), summit_p, summit_p+1, peakprefix.decode(), n_peak, peak[score_column])) - def tobed (self): + def tobed(self): """Print out peaks in BED5 format. Five columns are chromosome, peak start, peak end, peak name, and peak height. @@ -408,7 +464,7 @@ def tobed (self): """ return self._to_bed(name_prefix=b"peak_", score_column="score", name=b"", description=b"") - def to_summits_bed (self): + def to_summits_bed(self): """Print out peak summits in BED5 format. Five columns are chromosome, summit start, summit end, peak name, and peak height. 
@@ -417,8 +473,12 @@ def to_summits_bed (self): return self._to_summits_bed(name_prefix=b"peak_", score_column="score", name=b"", description=b"") # these methods are very fast, specifying types is unnecessary - def write_to_bed (self, fhd, bytes name_prefix=b"peak_", bytes name=b"MACS", - bytes description = b"%s", str score_column="score", trackline=True): + def write_to_bed(self, fhd, + name_prefix: bytes = b"peak_", + name: bytes = b"MACS", + description: bytes = b"%s", + score_column: str = "score", + trackline: bool = True): """Write peaks in BED5 format in a file handler. Score (5th column) is decided by score_column setting. Check the following list. Name column ( 4th column) is made by putting @@ -439,13 +499,20 @@ def write_to_bed (self, fhd, bytes name_prefix=b"peak_", bytes name=b"MACS", fc:fold_change, qscore:qvalue """ - #print(description) - return self._to_bed(name_prefix=name_prefix, name=name, - description=description, score_column=score_column, - print_func=fhd.write, trackline=trackline) - - def write_to_summit_bed (self, fhd, bytes name_prefix = b"peak_", bytes name = b"MACS", - bytes description = b"%s", str score_column ="score", trackline=True): + # print(description) + return self._to_bed(name_prefix=name_prefix, + name=name, + description=description, + score_column=score_column, + print_func=fhd.write, + trackline=trackline) + + def write_to_summit_bed(self, fhd, + name_prefix: bytes = b"%s_peak_", + name: bytes = b"MACS", + description: bytes = b"%s", + score_column: str = "score", + trackline: bool = False): """Write peak summits in BED5 format in a file handler. Score (5th column) is decided by score_column setting. Check the following list. 
Name column ( 4th column) is made by putting @@ -469,7 +536,11 @@ def write_to_summit_bed (self, fhd, bytes name_prefix = b"peak_", bytes name = b description=description, score_column=score_column, print_func=fhd.write, trackline=trackline) - def write_to_narrowPeak (self, fhd, bytes name_prefix = b"peak_", bytes name = b"peak", str score_column="score", trackline=True): + def write_to_narrowPeak(self, fhd, + name_prefix: bytes = b"%s_peak_", + name: bytes = b"peak", + score_column: str = "score", + trackline: bool = False): """Print out peaks in narrowPeak format. This format is designed for ENCODE project, and basically a @@ -523,33 +594,41 @@ def write_to_narrowPeak (self, fhd, bytes name_prefix = b"peak_", bytes name = b +-----------+------+----------------------------------------+ """ - cdef int n_peak - cdef bytes chrom - cdef long s - cdef str peakname + n_peak: cython.int + chrom: bytes + s: cython.long + peakname: str chrs = list(self.peaks.keys()) n_peak = 0 write = fhd.write - try: peakprefix = name_prefix % name - except: peakprefix = name_prefix + try: + peakprefix = name_prefix % name + except Exception: + peakprefix = name_prefix if trackline: write("track type=narrowPeak name=\"%s\" description=\"%s\" nextItemButton=on\n" % (name.decode(), name.decode())) for chrom in sorted(chrs): for end, group in groupby(self.peaks[chrom], key=itemgetter("end")): n_peak += 1 these_peaks = list(group) - if len(these_peaks) > 1: # from call-summits + if len(these_peaks) > 1: # from call-summits for i, peak in enumerate(these_peaks): peakname = "%s%d%s" % (peakprefix.decode(), n_peak, subpeak_letters(i)) if peak['summit'] == -1: s = -1 else: s = peak['summit'] - peak['start'] - fhd.write( "%s\t%d\t%d\t%s\t%d\t.\t%.6g\t%.6g\t%.6g\t%d\n" - % - (chrom.decode(),peak['start'],peak['end'],peakname,int(10*peak[score_column]), - peak['fc'],peak['pscore'],peak['qscore'],s) ) + fhd.write("%s\t%d\t%d\t%s\t%d\t.\t%.6g\t%.6g\t%.6g\t%d\n" % + (chrom.decode(), + peak['start'], + 
peak['end'], + peakname, + int(10*peak[score_column]), + peak['fc'], + peak['pscore'], + peak['qscore'], + s)) else: peak = these_peaks[0] peakname = "%s%d" % (peakprefix.decode(), n_peak) @@ -557,13 +636,22 @@ def write_to_narrowPeak (self, fhd, bytes name_prefix = b"peak_", bytes name = b s = -1 else: s = peak['summit'] - peak['start'] - fhd.write( "%s\t%d\t%d\t%s\t%d\t.\t%.6g\t%.6g\t%.6g\t%d\n" - % - (chrom.decode(),peak['start'],peak['end'],peakname,int(10*peak[score_column]), - peak['fc'],peak['pscore'],peak['qscore'],s) ) + fhd.write("%s\t%d\t%d\t%s\t%d\t.\t%.6g\t%.6g\t%.6g\t%d\n" % + (chrom.decode(), + peak['start'], + peak['end'], + peakname, + int(10*peak[score_column]), + peak['fc'], + peak['pscore'], + peak['qscore'], + s)) return - def write_to_xls (self, ofhd, bytes name_prefix = b"%s_peak_", bytes name = b"MACS"): + @cython.ccall + def write_to_xls(self, ofhd, + name_prefix: bytes = b"%s_peak_", + name: bytes = b"MACS"): """Save the peak results in a tab-delimited plain text file with suffix .xls. @@ -571,11 +659,19 @@ def write_to_xls (self, ofhd, bytes name_prefix = b"%s_peak_", bytes name = b"MA wait... why I have two write_to_xls in this class? 
""" + peakprefix: bytes + chrs: list + these_peaks: list + n_peak: cython.int + i: cython.int + write = ofhd.write - write("\t".join(("chr","start", "end", "length", "abs_summit", "pileup", "-log10(pvalue)", "fold_enrichment", "-log10(qvalue)", "name"))+"\n") + write("\t".join(("chr", "start", "end", "length", "abs_summit", "pileup", "-log10(pvalue)", "fold_enrichment", "-log10(qvalue)", "name"))+"\n") - try: peakprefix = name_prefix % name - except: peakprefix = name_prefix + try: + peakprefix = name_prefix % name + except Exception: + peakprefix = name_prefix peaks = self.peaks chrs = list(peaks.keys()) @@ -587,47 +683,56 @@ def write_to_xls (self, ofhd, bytes name_prefix = b"%s_peak_", bytes name = b"MA if len(these_peaks) > 1: for i, peak in enumerate(these_peaks): peakname = "%s%d%s" % (peakprefix.decode(), n_peak, subpeak_letters(i)) - #[start,end,end-start,summit,peak_height,number_tags,pvalue,fold_change,qvalue] - write("%s\t%d\t%d\t%d" % (chrom.decode(),peak['start']+1,peak['end'],peak['length'])) - write("\t%d" % (peak['summit']+1)) # summit position - write("\t%.6g" % (round(peak['pileup'],2))) # pileup height at summit - write("\t%.6g" % (peak['pscore'])) # -log10pvalue at summit - write("\t%.6g" % (peak['fc'])) # fold change at summit - write("\t%.6g" % (peak['qscore'])) # -log10qvalue at summit + # [start,end,end-start,summit,peak_height,number_tags,pvalue,fold_change,qvalue] + write("%s\t%d\t%d\t%d" % (chrom.decode(), + peak['start']+1, + peak['end'], + peak['length'])) + write("\t%d" % (peak['summit']+1)) # summit position + write("\t%.6g" % (round(peak['pileup'], 2))) # pileup height at summit + write("\t%.6g" % (peak['pscore'])) # -log10pvalue at summit + write("\t%.6g" % (peak['fc'])) # fold change at summit + write("\t%.6g" % (peak['qscore'])) # -log10qvalue at summit write("\t%s" % peakname) write("\n") else: peak = these_peaks[0] peakname = "%s%d" % (peakprefix.decode(), n_peak) - 
#[start,end,end-start,summit,peak_height,number_tags,pvalue,fold_change,qvalue] - write("%s\t%d\t%d\t%d" % (chrom.decode(),peak['start']+1,peak['end'],peak['length'])) - write("\t%d" % (peak['summit']+1)) # summit position - write("\t%.6g" % (round(peak['pileup'],2))) # pileup height at summit - write("\t%.6g" % (peak['pscore'])) # -log10pvalue at summit - write("\t%.6g" % (peak['fc'])) # fold change at summit - write("\t%.6g" % (peak['qscore'])) # -log10qvalue at summit + # [start,end,end-start,summit,peak_height,number_tags,pvalue,fold_change,qvalue] + write("%s\t%d\t%d\t%d" % (chrom.decode(), + peak['start']+1, + peak['end'], + peak['length'])) + write("\t%d" % (peak['summit']+1)) # summit position + write("\t%.6g" % (round(peak['pileup'], 2))) # pileup height at summit + write("\t%.6g" % (peak['pscore'])) # -log10pvalue at summit + write("\t%.6g" % (peak['fc'])) # fold change at summit + write("\t%.6g" % (peak['qscore'])) # -log10qvalue at summit write("\t%s" % peakname) write("\n") return - - cpdef void exclude (self, object peaksio2): + @cython.ccall + def exclude(self, peaksio2: object): """ Remove overlapping peaks in peaksio2, another PeakIO object. 
""" - cdef: - dict peaks1, peaks2 - list chrs1, chrs2 - bytes k - dict ret_peaks - bool overlap_found - object r1, r2 # PeakContent objects - long n_rl1, n_rl2 + peaks1: dict + peaks2: dict + chrs1: list + chrs2: list + k: bytes + ret_peaks: dict + overlap_found: bool + r1: PeakContent + r2: PeakContent + n_rl1: cython.long + n_rl2: cython.long self.sort() peaks1 = self.peaks self.total = 0 - assert isinstance(peaksio2,PeakIO) + assert isinstance(peaksio2, PeakIO) peaksio2.sort() peaks2 = peaksio2.peaks @@ -638,44 +743,44 @@ def write_to_xls (self, ofhd, bytes name_prefix = b"%s_peak_", bytes name = b"MA #print(f"chromosome {k}") if not chrs2.count(k): # no such chromosome in peaks1, then don't touch the peaks in this chromosome - ret_peaks[ k ] = peaks1[ k ] + ret_peaks[k] = peaks1[k] continue - ret_peaks[ k ] = [] - n_rl1 = len( peaks1[k] ) - n_rl2 = len( peaks2[k] ) - rl1_k = iter( peaks1[k] ).__next__ - rl2_k = iter( peaks2[k] ).__next__ + ret_peaks[k] = [] + n_rl1 = len(peaks1[k]) + n_rl2 = len(peaks2[k]) + rl1_k = iter(peaks1[k]).__next__ + rl2_k = iter(peaks2[k]).__next__ overlap_found = False r1 = rl1_k() n_rl1 -= 1 r2 = rl2_k() n_rl2 -= 1 - while ( True ): + while (True): # we do this until there is no r1 or r2 left. 
if r2["start"] < r1["end"] and r1["start"] < r2["end"]: # since we found an overlap, r1 will be skipped/excluded # and move to the next r1 overlap_found = True - #print(f"found overlap of {r1['start']} {r1['end']} and {r2['start']} {r2['end']}, move to the next r1") + # print(f"found overlap of {r1['start']} {r1['end']} and {r2['start']} {r2['end']}, move to the next r1") n_rl1 -= 1 if n_rl1 >= 0: r1 = rl1_k() - #print(f"move to next r1 {r1['start']} {r1['end']}") + # print(f"move to next r1 {r1['start']} {r1['end']}") overlap_found = False continue else: break if r1["end"] < r2["end"]: - #print(f"now we need to move r1 {r1['start']} {r1['end']}") + # print(f"now we need to move r1 {r1['start']} {r1['end']}") # in this case, we need to move to the next r1, # we will check if overlap_found is true, if not, we put r1 in a new dict if not overlap_found: - #print(f"we add this r1 {r1['start']} {r1['end']} to list") - ret_peaks[ k ].append( r1 ) + # print(f"we add this r1 {r1['start']} {r1['end']} to list") + ret_peaks[k].append(r1) n_rl1 -= 1 if n_rl1 >= 0: r1 = rl1_k() - #print(f"move to next r1 {r1['start']} {r1['end']}") + # print(f"move to next r1 {r1['start']} {r1['end']}") overlap_found = False else: # no more r1 left @@ -685,54 +790,61 @@ def write_to_xls (self, ofhd, bytes name_prefix = b"%s_peak_", bytes name = b"MA if n_rl2: r2 = rl2_k() n_rl2 -= 1 - #print(f"move to next r2 {r2['start']} {r2['end']}") + # print(f"move to next r2 {r2['start']} {r2['end']}") else: # no more r2 left break # add the rest of r1 - #print( f"n_rl1: {n_rl1} n_rl2:{n_rl2} last overlap_found is {overlap_found}" ) - #if overlap_found: + # print( f"n_rl1: {n_rl1} n_rl2:{n_rl2} last overlap_found is {overlap_found}" ) + # if overlap_found: # n_rl1 -= 1 if n_rl1 >= 0: - ret_peaks[ k ].extend( peaks1[ k ][-n_rl1-1:] ) + ret_peaks[k].extend(peaks1[k][-n_rl1-1:]) for k in ret_peaks.keys(): - self.total += len( ret_peaks[ k ] ) + self.total += len(ret_peaks[k]) self.peaks = ret_peaks 
self.CO_sorted = True - self.sort() + self.sort() return - def read_from_xls (self, ofhd): + @cython.ccall + def read_from_xls(self, ofhd): """Save the peak results in a tab-delimited plain text file with suffix .xls. """ - cdef: - bytes line = b'' - bytes chrom = b'' - int n_peak = 0 - int start, end, length, summit - float pileup, pscore, fc, qscore - list fields + line: bytes = b'' + chrom: bytes = b'' + start: cython.int + end: cython.int + length: cython.int + summit: cython.int + pileup: cython.float + pscore: cython.float + fc: cython.float + qscore: cython.float + fields: list + while True: - if not (line.startswith('#') or line.strip() == ''): break + if not (line.startswith('#') or line.strip() == ''): + break line = ofhd.readline() # sanity check columns = line.rstrip().split('\t') - for a,b in zip(columns, ("chr","start", "end", "length", "abs_summit", - "pileup", "-log10(pvalue)", "fold_enrichment", - "-log10(qvalue)", "name")): - if not a==b: raise NotImplementedError('column %s not recognized', a) + for a, b in zip(columns, ("chr", "start", "end", "length", "abs_summit", + "pileup", "-log10(pvalue)", "fold_enrichment", + "-log10(qvalue)", "name")): + if not a == b: + raise NotImplementedError('column %s not recognized', a) add = self.add split = str.split rstrip = str.rstrip for i, line in enumerate(ofhd.readlines()): fields = split(line, '\t') - peak = {} chrom = fields[0].encode() start = int(fields[1]) - 1 end = int(fields[2]) @@ -748,68 +860,62 @@ def read_from_xls (self, ofhd): add(chrom, start, end, summit, qscore, pileup, pscore, fc, qscore, peakname) -cpdef parse_peakname(peakname): - """returns peaknumber, subpeak - """ - cdef: - bytes peak_id, peaknumber, subpeak - peak_id = peakname.split(b'_')[-1] - x = re.split('(\D.*)', peak_id) - peaknumber = int(x[0]) - try: - subpeak = x[1] - except IndexError: - subpeak = b'' - return (peaknumber, subpeak) - -cdef class RegionIO: + +@cython.cclass +class RegionIO: """For plain region of chrom, start 
and end """ - cdef: - dict regions - bool __flag_sorted + regions: dict + __flag_sorted: bool - def __init__ (self): - self.regions= {} + def __init__(self): + self.regions = {} self.__flag_sorted = False - cpdef void add_loc ( self, bytes chrom, int start, int end ): + @cython.ccall + def add_loc(self, chrom: bytes, start: cython.int, end: cython.int): if self.regions.has_key(chrom): - self.regions[chrom].append( (start,end) ) + self.regions[chrom].append((start, end)) else: - self.regions[chrom] = [(start,end), ] + self.regions[chrom] = [(start, end), ] self.__flag_sorted = False return - cpdef void sort (self): - cdef bytes chrom + @cython.ccall + def sort(self): + chrom: bytes for chrom in sorted(list(self.regions.keys())): self.regions[chrom].sort() self.__flag_sorted = True - cpdef set get_chr_names (self): + @cython.ccall + def get_chr_names(self) -> set: return set(sorted(self.regions.keys())) - cpdef void merge_overlap ( self ): + @cython.ccall + def merge_overlap(self): """ merge overlapping regions """ - cdef: - bytes chrom - int s_new_region, e_new_region, i, j - dict regions, new_regions - list chrs, regions_chr - tuple prev_region + chrom: bytes + s_new_region: cython.int + e_new_region: cython.int + i: cython.int + regions: dict + new_regions: dict + chrs: list + regions_chr: list + prev_region: tuple if not self.__flag_sorted: self.sort() regions = self.regions new_regions = {} - chrs = sorted( list( regions.keys() ) ) - for i in range( len( chrs ) ): + chrs = sorted(list(regions.keys())) + for i in range(len(chrs)): chrom = chrs[i] - new_regions[chrom]=[] + new_regions[chrom] = [] n_append = new_regions[chrom].append prev_region = None regions_chr = regions[chrom] @@ -821,7 +927,7 @@ def __init__ (self): if regions_chr[i][0] <= prev_region[1]: s_new_region = prev_region[0] e_new_region = regions_chr[i][1] - prev_region = (s_new_region,e_new_region) + prev_region = (s_new_region, e_new_region) else: n_append(prev_region) prev_region = regions_chr[i] 
@@ -831,43 +937,53 @@ def __init__ (self): self.sort() return - cpdef write_to_bed (self, fhd ): - cdef: - int i - bytes chrom - list chrs - tuple region + @cython.ccall + def write_to_bed(self, fhd): + i: cython.int + chrom: bytes + chrs: list + region: tuple chrs = sorted(list(self.regions.keys())) - for i in range( len(chrs) ): + for i in range(len(chrs)): chrom = chrs[i] for region in self.regions[chrom]: - fhd.write( "%s\t%d\t%d\n" % (chrom.decode(),region[0],region[1] ) ) - - -cdef class BroadPeakContent: - cdef: - long start - long end - long length - float score - bytes thickStart - bytes thickEnd - long blockNum - bytes blockSizes - bytes blockStarts - float pileup - float pscore - float fc - float qscore - bytes name - - def __init__ ( self, long start, long end, float score, - bytes thickStart, bytes thickEnd, - long blockNum, bytes blockSizes, - bytes blockStarts, float pileup, - float pscore, float fold_change, - float qscore, bytes name = b"NA" ): + fhd.write("%s\t%d\t%d\n" % (chrom.decode(), + region[0], + region[1])) + + +@cython.cclass +class BroadPeakContent: + start: cython.int + end: cython.int + length: cython.int + score: cython.float + thickStart: bytes + thickEnd: bytes + blockNum: cython.int + blockSizes: bytes + blockStarts: bytes + pileup: cython.float + pscore: cython.float + fc: cython.float + qscore: cython.float + name: bytes + + def __init__(self, + start: cython.int, + end: cython.int, + score: cython.float, + thickStart: bytes, + thickEnd: bytes, + blockNum: cython.int, + blockSizes: bytes, + blockStarts: bytes, + pileup: cython.float, + pscore: cython.float, + fold_change: cython.float, + qscore: cython.float, + name: bytes = b"NA"): self.start = start self.end = end self.score = score @@ -876,7 +992,6 @@ def __init__ ( self, long start, long end, float score, self.blockNum = blockNum self.blockSizes = blockSizes self.blockStarts = blockStarts - self.length = end - start self.pileup = pileup self.pscore = pscore @@ -884,7 +999,7 
@@ def __init__ ( self, long start, long end, float score, self.qscore = qscore self.name = name - def __getitem__ ( self, a ): + def __getitem__(self, a): if a == "start": return self.start elif a == "end": @@ -914,26 +1029,36 @@ def __getitem__ ( self, a ): elif a == "name": return self.name - def __str__ (self): - return "start:%d;end:%d;score:%f" % ( self.start, self.end, self.score ) + def __str__(self): + return "start:%d;end:%d;score:%f" % (self.start, self.end, self.score) -cdef class BroadPeakIO: +@cython.cclass +class BroadPeakIO: """IO for broad peak information. """ - cdef: - dict peaks + peaks: dict - def __init__ (self): + def __init__(self): self.peaks = {} - def add (self, char * chromosome, long start, long end, long score = 0, - bytes thickStart=b".", bytes thickEnd=b".", - long blockNum=0, bytes blockSizes=b".", - bytes blockStarts=b".", float pileup = 0, - float pscore = 0, float fold_change = 0, - float qscore = 0, bytes name = b"NA" ): + @cython.ccall + def add(self, + chromosome: bytes, + start: cython.int, + end: cython.int, + score: cython.float = 0.0, + thickStart: bytes = b".", + thickEnd: bytes = b".", + blockNum: cython.int = 0, + blockSizes: bytes = b".", + blockStarts: bytes = b".", + pileup: cython.float = 0, + pscore: cython.float = 0, + fold_change: cython.float = 0, + qscore: cython.float = 0, + name: bytes = b"NA"): """items chromosome : chromosome name, start : broad region start, @@ -952,81 +1077,97 @@ def add (self, char * chromosome, long start, long end, long score = 0, """ if not self.peaks.has_key(chromosome): self.peaks[chromosome] = [] - self.peaks[chromosome].append( BroadPeakContent( start, end, score, thickStart, thickEnd, - blockNum, blockSizes, blockStarts, - pileup, pscore, fold_change, qscore, name ) ) - - def filter_pscore (self, double pscore_cut ): - cdef: - bytes chrom - dict peaks - dict new_peaks - list chrs - BroadPeakContent p + self.peaks[chromosome].append(BroadPeakContent(start, + end, + score, + 
thickStart, + thickEnd, + blockNum, + blockSizes, + blockStarts, + pileup, + pscore, + fold_change, + qscore, + name)) + + @cython.ccall + def filter_pscore(self, pscore_cut: cython.float): + chrom: bytes + peaks: dict + new_peaks: dict + chrs: list peaks = self.peaks new_peaks = {} chrs = list(peaks.keys()) for chrom in sorted(chrs): - new_peaks[chrom]=[p for p in peaks[chrom] if p['pscore'] >= pscore_cut] + new_peaks[chrom] = [p for p in peaks[chrom] if p['pscore'] >= pscore_cut] self.peaks = new_peaks - def filter_qscore (self, double qscore_cut ): - cdef: - bytes chrom - dict peaks - dict new_peaks - list chrs - BroadPeakContent p + @cython.ccall + def filter_qscore(self, qscore_cut: cython.float): + chrom: bytes + peaks: dict + new_peaks: dict + chrs: list peaks = self.peaks new_peaks = {} chrs = list(peaks.keys()) for chrom in sorted(chrs): - new_peaks[chrom]=[p for p in peaks[chrom] if p['qscore'] >= qscore_cut] + new_peaks[chrom] = [p for p in peaks[chrom] if p['qscore'] >= qscore_cut] self.peaks = new_peaks - def filter_fc (self, fc_low, fc_up=None ): + @cython.ccall + def filter_fc(self, fc_low: float, fc_up: float = -1): """Filter peaks in a given fc range. - If fc_low and fc_up is assigned, the peaks with fc in [fc_low,fc_up) + If fc_low and fc_up is assigned, the peaks with fc in + [fc_low,fc_up) + + fc_up has to be a positive number, otherwise it won't be + applied. 
""" - cdef: - bytes chrom - dict peaks - dict new_peaks - list chrs - BroadPeakContent p + chrom: bytes + peaks: dict + new_peaks: dict + chrs: list peaks = self.peaks new_peaks = {} chrs = list(peaks.keys()) - if fc_up: + if fc_up >= 0: for chrom in sorted(chrs): - new_peaks[chrom]=[p for p in peaks[chrom] if p['fc'] >= fc_low and p['fc']= fc_low and p['fc'] < fc_up] else: for chrom in sorted(chrs): - new_peaks[chrom]=[p for p in peaks[chrom] if p['fc'] >= fc_low] + new_peaks[chrom] = [p for p in peaks[chrom] if p['fc'] >= fc_low] self.peaks = new_peaks - def total (self): - cdef: - bytes chrom - dict peaks - list chrs - long x + @cython.ccall + def total(self): + chrom: bytes + peaks: dict + chrs: list + x: cython.long = 0 peaks = self.peaks chrs = list(peaks.keys()) - x = 0 for chrom in sorted(chrs): x += len(peaks[chrom]) return x - def write_to_gappedPeak (self, fhd, bytes name_prefix=b"peak_", bytes name=b'peak', bytes description=b"%s", str score_column="score", trackline=True): + @cython.ccall + def write_to_gappedPeak(self, fhd, + name_prefix: bytes = b"peak_", + name: bytes = b'peak', + description: bytes = b"%s", + score_column: str = "score", + trackline: bool = True): """Print out peaks in gappedBed format. Only those with stronger enrichment regions are saved. This format is basically a BED12+3 format. 
@@ -1095,24 +1236,49 @@ def write_to_gappedPeak (self, fhd, bytes name_prefix=b"peak_", bytes name=b'pea +--------------+------+----------------------------------------+ """ + chrs: list + n_peak: cython.int = 0 + peak: BroadPeakContent + desc: bytes + peakprefix: bytes + chrom: bytes + chrs = list(self.peaks.keys()) - n_peak = 0 - try: peakprefix = name_prefix % name - except: peakprefix = name_prefix - try: desc = description % name - except: desc = description + try: + peakprefix = name_prefix % name + except Exception: + peakprefix = name_prefix + try: + desc = description % name + except Exception: + desc = description if trackline: - fhd.write("track name=\"%s\" description=\"%s\" type=gappedPeak nextItemButton=on\n" % (name.decode(), desc.decode()) ) + fhd.write("track name=\"%s\" description=\"%s\" type=gappedPeak nextItemButton=on\n" % (name.decode(), desc.decode())) for chrom in sorted(chrs): for peak in self.peaks[chrom]: n_peak += 1 if peak["thickStart"] != b".": - fhd.write( "%s\t%d\t%d\t%s%d\t%d\t.\t0\t0\t0\t%d\t%s\t%s\t%.6g\t%.6g\t%.6g\n" - % - (chrom.decode(),peak["start"],peak["end"],peakprefix.decode(),n_peak,int(10*peak[score_column]), - peak["blockNum"],peak["blockSizes"].decode(),peak["blockStarts"].decode(), peak['fc'], peak['pscore'], peak['qscore'] ) ) - - def write_to_Bed12 (self, fhd, bytes name_prefix=b"peak_", bytes name=b'peak', bytes description=b"%s", str score_column="score", trackline=True): + fhd.write("%s\t%d\t%d\t%s%d\t%d\t.\t0\t0\t0\t%d\t%s\t%s\t%.6g\t%.6g\t%.6g\n" % + (chrom.decode(), + peak["start"], + peak["end"], + peakprefix.decode(), + n_peak, + int(10*peak[score_column]), + peak["blockNum"], + peak["blockSizes"].decode(), + peak["blockStarts"].decode(), + peak['fc'], + peak['pscore'], + peak['qscore'])) + + @cython.ccall + def write_to_Bed12(self, fhd, + name_prefix: bytes = b"peak_", + name: bytes = b'peak', + description: bytes = b"%s", + score_column: str = "score", + trackline: bool = True): """Print out peaks in 
Bed12 format. +--------------+------+----------------------------------------+ @@ -1167,31 +1333,58 @@ def write_to_Bed12 (self, fhd, bytes name_prefix=b"peak_", bytes name=b'peak', b +--------------+------+----------------------------------------+ """ + chrs: list + n_peak: cython.int = 0 + peakprefix: bytes + peak: BroadPeakContent + desc: bytes + peakprefix: bytes + chrom: bytes + chrs = list(self.peaks.keys()) - n_peak = 0 - try: peakprefix = name_prefix % name - except: peakprefix = name_prefix - try: desc = description % name - except: desc = description + try: + peakprefix = name_prefix % name + except Exception: + peakprefix = name_prefix + try: + desc = description % name + except Exception: + desc = description if trackline: - fhd.write("track name=\"%s\" description=\"%s\" type=bed nextItemButton=on\n" % (name.decode(), desc.decode()) ) + fhd.write("track name=\"%s\" description=\"%s\" type=bed nextItemButton=on\n" % (name.decode(), desc.decode())) for chrom in sorted(chrs): for peak in self.peaks[chrom]: n_peak += 1 if peak["thickStart"] == b".": # this will violate gappedPeak format, since it's a complement like broadPeak line. 
- fhd.write( "%s\t%d\t%d\t%s%d\t%d\t.\n" - % - (chrom.decode(),peak["start"],peak["end"],peakprefix.decode(),n_peak,int(10*peak[score_column]) ) ) + fhd.write("%s\t%d\t%d\t%s%d\t%d\t.\n" % + (chrom.decode(), + peak["start"], + peak["end"], + peakprefix.decode(), + n_peak, + int(10*peak[score_column]))) else: - fhd.write( "%s\t%d\t%d\t%s%d\t%d\t.\t%s\t%s\t0\t%d\t%s\t%s\n" - % - (chrom.decode(), peak["start"], peak["end"], peakprefix.decode(), n_peak, int(10*peak[score_column]), - peak["thickStart"].decode(), peak["thickEnd"].decode(), - peak["blockNum"], peak["blockSizes"].decode(), peak["blockStarts"].decode() )) - - - def write_to_broadPeak (self, fhd, bytes name_prefix=b"peak_", bytes name=b'peak', bytes description=b"%s", str score_column="score", trackline=True): + fhd.write("%s\t%d\t%d\t%s%d\t%d\t.\t%s\t%s\t0\t%d\t%s\t%s\n" % + (chrom.decode(), + peak["start"], + peak["end"], + peakprefix.decode(), + n_peak, + int(10*peak[score_column]), + peak["thickStart"].decode(), + peak["thickEnd"].decode(), + peak["blockNum"], + peak["blockSizes"].decode(), + peak["blockStarts"].decode())) + + @cython.ccall + def write_to_broadPeak(self, fhd, + name_prefix: bytes = b"peak_", + name: bytes = b'peak', + description: bytes = b"%s", + score_column: str = "score", + trackline: bool = True): """Print out peaks in broadPeak format. 
This format is designed for ENCODE project, and basically a @@ -1241,16 +1434,20 @@ def write_to_broadPeak (self, fhd, bytes name_prefix=b"peak_", bytes name=b'peak +-----------+------+----------------------------------------+ """ - cdef int n_peak - cdef bytes chrom - cdef long s - cdef str peakname + chrs: list + n_peak: cython.int = 0 + peakprefix: bytes + peak: BroadPeakContent + peakprefix: bytes + chrom: bytes + peakname: str chrs = list(self.peaks.keys()) - n_peak = 0 write = fhd.write - try: peakprefix = name_prefix % name - except: peakprefix = name_prefix + try: + peakprefix = name_prefix % name + except Exception: + peakprefix = name_prefix if trackline: write("track type=broadPeak name=\"%s\" description=\"%s\" nextItemButton=on\n" % (name.decode(), name.decode())) for chrom in sorted(chrs): @@ -1259,13 +1456,21 @@ def write_to_broadPeak (self, fhd, bytes name_prefix=b"peak_", bytes name=b'peak these_peaks = list(group) peak = these_peaks[0] peakname = "%s%d" % (peakprefix.decode(), n_peak) - fhd.write( "%s\t%d\t%d\t%s\t%d\t.\t%.6g\t%.6g\t%.6g\n" % - (chrom.decode(),peak['start'],peak['end'],peakname,int(10*peak[score_column]), - peak['fc'],peak['pscore'],peak['qscore'] ) ) + fhd.write("%s\t%d\t%d\t%s\t%d\t.\t%.6g\t%.6g\t%.6g\n" % + (chrom.decode(), + peak['start'], + peak['end'], + peakname, + int(10*peak[score_column]), + peak['fc'], + peak['pscore'], + peak['qscore'])) return - - def write_to_xls (self, ofhd, bytes name_prefix=b"%s_peak_", bytes name=b"MACS"): + @cython.ccall + def write_to_xls(self, ofhd, + name_prefix: bytes = b"%s_peak_", + name: bytes = b"MACS"): """Save the peak results in a tab-delimited plain text file with suffix .xls. @@ -1273,11 +1478,21 @@ def write_to_xls (self, ofhd, bytes name_prefix=b"%s_peak_", bytes name=b"MACS") wait... why I have two write_to_xls in this class? 
""" + chrom: bytes + chrs: list + peakprefix: bytes + peaks: dict + these_peaks: list + peak: BroadPeakContent + peakname: str + write = ofhd.write - write("\t".join(("chr","start", "end", "length", "pileup", "-log10(pvalue)", "fold_enrichment", "-log10(qvalue)", "name"))+"\n") + write("\t".join(("chr", "start", "end", "length", "pileup", "-log10(pvalue)", "fold_enrichment", "-log10(qvalue)", "name"))+"\n") - try: peakprefix = name_prefix % name - except: peakprefix = name_prefix + try: + peakprefix = name_prefix % name + except Exception: + peakprefix = name_prefix peaks = self.peaks chrs = list(peaks.keys()) @@ -1288,11 +1503,14 @@ def write_to_xls (self, ofhd, bytes name_prefix=b"%s_peak_", bytes name=b"MACS") these_peaks = list(group) peak = these_peaks[0] peakname = "%s%d" % (peakprefix.decode(), n_peak) - write("%s\t%d\t%d\t%d" % (chrom.decode(),peak['start']+1,peak['end'],peak['length'])) - write("\t%.6g" % (round(peak['pileup'],2))) # pileup height at summit - write("\t%.6g" % (peak['pscore'])) # -log10pvalue at summit - write("\t%.6g" % (peak['fc'])) # fold change at summit - write("\t%.6g" % (peak['qscore'])) # -log10qvalue at summit + write("%s\t%d\t%d\t%d" % (chrom.decode(), + peak['start']+1, + peak['end'], + peak['length'])) + write("\t%.6g" % (round(peak['pileup'], 2))) # pileup height at summit + write("\t%.6g" % (peak['pscore'])) # -log10pvalue at summit + write("\t%.6g" % (peak['fc'])) # fold change at summit + write("\t%.6g" % (peak['qscore'])) # -log10qvalue at summit write("\t%s" % peakname) write("\n") return diff --git a/MACS3/Signal/CallPeakUnit.pyx b/MACS3/Signal/CallPeakUnit.pyx index c6ffb7b8..c83aba7e 100644 --- a/MACS3/Signal/CallPeakUnit.pyx +++ b/MACS3/Signal/CallPeakUnit.pyx @@ -1,7 +1,7 @@ # cython: language_level=3 # cython: profile=True # cython: linetrace=True -# Time-stamp: <2022-09-15 17:06:17 Tao Liu> +# Time-stamp: <2024-10-10 16:45:01 Tao Liu> """Module for Calculate Scores. 
@@ -46,7 +46,7 @@ from libc.math cimport exp,log,log10, M_LN10, log1p, erf, sqrt, floor, ceil # MACS3 modules # ------------------------------------ from MACS3.Signal.SignalProcessing import maxima, enforce_valleys, enforce_peakyness -from MACS3.IO.PeakIO import PeakIO, BroadPeakIO, parse_peakname +from MACS3.IO.PeakIO import PeakIO, BroadPeakIO from MACS3.Signal.FixWidthTrack import FWTrack from MACS3.Signal.PairedEndTrack import PETrackI from MACS3.Signal.Prob import poisson_cdf diff --git a/MACS3/Signal/PairedEndTrack.py b/MACS3/Signal/PairedEndTrack.py new file mode 100644 index 00000000..8273495a --- /dev/null +++ b/MACS3/Signal/PairedEndTrack.py @@ -0,0 +1,730 @@ +# cython: language_level=3 +# cython: profile=True +# Time-stamp: <2024-10-10 17:03:45 Tao Liu> + +"""Module for filter duplicate tags from paired-end data + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD License (see the file LICENSE included with +the distribution). +""" + +# ------------------------------------ +# Python modules +# ------------------------------------ +import io +import sys +from array import array as pyarray +from collections import Counter + +# ------------------------------------ +# MACS3 modules +# ------------------------------------ +from MACS3.Signal.Pileup import (quick_pileup, + over_two_pv_array, + se_all_in_one_pileup) +from MACS3.Signal.BedGraph import bedGraphTrackI +from MACS3.Signal.PileupV2 import pileup_from_LR_hmmratac +# ------------------------------------ +# Other modules +# ------------------------------------ +import cython +import numpy as np +import cython.cimports.numpy as cnp +from cython.cimports.cpython import bool +from cython.cimports.libc.stdint import INT32_MAX as INT_MAX + +from MACS3.Utilities.Logger import logging + +logger = logging.getLogger(__name__) +debug = logger.debug +info = logger.info + +# Let numpy enforce PE-ness using ndarray, gives bonus speedup when sorting +# PE data doesn't 
have strandedness + + +@cython.cclass +class PETrackI: + """Paired End Locations Track class I along the whole genome + (commonly with the same annotation type), which are stored in a + dict. + + Locations are stored and organized by sequence names (chr names) in a + dict. They can be sorted by calling self.sort() function. + """ + __locations = cython.declare(dict, visibility="public") + __size = cython.declare(dict, visibility="public") + __buf_size = cython.declare(dict, visibility="public") + __sorted = cython.declare(bool, visibility="public") + total = cython.declare(cython.ulong, visibility="public") + annotation = cython.declare(str, visibility="public") + rlengths = cython.declare(dict, visibility="public") + buffer_size = cython.declare(cython.long, visibility="public") + length = cython.declare(cython.long, visibility="public") + average_template_length = cython.declare(cython.float, visibility="public") + __destroyed: bool + + def __init__(self, anno: str = "", buffer_size: cython.long = 100000): + """fw is the fixed-width for all locations. + + """ + # dictionary with chrname as key, nparray with + # [('l','int32'),('r','int32')] as value + self.__locations = {} + # dictionary with chrname as key, size of the above nparray as value + self.__size = {} + # dictionary with chrname as key, size of the above nparray as value + self.__buf_size = {} + self.__sorted = False + self.total = 0 # total fragments + self.annotation = anno # need to be figured out + self.rlengths = {} + self.buffer_size = buffer_size + self.length = 0 + self.average_template_length = 0.0 + + @cython.ccall + def add_loc(self, chromosome: bytes, + start: cython.int, end: cython.int): + """Add a location to the list according to the sequence name. 
+ + chromosome -- mostly the chromosome name + fiveendpos -- 5' end pos, left for plus strand, right for neg strand + """ + i: cython.int + + if chromosome not in self.__locations: + self.__buf_size[chromosome] = self.buffer_size + # note: ['l'] is the leftmost end, ['r'] is the rightmost end of fragment. + self.__locations[chromosome] = np.zeros(shape=self.buffer_size, + dtype=[('l', 'i4'), ('r', 'i4')]) + self.__locations[chromosome][0] = (start, end) + self.__size[chromosome] = 1 + else: + i = self.__size[chromosome] + if self.__buf_size[chromosome] == i: + self.__buf_size[chromosome] += self.buffer_size + self.__locations[chromosome].resize((self.__buf_size[chromosome]), + refcheck=False) + self.__locations[chromosome][i] = (start, end) + self.__size[chromosome] = i + 1 + self.length += end - start + return + + @cython.ccall + def destroy(self): + """Destroy this object and release mem. + """ + chrs: set + chromosome: bytes + + chrs = self.get_chr_names() + for chromosome in sorted(chrs): + if chromosome in self.__locations: + self.__locations[chromosome].resize(self.buffer_size, + refcheck=False) + self.__locations[chromosome].resize(0, + refcheck=False) + self.__locations[chromosome] = None + self.__locations.pop(chromosome) + self.__destroyed = True + return + + @cython.ccall + def set_rlengths(self, rlengths: dict) -> bool: + """Set reference chromosome lengths dictionary. + + Only the chromosome existing in this petrack object will be updated. + + If a chromosome in this petrack is not covered by given + rlengths, and it has no associated length, it will be set as + maximum integer. 
+ """ + valid_chroms: set + missed_chroms: set + chrom: bytes + + valid_chroms = set(self.__locations.keys()).intersection(rlengths.keys()) + for chrom in sorted(valid_chroms): + self.rlengths[chrom] = rlengths[chrom] + missed_chroms = set(self.__locations.keys()).difference(rlengths.keys()) + for chrom in sorted(missed_chroms): + self.rlengths[chrom] = INT_MAX + return True + + @cython.ccall + def get_rlengths(self) -> dict: + """Get reference chromosome lengths dictionary. + + If self.rlengths is empty, create a new dict where the length of + chromosome will be set as the maximum integer. + """ + if not self.rlengths: + self.rlengths = dict([(k, INT_MAX) for k in self.__locations.keys()]) + return self.rlengths + + @cython.ccall + def finalize(self): + """ Resize np arrays for 5' positions and sort them in place + + Note: If this function is called, it's impossible to append more files to this FWTrack object. So remember to call it after all the files are read! + """ + c: bytes + chrnames: set + + self.total = 0 + + chrnames = self.get_chr_names() + + for c in chrnames: + self.__locations[c].resize((self.__size[c]), refcheck=False) + self.__locations[c].sort(order=['l', 'r']) + self.total += self.__size[c] + + self.__sorted = True + self.average_template_length = cython.cast(cython.float, self.length) / self.total + return + + @cython.ccall + def get_locations_by_chr(self, chromosome: bytes): + """Return a tuple of two lists of locations for certain chromosome. + + """ + if chromosome in self.__locations: + return self.__locations[chromosome] + else: + raise Exception("No such chromosome name (%s) in TrackI object!\n" % (chromosome)) + + @cython.ccall + def get_chr_names(self) -> set: + """Return all the chromosome names in this track object as a python set. + """ + return set(self.__locations.keys()) + + @cython.ccall + def sort(self): + """Naive sorting for locations. 
+ + """ + c: bytes + chrnames: set + + chrnames = self.get_chr_names() + + for c in chrnames: + self.__locations[c].sort(order=['l', 'r']) # sort by the leftmost location + self.__sorted = True + return + + @cython.ccall + def count_fraglengths(self) -> dict: + """Return a dictionary of the counts for sizes/fragment + lengths of each pair. + + This function is for HMMRATAC. + + """ + sizes: cnp.ndarray(cnp.int32_t, ndim=1) + s: cython.int + locs: cnp.ndarray + chrnames: list + i: cython.int + + counter = Counter() + chrnames = list(self.get_chr_names()) + for i in range(len(chrnames)): + locs = self.__locations[chrnames[i]] + sizes = locs['r'] - locs['l'] + for s in sizes: + counter[s] += 1 + return dict(counter) + + @cython.ccall + def fraglengths(self) -> cnp.ndarray: + """Return the sizes/fragment lengths of each pair. + + This function is for HMMRATAC EM training. + """ + sizes: cnp.ndarray(np.int32_t, ndim=1) + locs: cnp.ndarray + chrnames: list + i: cython.int + + chrnames = list(self.get_chr_names()) + locs = self.__locations[chrnames[0]] + sizes = locs['r'] - locs['l'] + for i in range(1, len(chrnames)): + locs = self.__locations[chrnames[i]] + sizes = np.concatenate((sizes, locs['r'] - locs['l'])) + return sizes + + @cython.boundscheck(False) # do not check that np indices are valid + @cython.ccall + def filter_dup(self, maxnum: cython.int = -1): + """Filter the duplicated reads. + + Run it right after you add all data into this object. 
+ """ + n: cython.int + loc_start: cython.int + loc_end: cython.int + current_loc_start: cython.int + current_loc_end: cython.int + i: cython.ulong + locs_size: cython.ulong + k: bytes + locs: cnp.ndarray + chrnames: set + selected_idx: cnp.ndarray + + if maxnum < 0: + return # condition to return if not filtering + + if not self.__sorted: + self.sort() + + self.total = 0 + # self.length = 0 + self.average_template_length = 0.0 + + chrnames = self.get_chr_names() + + for k in chrnames: # for each chromosome + locs = self.__locations[k] + locs_size = locs.shape[0] + if locs_size == 1: + # do nothing and continue + continue + # discard duplicate reads and make a new __locations[k] + # initialize boolean array as all TRUE, or all being kept + selected_idx = np.ones(locs_size, dtype=bool) + # get the first loc + (current_loc_start, current_loc_end) = locs[0] + i = 1 # index of new_locs + n = 1 # the number of tags in the current genomic location + for i in range(1, locs_size): + (loc_start, loc_end) = locs[i] + if loc_start != current_loc_start or loc_end != current_loc_end: + # not the same, update currnet_loc_start/end/l, reset n + current_loc_start = loc_start + current_loc_end = loc_end + n = 1 + continue + else: + # both ends are the same, add 1 to duplicate number n + n += 1 + if n > maxnum: + # change the flag to False + selected_idx[i] = False + # subtract current_loc_l from self.length + self.length -= current_loc_end - current_loc_start + self.__locations[k] = locs[selected_idx] + self.__size[k] = self.__locations[k].shape[0] + self.total += self.__size[k] + # free memory? + # I know I should shrink it to 0 size directly, + # however, on Mac OSX, it seems directly assigning 0 + # doesn't do a thing. 
+ selected_idx.resize(self.buffer_size, refcheck=False) + selected_idx.resize(0, refcheck=False) + self.average_template_length = self.length / self.total + return + + @cython.ccall + def sample_percent(self, percent: cython.float, seed: cython.int = -1): + """Sample the tags for a given percentage. + + Warning: the current object is changed! If a new PETrackI is + wanted, use sample_percent_copy instead. + + """ + # num: number of reads allowed on a certain chromosome + num: cython.uint + k: bytes + chrnames: set + + self.total = 0 + self.length = 0 + self.average_template_length = 0.0 + + chrnames = self.get_chr_names() + + if seed >= 0: + info(f"# A random seed {seed} has been used") + rs = np.random.RandomState(np.random.MT19937(np.random.SeedSequence(seed))) + rs_shuffle = rs.shuffle + else: + rs_shuffle = np.random.shuffle + + for k in sorted(chrnames): + # for each chromosome. + # This loop body is too big, I may need to split code later... + + num = cython.cast(cython.uint, + round(self.__locations[k].shape[0] * percent, 5)) + rs_shuffle(self.__locations[k]) + self.__locations[k].resize(num, refcheck=False) + self.__locations[k].sort(order=['l', 'r']) # sort by leftmost positions + self.__size[k] = self.__locations[k].shape[0] + self.length += (self.__locations[k]['r'] - self.__locations[k]['l']).sum() + self.total += self.__size[k] + self.average_template_length = cython.cast(cython.float, self.length)/self.total + return + + @cython.ccall + def sample_percent_copy(self, percent: cython.float, seed: cython.int = -1): + """Sample the tags for a given percentage. 
Return a new PETrackI object + + """ + # num: number of reads allowed on a certain chromosome + num: cython.uint + k: bytes + chrnames: set + ret_petrackI: PETrackI + loc: cnp.ndarray + + ret_petrackI = PETrackI(anno=self.annotation, buffer_size=self.buffer_size) + chrnames = self.get_chr_names() + + if seed >= 0: + info(f"# A random seed {seed} has been used in the sampling function") + rs = np.random.default_rng(seed) + else: + rs = np.random.default_rng() + + rs_shuffle = rs.shuffle + + # chrnames need to be sorted otherwise we can't assure reproducibility + for k in sorted(chrnames): + # for each chromosome. + # This loop body is too big, I may need to split code later... + loc = np.copy(self.__locations[k]) + num = cython.cast(cython.uint, round(loc.shape[0] * percent, 5)) + rs_shuffle(loc) + loc.resize(num, refcheck=False) + loc.sort(order=['l', 'r']) # sort by leftmost positions + ret_petrackI.__locations[k] = loc + ret_petrackI.__size[k] = loc.shape[0] + ret_petrackI.length += (loc['r'] - loc['l']).sum() + ret_petrackI.total += ret_petrackI.__size[k] + ret_petrackI.average_template_length = cython.cast(cython.float, ret_petrackI.length)/ret_petrackI.total + ret_petrackI.set_rlengths(self.get_rlengths()) + return ret_petrackI + + @cython.ccall + def sample_num(self, samplesize: cython.ulong, seed: cython.int = -1): + """Sample the tags for a given number. + + Warning: the current object is changed! + """ + percent: cython.float + + percent = cython.cast(cython.float, samplesize)/self.total + self.sample_percent(percent, seed) + return + + @cython.ccall + def sample_num_copy(self, samplesize: cython.ulong, seed: cython.int = -1): + """Sample the tags for a given number. + + Warning: the current object is changed! + """ + percent: cython.float + + percent = cython.cast(cython.float, samplesize)/self.total + return self.sample_percent_copy(percent, seed) + + @cython.ccall + def print_to_bed(self, fhd=None): + """Output to BEDPE format files. 
If fhd is given, write to a + file, otherwise, output to standard output. + + """ + i: cython.int + s: cython.int + e: cython.int + k: bytes + chrnames: set + + if not fhd: + fhd = sys.stdout + assert isinstance(fhd, io.IOBase) + + chrnames = self.get_chr_names() + + for k in chrnames: + # for each chromosome. + # This loop body is too big, I may need to split code later... + + locs = self.__locations[k] + + for i in range(locs.shape[0]): + s, e = locs[i] + fhd.write("%s\t%d\t%d\n" % (k.decode(), s, e)) + return + + @cython.ccall + def pileup_a_chromosome(self, + chrom: bytes, + scale_factor_s: list, + baseline_value: cython.float = 0.0) -> list: + """pileup a certain chromosome, return [p,v] (end position and + value) list. + + scale_factor_s : linearly scale the pileup value applied to + each d in ds. The list should have the same + length as ds. + + baseline_value : a value to be filled for missing values, and + will be the minimum pileup. + + """ + tmp_pileup: list + prev_pileup: list + scale_factor: cython.float + + prev_pileup = None + + for i in range(len(scale_factor_s)): + scale_factor = scale_factor_s[i] + + # Can't directly pass partial nparray there since that will mess up with pointer calculation. + tmp_pileup = quick_pileup(np.sort(self.__locations[chrom]['l']), + np.sort(self.__locations[chrom]['r']), + scale_factor, baseline_value) + + if prev_pileup: + prev_pileup = over_two_pv_array(prev_pileup, + tmp_pileup, + func="max") + else: + prev_pileup = tmp_pileup + + return prev_pileup + + @cython.ccall + def pileup_a_chromosome_c(self, + chrom: bytes, + ds: list, + scale_factor_s: list, + baseline_value: cython.float = 0.0) -> list: + """pileup a certain chromosome, return [p,v] (end position and + value) list. + + This function is for control track. Basically, here is a + simplified function from FixWidthTrack. We pretend the PE is + SE data and left read is on plus strand and right read is on + minus strand. 
+ + ds : tag will be extended to this value to 3' direction, + unless directional is False. Can contain multiple + extension values. Final pileup will the maximum. + scale_factor_s : linearly scale the pileup value applied to + each d in ds. The list should have the same + length as ds. + baseline_value : a value to be filled for missing values, and + will be the minimum pileup. + """ + tmp_pileup: list + prev_pileup: list + scale_factor: cython.float + d: cython.long + five_shift: cython.long + three_shift: cython.long + rlength: cython.long = self.get_rlengths()[chrom] + + if not self.__sorted: + self.sort() + + assert len(ds) == len(scale_factor_s), "ds and scale_factor_s must have the same length!" + + prev_pileup = None + + for i in range(len(scale_factor_s)): + d = ds[i] + scale_factor = scale_factor_s[i] + five_shift = d//2 + three_shift = d//2 + + tmp_pileup = se_all_in_one_pileup(self.__locations[chrom]['l'], + self.__locations[chrom]['r'], + five_shift, + three_shift, + rlength, + scale_factor, + baseline_value) + + if prev_pileup: + prev_pileup = over_two_pv_array(prev_pileup, + tmp_pileup, + func="max") + else: + prev_pileup = tmp_pileup + + return prev_pileup + + @cython.ccall + def pileup_bdg(self, + scale_factor_s: list, + baseline_value: cython.float = 0.0): + """pileup all chromosomes, and return a bedGraphTrackI object. + + scale_factor_s : linearly scale the pileup value applied to + each d in ds. The list should have the same + length as ds. + + baseline_value : a value to be filled for missing values, and + will be the minimum pileup. + + """ + tmp_pileup: list + prev_pileup: list + scale_factor: cython.float + chrom: bytes + bdg: bedGraphTrackI + + bdg = bedGraphTrackI(baseline_value=baseline_value) + + for chrom in sorted(self.get_chr_names()): + prev_pileup = None + for i in range(len(scale_factor_s)): + scale_factor = scale_factor_s[i] + + # Can't directly pass partial nparray there since that + # will mess up with pointer calculation. 
@cython.cclass
class PEtrackII(PETrackI):
    """Paired-end fragment track that also keeps a barcode and a count
    for every fragment.

    On top of what PETrackI stores, each fragment record carries its
    count in the 'c' field of the per-chromosome location array, and a
    parallel per-chromosome array holds an integer key for the
    fragment's barcode. `__barcode_dict` maps that integer key back to
    the original barcode bytes.

    NOTE(review): the barcode keys live in arrays separate from
    `__locations`. Any inherited method that reorders or filters
    `__locations` in place would presumably leave `__barcodes` out of
    step with it -- confirm before relying on barcodes after such a
    call.

    NOTE(review): `__locations`, `__size` and `__buf_size` are
    double-underscore names used here from a subclass; under plain
    Python name mangling they would not resolve to the parent's
    attributes -- confirm this module is only used Cython-compiled.
    """
    # chromosome name -> int64 array of barcode keys, kept parallel to
    # __locations[chromosome] (declared name now matches the name the
    # methods actually use)
    __barcodes = cython.declare(dict, visibility="public")
    # barcode key (hash of the barcode bytes) -> barcode bytes
    __barcode_dict = cython.declare(dict, visibility="public")
    # declared for per-fragment counts; no method of this class fills
    # it -- counts are written to the 'c' field of __locations
    __counts = cython.declare(dict, visibility="public")

    def __init__(self, args):
        """Initialise the PETrackI state and the empty barcode containers.

        args: kept for interface compatibility; not used here.
        """
        # The lookup has to start from this class, otherwise
        # PETrackI.__init__ is skipped and the inherited containers
        # are never created.
        super(PEtrackII, self).__init__()
        self.__barcodes = {}
        self.__barcode_dict = {}

    @cython.ccall
    def add_frag(self,
                 chromosome: bytes,
                 start: cython.int,
                 end: cython.int,
                 barcode: bytes,
                 count: cython.uchar):
        """Add a fragment, with its barcode and count, to the list
        according to the sequence name.

        chromosome: mostly the chromosome name
        start: left position of the fragment
        end: right position of the fragment
        barcode: the barcode of the fragment
        count: the count of the fragment
        """
        i: cython.int
        h: cython.Py_ssize_t

        # hash() of bytes is salted per interpreter run, so these keys
        # are only meaningful inside the current process.
        h = hash(barcode)
        self.__barcode_dict[h] = barcode

        if chromosome not in self.__locations:
            self.__buf_size[chromosome] = self.buffer_size
            # note: ['l'] is the leftmost end, ['r'] is the rightmost
            # end of fragment, ['c'] is the count of the fragment.
            self.__locations[chromosome] = np.zeros(shape=self.buffer_size,
                                                    dtype=[('l', 'i4'), ('r', 'i4'), ('c', 'u1')])
            # hash() values are Py_ssize_t wide; they overflow 'i4' on
            # 64-bit builds, so the keys are stored as 'i8'.
            self.__barcodes[chromosome] = np.zeros(shape=self.buffer_size,
                                                   dtype='i8')
            i = 0
        else:
            i = self.__size[chromosome]
            if self.__buf_size[chromosome] == i:
                # Buffer is full: the two parallel arrays must grow
                # together, or the barcode write below goes out of
                # bounds.
                self.__buf_size[chromosome] += self.buffer_size
                self.__locations[chromosome].resize(self.__buf_size[chromosome],
                                                    refcheck=False)
                self.__barcodes[chromosome].resize(self.__buf_size[chromosome],
                                                   refcheck=False)
        self.__locations[chromosome][i] = (start, end, count)
        self.__barcodes[chromosome][i] = h
        self.__size[chromosome] = i + 1
        self.length += end - start
        return

    @cython.ccall
    def destroy(self):
        """Destroy this object and release mem.

        Every per-chromosome location array and barcode array is shrunk
        and dropped, the barcode lookup table is emptied and the object
        is flagged as destroyed.
        """
        chrs: set
        chromosome: bytes

        chrs = self.get_chr_names()
        for chromosome in sorted(chrs):
            if chromosome in self.__locations:
                # shrink in two steps (buffer size, then 0) before
                # dropping the reference
                self.__locations[chromosome].resize(self.buffer_size,
                                                    refcheck=False)
                self.__locations[chromosome].resize(0,
                                                    refcheck=False)
                self.__locations.pop(chromosome)
            if chromosome in self.__barcodes:
                # resize the per-chromosome array; the dict itself has
                # no resize()
                self.__barcodes[chromosome].resize(self.buffer_size,
                                                   refcheck=False)
                self.__barcodes[chromosome].resize(0,
                                                   refcheck=False)
                self.__barcodes.pop(chromosome)
        self.__barcode_dict = {}
        self.__destroyed = True
        return
-""" - -# ------------------------------------ -# Python modules -# ------------------------------------ -import io -import sys -from copy import copy -from array import array as pyarray -from collections import Counter - -import logging -import MACS3.Utilities.Logger - -logger = logging.getLogger(__name__) -debug = logger.debug -info = logger.info -# ------------------------------------ -# MACS3 modules -# ------------------------------------ -from MACS3.Utilities.Constants import * -from MACS3.Signal.Pileup import quick_pileup, over_two_pv_array, se_all_in_one_pileup -from MACS3.Signal.BedGraph import bedGraphTrackI -from MACS3.Signal.PileupV2 import pileup_from_LR_hmmratac -# ------------------------------------ -# Other modules -# ------------------------------------ -import numpy as np -cimport numpy as np -from numpy cimport uint8_t, uint16_t, uint32_t, uint64_t, int8_t, int16_t, int32_t, int64_t, float32_t, float64_t -from cpython cimport bool -cimport cython - - -cdef INT_MAX = (((-1))>>1) - -# We don't use the following structs anymore -# cdef packed struct peLoc: -# int32_t l -# int32_t r - -# cdef class PETrackChromosome: -# cdef: -# public np.ndarray locations -# public uint32_t pointer -# public uint32_t buffer_size -# public uint64_t coverage -# public uint64_t chrlen -# uint32_t __buffer_increment -# bool __sorted -# bool __destroyed - -# Let numpy enforce PE-ness using ndarray, gives bonus speedup when sorting -# PE data doesn't have strandedness - -cdef class PETrackI: - """Paired End Locations Track class I along the whole genome - (commonly with the same annotation type), which are stored in a - dict. - - Locations are stored and organized by sequence names (chr names) in a - dict. They can be sorted by calling self.sort() function. 
- """ - cdef: - public dict __locations - public dict __size - public dict __buf_size - public bool __sorted - public uint64_t total - public object annotation - public dict rlengths - public int64_t buffer_size - public int64_t length - public float32_t average_template_length - bool __destroyed - - def __init__ (self, char * anno="", int64_t buffer_size = 100000 ): - """fw is the fixed-width for all locations. - - """ - self.__locations = {} # dictionary with chrname as key, nparray with [('l','int32'),('r','int32')] as value - self.__size = {} # dictionary with chrname as key, size of the above nparray as value - self.__buf_size = {} # dictionary with chrname as key, size of the above nparray as value - self.__sorted = False - self.total = 0 # total fragments - self.annotation = anno # need to be figured out - self.rlengths = {} - self.buffer_size = buffer_size - self.length = 0 - self.average_template_length = 0.0 - - cpdef void add_loc ( self, bytes chromosome, int32_t start, int32_t end): - """Add a location to the list according to the sequence name. - - chromosome -- mostly the chromosome name - fiveendpos -- 5' end pos, left for plus strand, right for neg strand - """ - cdef: - int32_t i - - if chromosome not in self.__locations: - self.__buf_size[chromosome] = self.buffer_size - self.__locations[chromosome] = np.zeros(shape=self.buffer_size, dtype=[('l','int32'),('r','int32')]) # note: ['l'] is the leftmost end, ['r'] is the rightmost end of fragment. - self.__locations[chromosome][0] = ( start, end ) - self.__size[chromosome] = 1 - else: - i = self.__size[chromosome] - if self.__buf_size[chromosome] == i: - self.__buf_size[chromosome] += self.buffer_size - self.__locations[chromosome].resize((self.__buf_size[chromosome]), refcheck = False ) - self.__locations[chromosome][ i ] = ( start, end ) - self.__size[chromosome] = i + 1 - self.length += end - start - return - - cpdef void destroy ( self ): - """Destroy this object and release mem. 
- """ - cdef: - set chrs - bytes chromosome - - chrs = self.get_chr_names() - for chromosome in sorted(chrs): - if chromosome in self.__locations: - self.__locations[chromosome].resize( self.buffer_size, refcheck=False ) - self.__locations[chromosome].resize( 0, refcheck=False ) - self.__locations[chromosome] = None - self.__locations.pop(chromosome) - self.__destroyed = True - return - - cpdef bint set_rlengths ( self, dict rlengths ): - """Set reference chromosome lengths dictionary. - - Only the chromosome existing in this petrack object will be updated. - - If a chromosome in this petrack is not covered by given - rlengths, and it has no associated length, it will be set as - maximum integer. - """ - cdef: - set valid_chroms, missed_chroms - bytes chrom - - valid_chroms = set(self.__locations.keys()).intersection(rlengths.keys()) - for chrom in sorted(valid_chroms): - self.rlengths[chrom] = rlengths[chrom] - missed_chroms = set(self.__locations.keys()).difference(rlengths.keys()) - for chrom in sorted(missed_chroms): - self.rlengths[chrom] = INT_MAX - return True - - cpdef dict get_rlengths ( self ): - """Get reference chromosome lengths dictionary. - - If self.rlengths is empty, create a new dict where the length of - chromosome will be set as the maximum integer. - """ - if not self.rlengths: - self.rlengths = dict([(k, INT_MAX) for k in self.__locations.keys()]) - return self.rlengths - - cpdef void finalize ( self ): - """ Resize np arrays for 5' positions and sort them in place - - Note: If this function is called, it's impossible to append more files to this FWTrack object. So remember to call it after all the files are read! 
- """ - - cdef: - int32_t i - bytes c - set chrnames - - self.total = 0 - - chrnames = self.get_chr_names() - - for c in chrnames: - self.__locations[c].resize((self.__size[c]), refcheck=False) - self.__locations[c].sort( order=['l', 'r'] ) - self.total += self.__size[c] - - self.__sorted = True - self.average_template_length = ( self.length ) / self.total - return - - cpdef get_locations_by_chr ( self, bytes chromosome ): - """Return a tuple of two lists of locations for certain chromosome. - - """ - if chromosome in self.__locations: - return self.__locations[chromosome] - else: - raise Exception("No such chromosome name (%s) in TrackI object!\n" % (chromosome)) - - cpdef set get_chr_names ( self ): - """Return all the chromosome names in this track object as a python set. - """ - return set(self.__locations.keys()) - - - cpdef void sort ( self ): - """Naive sorting for locations. - - """ - cdef: - uint32_t i - bytes c - set chrnames - - chrnames = self.get_chr_names() - - for c in chrnames: - #print "before", self.__locations[c][0:100] - self.__locations[c].sort( order=['l', 'r'] ) # sort by the leftmost location - #print "before", self.__locations[c][0:100] - self.__sorted = True - return - - cpdef dict count_fraglengths ( self ): - """Return a dictionary of the counts for sizes/fragment lengths of each pair. - - This function is for HMMRATAC. - """ - cdef: - np.ndarray[np.int32_t, ndim=1] sizes - np.int32_t s - np.ndarray locs - list chrnames - int i - #dict ret_dict - bytes k - - counter = Counter() - chrnames = list( self.get_chr_names() ) - for i in range( len(chrnames) ): - locs = self.__locations[ chrnames[i] ] - sizes = locs['r'] - locs['l'] - for s in sizes: - counter[ s ] += 1 - return dict(counter) - - cpdef np.ndarray fraglengths ( self ): - """Return the sizes/fragment lengths of each pair. - - This function is for HMMRATAC EM training. 
- """ - cdef: - np.ndarray[np.int32_t, ndim=1] sizes - np.ndarray locs - list chrnames - int i - - chrnames = list( self.get_chr_names() ) - locs = self.__locations[ chrnames[ 0 ] ] - sizes = locs['r'] - locs['l'] - for i in range( 1, len(chrnames) ): - locs = self.__locations[ chrnames[i] ] - sizes = np.concatenate( ( sizes, locs['r'] - locs['l'] ) ) - return sizes - - @cython.boundscheck(False) # do not check that np indices are valid - cpdef void filter_dup ( self, int32_t maxnum=-1): - """Filter the duplicated reads. - - Run it right after you add all data into this object. - """ - cdef: - int32_t i_chrom, n, start, end - int32_t loc_start, loc_end, current_loc_start, current_loc_end - uint64_t i - bytes k - np.ndarray locs - uint64_t locs_size - set chrnames - np.ndarray selected_idx - - if maxnum < 0: return # condition to return if not filtering - - if not self.__sorted: self.sort() - - self.total = 0 - #self.length = 0 - self.average_template_length = 0.0 - - chrnames = self.get_chr_names() - - for k in chrnames: # for each chromosome - locs = self.__locations[k] - locs_size = locs.shape[0] - if locs_size == 1: - # do nothing and continue - continue - # discard duplicate reads and make a new __locations[k] - # initialize boolean array as all TRUE, or all being kept - selected_idx = np.ones( locs_size, dtype=bool) - # get the first loc - ( current_loc_start, current_loc_end ) = locs[0] - i = 1 # index of new_locs - n = 1 # the number of tags in the current genomic location - for i in range(1, locs_size): - ( loc_start, loc_end ) = locs[i] - if loc_start != current_loc_start or loc_end != current_loc_end: - # not the same, update currnet_loc_start/end/l, reset n - current_loc_start = loc_start - current_loc_end = loc_end - n = 1 - continue - else: - # both ends are the same, add 1 to duplicate number n - n += 1 - if n > maxnum: - # change the flag to False - selected_idx[ i ] = False - # subtract current_loc_l from self.length - self.length -= current_loc_end 
- current_loc_start - self.__locations[k] = locs[ selected_idx ] - self.__size[k] = self.__locations[k].shape[0] - self.total += self.__size[k] - # free memory? - # I know I should shrink it to 0 size directly, - # however, on Mac OSX, it seems directly assigning 0 - # doesn't do a thing. - selected_idx.resize( self.buffer_size, refcheck=False) - selected_idx.resize( 0, refcheck=False) - self.average_template_length = self.length / self.total - return - - cpdef void sample_percent (self, float32_t percent, int32_t seed = -1): - """Sample the tags for a given percentage. - - Warning: the current object is changed! If a new PETrackI is wanted, use sample_percent_copy instead. - """ - cdef: - uint32_t num, i_chrom # num: number of reads allowed on a certain chromosome - bytes k - set chrnames - object rs, rs_shuffle - - self.total = 0 - self.length = 0 - self.average_template_length = 0.0 - - chrnames = self.get_chr_names() - - if seed >= 0: - info(f"# A random seed {seed} has been used") - rs = np.random.RandomState(np.random.MT19937(np.random.SeedSequence(seed))) - rs_shuffle = rs.shuffle - else: - rs_shuffle = np.random.shuffle - - for k in sorted(chrnames): - # for each chromosome. - # This loop body is too big, I may need to split code later... - - num = round(self.__locations[k].shape[0] * percent, 5 ) - rs_shuffle( self.__locations[k] ) - self.__locations[k].resize( num, refcheck = False ) - self.__locations[k].sort( order = ['l', 'r'] ) # sort by leftmost positions - self.__size[k] = self.__locations[k].shape[0] - self.length += ( self.__locations[k]['r'] - self.__locations[k]['l'] ).sum() - self.total += self.__size[k] - self.average_template_length = ( self.length )/ self.total - return - - cpdef object sample_percent_copy (self, float32_t percent, int32_t seed = -1): - """Sample the tags for a given percentage. 
Return a new PETrackI object - - """ - cdef: - uint32_t num, i_chrom # num: number of reads allowed on a certain chromosome - bytes k - set chrnames - object ret_petrackI, rs, rs_shuffle - np.ndarray l - - ret_petrackI = PETrackI( anno=self.annotation, buffer_size = self.buffer_size) - chrnames = self.get_chr_names() - - if seed >= 0: - info(f"# A random seed {seed} has been used in the sampling function") - rs = np.random.default_rng(seed) - else: - rs = np.random.default_rng() - - rs_shuffle = rs.shuffle - for k in sorted(chrnames): # chrnames need to be sorted otherwise we can't assure reproducibility - # for each chromosome. - # This loop body is too big, I may need to split code later... - l = np.copy( self.__locations[k] ) - num = round(l.shape[0] * percent, 5 ) - rs_shuffle( l ) - l.resize( num, refcheck = False ) - l.sort( order = ['l', 'r'] ) # sort by leftmost positions - ret_petrackI.__locations[ k ] = l - ret_petrackI.__size[ k ] = l.shape[0] - ret_petrackI.length += ( l['r'] - l['l'] ).sum() - ret_petrackI.total += ret_petrackI.__size[ k ] - ret_petrackI.average_template_length = ( ret_petrackI.length )/ ret_petrackI.total - ret_petrackI.set_rlengths( self.get_rlengths() ) - return ret_petrackI - - cpdef void sample_num (self, uint64_t samplesize, int32_t seed = -1): - """Sample the tags for a given number. - - Warning: the current object is changed! - """ - cdef: - float32_t percent - percent = (samplesize)/self.total - self.sample_percent ( percent, seed ) - return - - cpdef object sample_num_copy (self, uint64_t samplesize, int32_t seed = -1): - """Sample the tags for a given number. - - Warning: the current object is changed! - """ - cdef: - float32_t percent - percent = (samplesize)/self.total - return self.sample_percent_copy ( percent, seed ) - - cpdef void print_to_bed (self, fhd=None): - """Output to BEDPE format files. If fhd is given, write to a - file, otherwise, output to standard output. 
- - """ - cdef: - int32_t i, i_chrom, s, e - bytes k - set chrnames - - - if not fhd: - fhd = sys.stdout - assert isinstance(fhd, io.IOBase) - - chrnames = self.get_chr_names() - - for k in chrnames: - # for each chromosome. - # This loop body is too big, I may need to split code later... - - locs = self.__locations[k] - - for i in range(locs.shape[0]): - s, e = locs[ i ] - fhd.write("%s\t%d\t%d\n" % (k.decode(), s, e)) - return - - cpdef list pileup_a_chromosome ( self, bytes chrom, list scale_factor_s, float32_t baseline_value = 0.0 ): - """pileup a certain chromosome, return [p,v] (end position and value) list. - - scale_factor_s : linearly scale the pileup value applied to each d in ds. The list should have the same length as ds. - baseline_value : a value to be filled for missing values, and will be the minimum pileup. - """ - cdef: - list tmp_pileup, prev_pileup - float32_t scale_factor - - prev_pileup = None - - for i in range(len(scale_factor_s)): - scale_factor = scale_factor_s[i] - - tmp_pileup = quick_pileup ( np.sort(self.__locations[chrom]['l']), np.sort(self.__locations[chrom]['r']), scale_factor, baseline_value ) # Can't directly pass partial nparray there since that will mess up with pointer calculation. - - if prev_pileup: - prev_pileup = over_two_pv_array ( prev_pileup, tmp_pileup, func="max" ) - else: - prev_pileup = tmp_pileup - - return prev_pileup - - cpdef list pileup_a_chromosome_c ( self, bytes chrom, list ds, list scale_factor_s, float32_t baseline_value = 0.0 ): - """pileup a certain chromosome, return [p,v] (end position and value) list. - - This function is for control track. Basically, here is a - simplified function from FixWidthTrack. We pretend the PE is - SE data and left read is on plus strand and right read is on - minus strand. - - ds : tag will be extended to this value to 3' direction, - unless directional is False. Can contain multiple extension - values. Final pileup will the maximum. 
- scale_factor_s : linearly scale the pileup value applied to each d in ds. The list should have the same length as ds. - baseline_value : a value to be filled for missing values, and will be the minimum pileup. - """ - cdef: - list tmp_pileup, prev_pileup - float32_t scale_factor - int64_t d, five_shift, three_shift - int64_t rlength = self.get_rlengths()[chrom] - - if not self.__sorted: self.sort() - - assert len(ds) == len(scale_factor_s), "ds and scale_factor_s must have the same length!" - - prev_pileup = None - - for i in range(len(scale_factor_s)): - d = ds[i] - scale_factor = scale_factor_s[i] - five_shift = d//2 - three_shift= d//2 - - tmp_pileup = se_all_in_one_pileup ( self.__locations[chrom]['l'], self.__locations[chrom]['r'], five_shift, three_shift, rlength, scale_factor, baseline_value ) - - if prev_pileup: - prev_pileup = over_two_pv_array ( prev_pileup, tmp_pileup, func="max" ) - else: - prev_pileup = tmp_pileup - - return prev_pileup - - - cpdef object pileup_bdg ( self, list scale_factor_s, float32_t baseline_value = 0.0 ): - """pileup all chromosomes, and return a bedGraphTrackI object. - - scale_factor_s : linearly scale the pileup value applied to each d in ds. The list should have the same length as ds. - baseline_value : a value to be filled for missing values, and will be the minimum pileup. - """ - cdef: - list tmp_pileup, prev_pileup - float32_t scale_factor - bytes chrom - object bdg - int32_t prev_s - - #info(f"start to pileup") - bdg = bedGraphTrackI( baseline_value = baseline_value ) - - for chrom in sorted(self.get_chr_names()): - prev_pileup = None - for i in range(len(scale_factor_s)): - scale_factor = scale_factor_s[i] - - tmp_pileup = quick_pileup ( np.sort(self.__locations[chrom]['l']), np.sort(self.__locations[chrom]['r']), scale_factor, baseline_value ) # Can't directly pass partial nparray there since that will mess up with pointer calculation. 
- - if prev_pileup: - prev_pileup = over_two_pv_array ( prev_pileup, tmp_pileup, func="max" ) - else: - prev_pileup = tmp_pileup - # save to bedGraph - bdg.add_chrom_data( chrom, pyarray('i', prev_pileup[0]), pyarray('f', prev_pileup[1]) ) - return bdg - - cpdef list pileup_bdg_hmmr ( self, list mapping, float32_t baseline_value = 0.0 ): - """pileup all chromosomes, and return a list of four bedGraphTrackI objects: short, mono, di, and tri nucleosomal signals. - - The idea is that for each fragment length, we generate four bdg using four weights from four distributions. Then we add all sets of four bdgs together. - - Way to generate 'mapping', based on HMMR EM means and stddevs: - fl_dict = petrack.count_fraglengths() - fl_list = list(fl_dict.keys()) - fl_list.sort() - weight_mapping = generate_weight_mapping( fl_list, em_means, em_stddevs ) - """ - cdef: - list ret_pileup - set chroms - bytes chrom - int i - - ret_pileup = [] - for i in range( len(mapping) ): ret_pileup.append( {} ) - chroms = self.get_chr_names() - for i in range( len(mapping) ): - for chrom in sorted(chroms): - ret_pileup[ i ][ chrom ] = pileup_from_LR_hmmratac( self.__locations[ chrom ], mapping[ i ] ) - return ret_pileup - diff --git a/MACS3/Signal/ScoreTrack.pyx b/MACS3/Signal/ScoreTrack.pyx index 0426b18a..1ef3d31b 100644 --- a/MACS3/Signal/ScoreTrack.pyx +++ b/MACS3/Signal/ScoreTrack.pyx @@ -1,6 +1,6 @@ # cython: language_level=3 # cython: profile=True -# Time-stamp: <2024-05-14 12:06:19 Tao Liu> +# Time-stamp: <2024-10-10 16:45:13 Tao Liu> """Module for Feature IO classes. 
@@ -20,7 +20,7 @@ from functools import reduce # ------------------------------------ from MACS3.Signal.SignalProcessing import maxima, enforce_valleys, enforce_peakyness from MACS3.Signal.Prob import poisson_cdf -from MACS3.IO.PeakIO import PeakIO, BroadPeakIO, parse_peakname +from MACS3.IO.PeakIO import PeakIO, BroadPeakIO # ------------------------------------ # Other modules diff --git a/setup.py b/setup.py index a36e558b..65d78062 100644 --- a/setup.py +++ b/setup.py @@ -120,7 +120,7 @@ def main(): include_dirs=numpy_include_dir, extra_compile_args=extra_c_args), Extension("MACS3.Signal.PairedEndTrack", - ["MACS3/Signal/PairedEndTrack.pyx"], + ["MACS3/Signal/PairedEndTrack.py"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args), Extension("MACS3.Signal.BedGraph", @@ -188,7 +188,7 @@ def main(): ["MACS3/IO/Parser.py"], extra_compile_args=extra_c_args), Extension("MACS3.IO.PeakIO", - ["MACS3/IO/PeakIO.pyx"], + ["MACS3/IO/PeakIO.py"], extra_compile_args=extra_c_args), Extension("MACS3.IO.BedGraphIO", ["MACS3/IO/BedGraphIO.py"],