From 978febcf9cc13eaebdbc9e9366855dff282c19a0 Mon Sep 17 00:00:00 2001 From: Tao Liu Date: Sun, 6 Dec 2020 23:25:39 -0500 Subject: [PATCH 1/6] use buffer to read input file every 10M bytes --- MACS3/IO/Parser.pyx | 205 +++++++++++++++++++++-------------- MACS3/Utilities/Constants.py | 3 +- 2 files changed, 128 insertions(+), 80 deletions(-) diff --git a/MACS3/IO/Parser.pyx b/MACS3/IO/Parser.pyx index 5dfc4f59..5e844659 100644 --- a/MACS3/IO/Parser.pyx +++ b/MACS3/IO/Parser.pyx @@ -1,7 +1,7 @@ # cython: language_level=3 # cython: profile=True # cython: linetrace=True -# Time-stamp: <2020-12-03 11:48:43 Tao Liu> +# Time-stamp: <2020-12-06 23:24:53 Tao Liu> """Module for all MACS Parser classes for input. @@ -87,7 +87,7 @@ cpdef guess_parser ( fname, int64_t buffer_size = 100000 ): t_parser.close() raise Exception( "Can't detect format!" ) -cdef tuple __fw_binary_parse_le ( const unsigned char * data ): +cdef tuple __bam_fw_binary_parse_le ( const unsigned char * data ): """Parse a BAM SE entry in little endian system """ cdef: @@ -135,7 +135,7 @@ cdef tuple __fw_binary_parse_le ( const unsigned char * data ): return ( thisref, thisstart, thisstrand ) -cdef tuple __fw_binary_parse_be ( const unsigned char * data ): +cdef tuple __bam_fw_binary_parse_be ( const unsigned char * data ): """Big endian version. We need byte swap. """ cdef: @@ -191,7 +191,7 @@ cdef tuple __fw_binary_parse_be ( const unsigned char * data ): return ( thisref, thisstart, thisstrand ) -cdef tuple __pe_binary_parse_le (const unsigned char * data): +cdef tuple __bampe_pe_binary_parse_le (const unsigned char * data): """Parse a BAMPE record in little-endian system. """ cdef: @@ -234,7 +234,7 @@ cdef tuple __pe_binary_parse_le (const unsigned char * data): return ( thisref, thisstart, thistlen ) -cdef tuple __pe_binary_parse_be (const unsigned char * data): +cdef tuple __bampe_pe_binary_parse_be (const unsigned char * data): """Parse a BAMPE record in big-endian system. And we need byte swap. """ cdef: @@ -281,11 +281,11 @@ cdef tuple __pe_binary_parse_be (const unsigned char * data): # choose a parser according to endian if is_le: - se_entry_parser = __fw_binary_parse_le - pe_entry_parser = __pe_binary_parse_le + bam_se_entry_parser = __bam_fw_binary_parse_le + bampe_pe_entry_parser = __bampe_pe_binary_parse_le else: - se_entry_parser = __fw_binary_parse_be - pe_entry_parser = __pe_binary_parse_be + bam_se_entry_parser = __bam_fw_binary_parse_be + bampe_pe_entry_parser = __bampe_pe_binary_parse_be # ------------------------------------ # Classes @@ -343,7 +343,7 @@ cdef class GenericParser: f.close() if self.gzipped: # open with gzip.open, then wrap it with BufferedReader! - self.fhd = io.BufferedReader( gzip.open( filename, mode='rb' ), buffer_size = 1048576 ) # buffersize set to 1M + self.fhd = io.BufferedReader( gzip.open( filename, mode='rb' ), buffer_size = READ_BUFFER_SIZE ) # buffersize set to 10M else: self.fhd = io.open( filename, mode='rb' ) # binary mode! I don't expect unicode here! self.__skip_first_commentlines() @@ -403,26 +403,36 @@ cdef class GenericParser: * BAMParser for binary BAM format should have a different one. """ cdef: - int64_t i, m, fpos, strand + int64_t i, fpos, strand bytes chromosome + bytes tmp fwtrack = FWTrack( buffer_size = self.buffer_size ) i = 0 - m = 0 - for thisline in self.fhd: - ( chromosome, fpos, strand ) = self.__fw_parse_line( thisline ) - i+=1 - if fpos < 0 or not chromosome: - # normally __fw_parse_line will return -1 if the line - # contains no successful alignment. 
- continue - if i % 1000000 == 0: - info( " %d" % i ) - fwtrack.add_loc( chromosome, fpos, strand ) - - # close fwtrack and sort - # fwtrack.finalize() - # this is the problematic part. If fwtrack is finalized, then it's impossible to increase the length of it in a step of buffer_size for multiple input files. + tmp = b"" + while True: + # for each block of input + tmp += self.fhd.read( READ_BUFFER_SIZE ) + if not tmp: + break + lines = tmp.split(b"\n") + tmp = lines[ -1 ] + for thisline in lines[ :-1 ]: + ( chromosome, fpos, strand ) = self.__fw_parse_line( thisline ) + if fpos < 0 or not chromosome: + # normally __fw_parse_line will return -1 if the line + # contains no successful alignment. + continue + i += 1 + if i % 1000000 == 0: + info( " %d reads parsed" % i ) + fwtrack.add_loc( chromosome, fpos, strand ) + # last one + if tmp: + ( chromosome, fpos, strand ) = self.__fw_parse_line( tmp ) + if fpos >= 0 and chromosome: + i += 1 + fwtrack.add_loc( chromosome, fpos, strand ) # close file stream. self.close() return fwtrack @@ -431,22 +441,37 @@ cdef class GenericParser: """Add more records to an existing FWTrack object. """ + cdef: + int64_t i, fpos, strand + bytes chromosome + bytes tmp i = 0 - m = 0 - for thisline in self.fhd: - ( chromosome, fpos, strand ) = self.__fw_parse_line( thisline ) - i+=1 - if fpos < 0 or not chromosome: - # normally __fw_parse_line will return -1 if the line - # contains no successful alignment. - continue - if i % 1000000 == 0: - info( " %d" % i ) - fwtrack.add_loc( chromosome, fpos, strand ) - - # close fwtrack and sort - #fwtrack.finalize() - # this is the problematic part. If fwtrack is finalized, then it's impossible to increase the length of it in a step of buffer_size for multiple input files. + tmp = "b" + while True: + # for each block of input + tmp += self.fhd.read( READ_BUFFER_SIZE ) + if not tmp: + break + lines = tmp.split(b"\n") + tmp = lines[ -1 ] + for thisline in lines[ :-1 ]: + ( chromosome, fpos, strand ) = self.__fw_parse_line( thisline ) + if fpos < 0 or not chromosome: + # normally __fw_parse_line will return -1 if the line + # contains no successful alignment. + continue + i += 1 + if i % 1000000 == 0: + info( " %d reads parsed" % i ) + fwtrack.add_loc( chromosome, fpos, strand ) + + # last one + if tmp: + ( chromosome, fpos, strand ) = self.__fw_parse_line( tmp ) + if fpos >= 0 and chromosome: + i += 1 + fwtrack.add_loc( chromosome, fpos, strand ) + # close file stream. 
self.close() return fwtrack @@ -612,27 +637,39 @@ cdef class BEDPEParser(GenericParser): int32_t right_pos int64_t i = 0 # number of fragments int64_t m = 0 # sum of fragment lengths + bytes tmp = b"" petrack = PETrackI( buffer_size = self.buffer_size ) add_loc = petrack.add_loc - for thisline in self.fhd: + while True: + # for each block of input + tmp += self.fhd.read( READ_BUFFER_SIZE ) + if not tmp: + break + lines = tmp.split(b"\n") + tmp = lines[ -1 ] + for thisline in lines[ :-1 ]: + ( chromosome, left_pos, right_pos ) = self.__pe_parse_line( thisline ) + if left_pos < 0 or not chromosome: + continue + assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline + m += right_pos - left_pos + i += 1 + if i % 1000000 == 0: + info( " %d fragments parsed" % i ) + add_loc( chromosome, left_pos, right_pos ) + # last one + if tmp: ( chromosome, left_pos, right_pos ) = self.__pe_parse_line( thisline ) - if left_pos < 0 or not chromosome: - continue - - assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline - m += right_pos - left_pos - i += 1 - - if i % 1000000 == 0: - info( " %d" % i ) - - add_loc( chromosome, left_pos, right_pos ) - + if left_pos >= 0 and chromosome: + assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline + i += 1 + m += right_pos - left_pos + add_loc( chromosome, left_pos, right_pos ) + self.d = ( m ) / i self.n = i - assert self.d >= 0, "Something went wrong (mean fragment size was negative)" self.close() @@ -648,29 +685,39 @@ cdef class BEDPEParser(GenericParser): int32_t right_pos int64_t i = 0 # number of fragments int64_t m = 0 # sum of fragment lengths + bytes tmp = b"" add_loc = petrack.add_loc - - for thisline in self.fhd: + while True: + # for each block of input + tmp += self.fhd.read( READ_BUFFER_SIZE ) + if not tmp: + break + lines = tmp.split(b"\n") + tmp = lines[ -1 ] + for thisline in lines[ :-1 ]: + ( chromosome, left_pos, right_pos ) = self.__pe_parse_line( thisline ) + if left_pos < 0 or not chromosome: + continue + assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline + m += right_pos - left_pos + i += 1 + if i % 1000000 == 0: + info( " %d fragments parsed" % i ) + add_loc( chromosome, left_pos, right_pos ) + # last one + if tmp: ( chromosome, left_pos, right_pos ) = self.__pe_parse_line( thisline ) - - if left_pos < 0 or not chromosome: - continue - - assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline - m += right_pos - left_pos - i += 1 - - if i % 1000000 == 0: - info( " %d" % i ) - - add_loc( chromosome, left_pos, right_pos ) + if left_pos >= 0 and chromosome: + assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline + i += 1 + m += right_pos - left_pos + add_loc( chromosome, left_pos, right_pos ) self.d = ( self.d * self.n + m ) / ( self.n + i ) self.n += i assert self.d >= 0, "Something went wrong (mean fragment size was negative)" - self.close() petrack.set_rlengths( {"DUMMYCHROM":0} ) return petrack @@ -1055,7 +1102,7 @@ cdef class BAMParser( GenericParser ): f.close() if self.gzipped: # open with gzip.open, then wrap it with BufferedReader! 
- self.fhd = io.BufferedReader( gzip.open( filename, mode='rb' ), buffer_size = 1048576) # buffersize set to 1M + self.fhd = io.BufferedReader( gzip.open( filename, mode='rb' ), buffer_size = READ_BUFFER_SIZE) # buffersize set to 1M else: self.fhd = io.open( filename, mode='rb' ) # binary mode! I don't expect unicode here! @@ -1174,12 +1221,12 @@ cdef class BAMParser( GenericParser ): entrylength = unpack( " Date: Sun, 6 Dec 2020 23:34:32 -0500 Subject: [PATCH 2/6] fix a typo --- MACS3/IO/Parser.pyx | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/MACS3/IO/Parser.pyx b/MACS3/IO/Parser.pyx index 5e844659..62c3f846 100644 --- a/MACS3/IO/Parser.pyx +++ b/MACS3/IO/Parser.pyx @@ -1,7 +1,7 @@ # cython: language_level=3 # cython: profile=True # cython: linetrace=True -# Time-stamp: <2020-12-06 23:24:53 Tao Liu> +# Time-stamp: <2020-12-06 23:32:28 Tao Liu> """Module for all MACS Parser classes for input. @@ -405,11 +405,10 @@ cdef class GenericParser: cdef: int64_t i, fpos, strand bytes chromosome - bytes tmp + bytes tmp = b"" fwtrack = FWTrack( buffer_size = self.buffer_size ) i = 0 - tmp = b"" while True: # for each block of input tmp += self.fhd.read( READ_BUFFER_SIZE ) @@ -444,9 +443,8 @@ cdef class GenericParser: cdef: int64_t i, fpos, strand bytes chromosome - bytes tmp + bytes tmp = b"" i = 0 - tmp = "b" while True: # for each block of input tmp += self.fhd.read( READ_BUFFER_SIZE ) From c85c3fd614acff3599e6ae3b82cb4c37d5faadf7 Mon Sep 17 00:00:00 2001 From: Tao Liu Date: Sun, 6 Dec 2020 23:55:41 -0500 Subject: [PATCH 3/6] rename pvalue_stat to pscore_stat --- MACS3/Signal/CallPeakUnit.pyx | 42 +++++++++++++++++------------------ 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/MACS3/Signal/CallPeakUnit.pyx b/MACS3/Signal/CallPeakUnit.pyx index 8a976a56..4c9ebcde 100644 --- a/MACS3/Signal/CallPeakUnit.pyx +++ b/MACS3/Signal/CallPeakUnit.pyx @@ -1,7 +1,7 @@ # cython: language_level=3 # cython: profile=True # cython: linetrace=True -# Time-stamp: <2020-12-03 16:07:01 Tao Liu> +# Time-stamp: <2020-12-06 23:54:17 Tao Liu> """Module for Calculate Scores. @@ -617,7 +617,7 @@ cdef class CallerFromAlignments: cdef: bytes chrom np.ndarray pos_array, treat_array, ctrl_array, score_array - dict pvalue_stat + dict pscore_stat int64_t n, pre_p, length, pre_l, l, i, j float32_t this_v, pre_v, v, q, pre_q int64_t N, k, this_l @@ -629,7 +629,7 @@ cdef class CallerFromAlignments: logging.debug ( "Start to calculate pvalue stat..." 
) - pvalue_stat = {} #dict() + pscore_stat = {} #dict() for i in range( len( self.chromosomes ) ): chrom = self.chromosomes[ i ] pre_p = 0 @@ -644,16 +644,16 @@ cdef class CallerFromAlignments: for j in range(pos_array.shape[0]): this_v = get_pscore( ((treat_value_ptr[0]), ctrl_value_ptr[0] ) ) this_l = pos_ptr[0] - pre_p - if this_v in pvalue_stat: - pvalue_stat[ this_v ] += this_l + if this_v in pscore_stat: + pscore_stat[ this_v ] += this_l else: - pvalue_stat[ this_v ] = this_l + pscore_stat[ this_v ] = this_l pre_p = pos_ptr[0] pos_ptr += 1 treat_value_ptr += 1 ctrl_value_ptr += 1 - N = sum(pvalue_stat.values()) # total length + N = sum(pscore_stat.values()) # total length k = 1 # rank f = -log10(N) pre_v = -2147483647 @@ -661,10 +661,10 @@ cdef class CallerFromAlignments: pre_q = 2147483647 # save the previous q-value self.pqtable = Float32to32Map( for_int = False ) - unique_values = sorted(list(pvalue_stat.keys()), reverse=True) + unique_values = sorted(list(pscore_stat.keys()), reverse=True) for i in range(len(unique_values)): v = unique_values[i] - l = pvalue_stat[v] + l = pscore_stat[v] q = v + (log10(k) + f) if q > pre_q: q = pre_q @@ -689,7 +689,7 @@ cdef class CallerFromAlignments: cdef: bytes chrom np.ndarray pos_array, treat_array, ctrl_array, score_array - dict pvalue_stat + dict pscore_stat int64_t n, pre_p, this_p, length, j, pre_l, l, i float32_t q, pre_q, this_t, this_c float32_t this_v, pre_v, v, cutoff @@ -714,8 +714,8 @@ cdef class CallerFromAlignments: # tmplist contains a list of log pvalue cutoffs from 0.3 to 10 tmplist = [round(x,5) for x in sorted( list(np.arange(0.3, 10.0, 0.3)), reverse = True )] - pvalue_stat = {} #dict() - #print (list(pvalue_stat.keys())) + pscore_stat = {} #dict() + #print (list(pscore_stat.keys())) #print (list(self.pvalue_length.keys())) #print (list(self.pvalue_npeaks.keys())) for i in range( len( self.chromosomes ) ): @@ -777,25 +777,25 @@ cdef class CallerFromAlignments: this_p = pos_array_ptr[ 0 ] this_l = this_p - pre_p this_v = score_array_ptr[ 0 ] - if this_v in pvalue_stat: - pvalue_stat[ this_v ] += this_l + if this_v in pscore_stat: + pscore_stat[ this_v ] += this_l else: - pvalue_stat[ this_v ] = this_l + pscore_stat[ this_v ] = this_l pre_p = this_p #pos_array[ i ] pos_array_ptr += 1 score_array_ptr += 1 - #logging.debug ( "make pvalue_stat cost %.5f seconds" % t ) + #logging.debug ( "make pscore_stat cost %.5f seconds" % t ) # add all pvalue cutoffs from cutoff-analysis part. So that we # can get the corresponding qvalues for them. 
for cutoff in tmplist: - if cutoff not in pvalue_stat: - pvalue_stat[ cutoff ] = 0 + if cutoff not in pscore_stat: + pscore_stat[ cutoff ] = 0 nhval = 0 - N = sum(pvalue_stat.values()) # total length + N = sum(pscore_stat.values()) # total length k = 1 # rank f = -log10(N) pre_v = -2147483647 @@ -803,10 +803,10 @@ cdef class CallerFromAlignments: pre_q = 2147483647 # save the previous q-value self.pqtable = Float32to32Map( for_int = False ) #{} - unique_values = sorted(list(pvalue_stat.keys()), reverse=True) #sorted(unique_values,reverse=True) + unique_values = sorted(list(pscore_stat.keys()), reverse=True) #sorted(unique_values,reverse=True) for i in range(len(unique_values)): v = unique_values[i] - l = pvalue_stat[v] + l = pscore_stat[v] q = v + (log10(k) + f) if q > pre_q: q = pre_q From cdc9b7ec04f75b9e3e801097aadd822dcee97e87 Mon Sep 17 00:00:00 2001 From: Tao Liu Date: Sun, 6 Dec 2020 23:55:51 -0500 Subject: [PATCH 4/6] update README --- README.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 84f721a2..0c008b60 100644 --- a/README.md +++ b/README.md @@ -39,16 +39,18 @@ add more new features in the future.** ### 3.0.0a2 * Features - 1) Speed/memory optimization, including using the cykhash to - replace python dictionary + 1) Speed/memory optimization. Use the cykhash to replace python + dictionary. Use buffer (10MB) to read and parse input file (not + available for BAM file parser). And many optimization tweaks. - 2) Code cleanup + 2) Code cleanup. Reorganize source codes. - 3) Unit testing + 3) Unit testing. 4) R wrappers for MACS -- MACSr - 5) Switching to Github Action for CI, support multi-arch testing + 5) Switch to Github Action for CI, support multi-arch testing + including x64, armv7, aarch64, s390x and ppc64le. 6) MACS tag-shifting model has been refined. Now it will use a naive peak calling approach to find ALL possible paired peaks at + From 5f21c5055c8209888ae37d2dc2a6885565452394 Mon Sep 17 00:00:00 2001 From: Tao Liu Date: Mon, 7 Dec 2020 00:00:55 -0500 Subject: [PATCH 5/6] update documents --- ChangeLog | 27 +++++++++++++++++++++++++++ README.md | 8 +++++--- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/ChangeLog b/ChangeLog index 0f87e62e..d9c8659b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,30 @@ +2020-12-06 Tao Liu + MACS 3.0.0a2 + + * New features: + + 1) Speed/memory optimization. Use the cykhash to replace python + dictionary. Use buffer (10MB) to read and parse input file (not + available for BAM file parser). And many optimization tweaks. + + 2) Code cleanup. Reorganize source codes. + + 3) Unit testing. + + 4) R wrappers for MACS -- MACSr + + 5) Switch to Github Action for CI, support multi-arch testing + including x64, armv7, aarch64, s390x and ppc64le. + + 6) MACS tag-shifting model has been refined. Now it will use a + naive peak calling approach to find ALL possible paired peaks at + + and - strand, then use all of them to calculate the + cross-correlation. + + 7) Call variants in peak regions directly from BAM files. The + function was originally developed under code name SAPPER. Now + SAPPER has been merged into MACS. + 2020-04-11 Tao Liu MACS version 2.2.7.1 diff --git a/README.md b/README.md index 0c008b60..1a27a8ac 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ add more new features in the future.** ## Recent Changes for MACS (3.0.0a2) ### 3.0.0a2 - * Features + * New features 1) Speed/memory optimization. Use the cykhash to replace python dictionary. 
Use buffer (10MB) to read and parse input file (not @@ -58,8 +58,10 @@ add more new features in the future.** cross-correlation. 7) Call variants in peak regions directly from BAM files. The - function was originally developed under code name SAPPER. Now - SAPPER has been merged into MACS. + function was originally developed under code name SAPPER. Now + SAPPER has been merged into MACS. Also, `simde` has been added as + a submodule in order to support fermi-lite library under non-x64 + architectures. ## Install From 575c9a6952c87a0b81b3b21ed3bec29864037cc9 Mon Sep 17 00:00:00 2001 From: Tao Liu Date: Mon, 7 Dec 2020 00:11:20 -0500 Subject: [PATCH 6/6] update manifest.in and license --- LICENSE | 4 +++- MANIFEST.in | 4 +--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/LICENSE b/LICENSE index 66d7a751..02123625 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,8 @@ BSD 3-Clause License -Copyright (c) 2019, Tao Liu lab at Roswell Park Comprehensive Cancer Center and Xiaole Shirley Liu lab at Dana-Farber Cancer Institute, All rights reserved. +Copyright (c) 2020, Tao Liu lab at Roswell Park Comprehensive Cancer +Center and Xiaole Shirley Liu lab at Dana-Farber Cancer Institute, All +rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/MANIFEST.in b/MANIFEST.in index ca60b9d2..b5b15b64 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,7 +1,5 @@ include README.md LICENSE ChangeLog MANIFEST.in setup.py bin/macs3 recursive-include MACS3 *.py *.pyx *.pxd *.c *.h recursive-include docs *.md -exclude .gitignore .travis.yml prune test -prune .github -prune DOCKER +prune MACS3/fermi-lite/lib/test/
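
For reference, the buffered parsing pattern that PATCH 1/6 introduces into `GenericParser.build_fwtrack`, `append_fwtrack`, and the `BEDPEParser` methods boils down to: read the input in fixed-size blocks, split each block on newlines, keep the trailing partial line in `tmp`, and prepend it to the next block. Below is a minimal pure-Python sketch of that idea, not a transcription of the MACS3 code: `parse_line` and `track` are hypothetical stand-ins for `__fw_parse_line` and `FWTrack`/`PETrackI`, and the 10M constant mirrors the new `READ_BUFFER_SIZE` that the diffstat shows being added to `MACS3/Utilities/Constants.py`.

```python
# Minimal sketch of the block-buffered line-parsing loop (see assumptions above).
READ_BUFFER_SIZE = 10 * 1024 * 1024   # 10M block size, mirroring the new constant

def parse_in_blocks(fhd, parse_line, track):
    """Read `fhd` in READ_BUFFER_SIZE blocks, feed every complete line to
    `parse_line`, and carry the trailing partial line over to the next block."""
    i = 0
    tmp = b""
    while True:
        block = fhd.read(READ_BUFFER_SIZE)
        if not block and not tmp:
            break                        # stream exhausted, nothing left over
        tmp += block
        if block:
            lines = tmp.split(b"\n")
            tmp = lines.pop()            # last element may be an incomplete line
        else:
            lines, tmp = [tmp], b""      # final, possibly unterminated line
        for thisline in lines:
            chromosome, fpos, strand = parse_line(thisline)
            if fpos < 0 or not chromosome:
                continue                 # no usable alignment on this line
            i += 1
            if i % 1000000 == 0:
                print(" %d reads parsed" % i)
            track.add_loc(chromosome, fpos, strand)
    return i
```

The intent, as far as the patch shows, is to cut down on per-line Python-level reads, which is also why the gzip stream is wrapped in an `io.BufferedReader` sized to the same `READ_BUFFER_SIZE`.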
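
Similarly, the `pscore_stat` dictionary renamed in PATCH 3/6 maps each -log10(p) score to the total genomic length covered at that score, and the loops in `CallerFromAlignments` convert it into a p-score-to-q-score table using a Benjamini-Hochberg style correction done in -log10 space (`q = v + log10(k) + f` with `f = -log10(N)`). The following stand-alone sketch illustrates that conversion; the clamping against the previous q value and the rank update `k += l` fall outside the quoted hunks, so they are assumptions here rather than the exact MACS3 code.

```python
from math import log10

def make_pq_table(pscore_stat):
    """Sketch: convert {-log10(p): covered_length} into {-log10(p): -log10(q)}.
    With p = 10**-v, BH gives q = p * N / k, i.e. -log10(q) = v + log10(k) - log10(N),
    where N is the total covered length and k the cumulative rank."""
    pqtable = {}
    if not pscore_stat:
        return pqtable
    N = sum(pscore_stat.values())        # total covered length
    f = -log10(N)
    k = 1                                # rank of the best-scoring position
    pre_q = 2147483647                   # q must be non-increasing as v decreases
    for v in sorted(pscore_stat, reverse=True):
        l = pscore_stat[v]               # length covered at exactly this score
        q = v + (log10(k) + f)           # BH correction in -log10 space
        q = max(0.0, min(pre_q, q))      # assumed monotonicity/clamping step
        pqtable[v] = q
        pre_q = q
        k += l                           # assumed: advance rank by covered length
    return pqtable
```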