
Commit

Merge pull request #527 from macs3-project/release/macs3/3.0.0
Release/macs3/3.0.0b1
taoliu authored Oct 5, 2022
2 parents a8fc4d4 + 27e4cb3 commit f830bcb
Showing 6 changed files with 337 additions and 1,243 deletions.
42 changes: 16 additions & 26 deletions MACS3/Commands/hmmratac_cmd.py
@@ -1,4 +1,4 @@
# Time-stamp: <2022-09-23 01:21:12 taoliu>
# Time-stamp: <2022-10-04 16:58:30 Tao Liu>

"""Description: Main HMMR command
@@ -30,7 +30,7 @@
from MACS3.IO.Parser import BAMPEParser #BAMaccessor
from MACS3.Signal.HMMR_EM import HMMR_EM
from MACS3.Signal.HMMR_Signal_Processing import generate_weight_mapping, generate_digested_signals, extract_signals_from_regions
from MACS3.Signal.HMMR_HMM import hmm_training, hmm_predict, hmm_model_init
from MACS3.Signal.HMMR_HMM import hmm_training, hmm_predict, hmm_model_init, hmm_model_save
from MACS3.Signal.Region import Regions
from MACS3.Signal.BedGraph import bedGraphTrackI

@@ -63,7 +63,7 @@ def run( args ):
training_region_bedfile = os.path.join( options.outdir, options.name+"_training_regions.bed" )
training_datafile = os.path.join( options.outdir, options.name+"_training_data.txt" )
training_datalengthfile = os.path.join( options.outdir, options.name+"_training_lengths.txt" )
hmm_modelfile = os.path.join( options.outdir, options.name+"_model.txt" )
hmm_modelfile = os.path.join( options.outdir, options.name+"_model.json" )
open_state_bdgfile = os.path.join( options.outdir, options.name+"_open.bdg" )
nuc_state_bdgfile = os.path.join( options.outdir, options.name+"_nuc.bdg" )
bg_state_bdgfile = os.path.join( options.outdir, options.name+"_bg.bdg" )
@@ -217,7 +217,7 @@ def run( args ):
options.info( f"#4 Load Hidden Markov Model from given model file")
hmm_model, i_open_region, i_background_region, i_nucleosomal_region, options.hmm_binsize = hmm_model_init( options.hmm_file )
else:
options.info( f"#4 Train Hidden Markov Model with Gaussian Emission" )
options.info( f"#4 Train Hidden Markov Model with Multivariate Gaussian Emission" )

# extract signals within peak using the given binsize
options.info( f"# Extract signals in training regions with bin size of {options.hmm_binsize}")
@@ -238,32 +238,25 @@

options.info( f"# Use Baum-Welch algorithm to train the HMM")

hmm_model = hmm_training( training_data, training_data_lengths, random_seed = options.hmm_randomSeed )
hmm_model = hmm_training( training_data, training_data_lengths, random_seed = options.hmm_randomSeed, covar="full" )

options.info( f"# HMM converged: {hmm_model.monitor_.converged}")

# label hidden states
i_open_region = np.where(hmm_model.means_ == max(hmm_model.means_[0:3,0]))[0][0]
i_background_region = np.where(hmm_model.transmat_ == min(hmm_model.transmat_[0:3, i_open_region]))[0][0]
means_sum = np.sum( hmm_model.means_, axis=1 )

# first, the state with the highest overall emission is the open state
i_open_region = np.where( means_sum == max(means_sum) )[0][0]

# second, the state with lowest overall emission is the bg state
i_background_region = np.where( means_sum == min(means_sum) )[0][0]

# last one is the nuc state (note it may not be accurate though)
i_nucleosomal_region = list(set([0, 1, 2]) - set([i_open_region, i_background_region]))[0]

# write hmm into model file
options.info( f"# Write HMM parameters into {hmm_modelfile}")
f = open( hmm_modelfile, "w" )
f.write( str(hmm_model.startprob_)+"\n\n\n" )
f.write( str(hmm_model.transmat_ )+"\n\n\n" )
f.write( str(hmm_model.means_ )+"\n\n\n" )
f.write( str(hmm_model.covars_ )+"\n\n\n" )
f.write( str(hmm_model.n_features )+"\n\n\n" )
f.write( str(i_open_region )+"\n\n\n" )
f.write( str(i_background_region )+"\n\n\n" )
f.write( str(i_nucleosomal_region )+"\n\n\n" )
f.write( str(options.hmm_binsize )+"\n\n\n" )

#f.write( 'open region = state ' + str(i_open_region)+"\n" )
#f.write( 'nucleosomal region = state ' + str(i_nucleosomal_region)+"\n" )
#f.write( 'background region = state ' + str(i_background_region)+"\n" )
f.close()
options.info( f"# Write HMM parameters into JSON: {hmm_modelfile}")
hmm_model_save( hmm_modelfile, hmm_model, options.hmm_binsize, i_open_region, i_nucleosomal_region, i_background_region )

# Now tell users the parameters of the HMM
assignments = [ "", "", "" ]
@@ -289,7 +282,6 @@ def run( args ):
options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[1], hmm_model.means_[1]) )
options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[2], hmm_model.means_[2]) )

#options.info( f"# HMM Emissions (covar): {hmm_model.covars_}")

#############################################
# 5. Predict
@@ -438,8 +430,6 @@ def generate_states_path( candidate_bins, predicted_proba, binsize, i_open_regio
else:
start_pos = candidate_bins[l][1]-binsize



return ret_states_path

def save_accessible_regions( states_path, accessible_region_file, openregion_minlen ):
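
For reference, the state-labeling rule added above in this file (open = state with the highest summed emission means, background = lowest, nucleosomal = the remaining one) can be sketched in isolation as follows; the matrix below is a made-up 3-state x 4-track example, not output from this commit:

import numpy as np

# Hypothetical per-state emission means (3 states x 4 signal tracks).
means = np.array([[0.2, 0.1, 0.3, 0.2],
                  [2.5, 1.9, 3.1, 2.2],
                  [0.9, 0.7, 1.1, 0.8]])

means_sum = means.sum(axis=1)
i_open_region = int(np.argmax(means_sum))          # highest overall emission -> open state
i_background_region = int(np.argmin(means_sum))    # lowest overall emission -> background state
i_nucleosomal_region = ({0, 1, 2} - {i_open_region, i_background_region}).pop()  # remaining state -> nucleosomal
print(i_open_region, i_nucleosomal_region, i_background_region)   # -> 1 2 0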
4 changes: 2 additions & 2 deletions MACS3/Signal/BedGraph.pyx
@@ -1,6 +1,6 @@
# cython: language_level=3
# cython: profile=True
# Time-stamp: <2022-09-29 09:07:23 Tao Liu>
# Time-stamp: <2022-10-04 15:46:00 Tao Liu>

"""Module for BedGraph data class.
@@ -981,7 +981,7 @@ cdef class bedGraphTrackI:
int32_t pre_p, p1, p2, i
float32_t v1, v2
bytes chrom
object ret
list ret

assert isinstance(bdgTrack2,bedGraphTrackI), "not a bedGraphTrackI object"

70 changes: 33 additions & 37 deletions MACS3/Signal/HMMR_HMM.pyx
@@ -1,6 +1,6 @@
# cython: language_level=3
# cython: profile=True
# Time-stamp: <2022-09-30 16:13:22 Tao Liu>
# Time-stamp: <2022-10-04 15:14:15 Tao Liu>

"""Module description:
@@ -25,9 +25,6 @@ from cpython cimport bool
from hmmlearn import hmm
import json
# from hmmlearn cimport hmm
# from sklearn.cluster import KMeans
# from sklearn.cluster cimport KMeans


# ------------------------------------
# MACS3 modules
@@ -59,49 +56,48 @@ cdef inline float get_weighted_density( int x, float m, float v, w ):
# ------------------------------------


cpdef hmm_training( list training_data, list training_data_lengths, int n_states = 3, int random_seed = 12345, mixedmodel = False, covar = 'full' ):
cpdef hmm_training( list training_data, list training_data_lengths, int n_states = 3, int random_seed = 12345, covar = 'full' ):
# training data should be in array like format: X = np.array([[.5, .3, .1, .1], [.6, .4, 0, 0]])
# if we do not want init_prob to be updated through learning, set params = 'tmc' and init_prob = initial_state otherwise it will be overwritten
# according to base documentation, if init_prob not stated, it is set to be equally likely for any state (1/ # of components)
# if we have other known parameters, we should set these (ie: means_weights, covariance_type etc.)
rs = np.random.RandomState(np.random.MT19937(np.random.SeedSequence(random_seed)))
if mixedmodel:
hmm_model = hmm.GMMHMM( n_components = n_states, covariance_type = covar, random_state = rs, verbose = False )
else:
hmm_model = hmm.GaussianHMM( n_components= n_states, covariance_type = covar, random_state = rs, verbose = False )
hmm_model = hmm.GaussianHMM( n_components= n_states, covariance_type = covar, random_state = rs, verbose = False )
hmm_model = hmm_model.fit( training_data, training_data_lengths )

#hmm_model.transmat_ = np.around(hmm_model.transmat_, decimals = roundup)
#hmm_model.means_ = np.around(hmm_model.means_, decimals = roundup)
#hmm_model.covars_ = np.around(hmm_model.covars_, decimals = roundup)
assert hmm_model.n_features == 4
return hmm_model

cpdef hmm_predict( list signals, list lens, hmm_model ):
predictions = hmm_model.predict_proba( signals, lens )
#print( len(predictions), len(signals) )
#print( sum( lens ) )
#print( predictions, signals )
return predictions

cpdef hmm_model_init( model_file ):
f = open(model_file, 'r')
model_txt = f.read()
model_txt = model_txt.replace(' ', ' ').replace('[ ', '[').replace(' ', ',').replace('\n\n\n', ' $ ').replace('\n', '')
a,b,c,d,e,f,g,h,i = model_txt.split(" $ ")[0:9]
startprob = np.array(json.loads(a))
transmat = np.array(json.loads(b))
means = np.array(json.loads(c))
covars = np.array(json.loads(d))
n_features = int(e)
i_open_region = int(f)
i_background_region = int(g)
i_nucleosomal_region = int(h)
binsize = int(i)
cpdef void hmm_model_save( str model_file, object hmm_model, int hmm_binsize, int i_open_region, int i_nucleosomal_region, int i_background_region ):
if hmm_model.covariance_type == "diag":
covars = hmm_model.covars_.diagonal(axis1=1, axis2=2)
elif hmm_model.covariance_type == "full":
covars = hmm_model.covars_
else:
raise Exception(f"Unknown covariance type {hmm_model.covariance_type}")
with open( model_file, "w" ) as f:
json.dump( {"startprob":hmm_model.startprob_.tolist(),
"transmat":hmm_model.transmat_.tolist(),
"means":hmm_model.means_.tolist(),
"covars":covars.tolist(),
"covariance_type":hmm_model.covariance_type,
"n_features":int(hmm_model.n_features),
"i_open_region":int(i_open_region),
"i_background_region":int(i_background_region),
"i_nucleosomal_region":int(i_nucleosomal_region),
"hmm_binsize":int(hmm_binsize)}, f )

hmm_model = hmm.GaussianHMM( n_components=3, covariance_type='full' ) #change 3 to variable
hmm_model.startprob_ = startprob
hmm_model.transmat_ = transmat
hmm_model.means_ = means
hmm_model.covars_ = covars
hmm_model.n_features = n_features
return hmm_model, i_open_region, i_background_region, i_nucleosomal_region, binsize
cpdef list hmm_model_init( str model_file ):
with open( model_file ) as f:
m = json.load( f )
hmm_model = hmm.GaussianHMM( n_components=3, covariance_type=m["covariance_type"] )
hmm_model.startprob_ = np.array(m["startprob"])
hmm_model.transmat_ = np.array(m["transmat"])
hmm_model.means_ = np.array(m["means"])
hmm_model.covars_ = np.array(m["covars"])
hmm_model.covariance_type = m["covariance_type"]
hmm_model.n_features = m["n_features"]
return [ hmm_model, m["i_open_region"], m["i_background_region"], m["i_nucleosomal_region"], m["hmm_binsize"] ]
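
An illustrative round trip of the new JSON-based model persistence, assuming a compiled MACS3 install where these cpdef functions are importable; the toy data, file name, and state indices passed to hmm_model_save are placeholders:

import numpy as np
from MACS3.Signal.HMMR_HMM import hmm_training, hmm_model_save, hmm_model_init

# Toy training matrix: rows are bins, columns are the four digested signal tracks.
training_data = np.random.rand(200, 4).tolist()
training_lengths = [100, 100]            # two training regions of 100 bins each

model = hmm_training(training_data, training_lengths, random_seed=12345, covar="full")
# Argument order follows the signature above: file, model, binsize, open, nuc, bg indices.
hmm_model_save("toy_model.json", model, 10, 0, 1, 2)

# Reload later and skip Baum-Welch entirely, as the --model option does.
model2, i_open, i_bg, i_nuc, binsize = hmm_model_init("toy_model.json")
assert np.allclose(model.transmat_, model2.transmat_)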
14 changes: 5 additions & 9 deletions MACS3/Signal/HMMR_Signal_Processing.pyx
@@ -1,6 +1,6 @@
# cython: language_level=3
# cython: profile=True
# Time-stamp: <2022-09-29 14:58:12 Tao Liu>
# Time-stamp: <2022-10-04 16:14:23 Tao Liu>

"""Module description:
@@ -155,23 +155,19 @@ cpdef list extract_signals_from_regions( list signals, object regions, int binsi
assert len( extracted_data[0] ) == len( extracted_data[1] )
assert len( extracted_data[0] ) == len( extracted_data[2] )
assert len( extracted_data[0] ) == len( extracted_data[3] )
#nnn =len( extracted_len[0] )
#debug( f"{n} bins, {nn}, {nnn}" )
counter = 0
prev_c = extracted_len[0][0]
c = 0
for i in range( nn ):
ret_training_bins.append( extracted_positions[0][i] )
ret_training_data.append(
[ max( 0.0001, abs(round(extracted_data[0][i], 4))),
max( 0.0001, abs(round(extracted_data[1][i], 4))),
max( 0.0001, abs(round(extracted_data[2][i], 4))),
max( 0.0001, abs(round(extracted_data[3][i], 4))) ] )
[ max( 0.0001, extracted_data[0][i] ),
max( 0.0001, extracted_data[1][i] ),
max( 0.0001, extracted_data[2][i] ),
max( 0.0001, extracted_data[3][i] ) ] )
c = extracted_len[0][i]
#print(f"{extracted_positions[0][i]} {extracted_len[0][i]}")
if counter != 0 and c != prev_c:
ret_training_lengths.append( counter )
#print(f"### add a bin length {counter}")
counter = 0
prev_c = c
counter += 1
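
The change above only floors each extracted value at 0.0001 (instead of taking the absolute value and rounding), so every training value stays strictly positive for the Gaussian emissions; a minimal numpy illustration with made-up values:

import numpy as np

# Hypothetical per-bin values for the four signal tracks; zeros and negatives
# are floored to 0.0001, mirroring the max(0.0001, x) calls above.
raw = np.array([0.0, -0.3, 1.7, 0.02])
clamped = np.maximum(raw, 0.0001)
print(clamped)    # values: 0.0001, 0.0001, 1.7, 0.02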
12 changes: 4 additions & 8 deletions bin/macs3
@@ -1,5 +1,5 @@
#!/usr/bin/env python
# Time-stamp: <2022-09-17 11:06:32 Tao Liu>
# Time-stamp: <2022-10-04 13:48:15 Tao Liu>

"""Description: MACS v3 main executable.
@@ -805,14 +805,14 @@ def add_hmmratac_parser( subparsers ):
# help = "Size of the bins to split the genome into for Viterbi decoding. To save memory, the genome is split into WINDOW long bins and viterbi decoding occurs across each bin. Default = 25000000. Note: For machines with limited memory, it is recommended to reduce the size of the bins.",
# default = 25000000 )
group_hmm.add_argument( "--model", dest = "hmm_file", type = str, required = False,
help = "Binary model file (generated from previous HMMR run) to use instead of creating new one. When provided, HMM training will be skipped. Default: NA" )
help = "A JSON file generated from previous HMMRATAC run to use instead of creating new one. When provided, HMM training will be skipped. Default: NA" )
#group_hmm.add_argument( "--modelonly", dest = "hmm_modelonly", action = "store_true", default = False,
# help = "Stop the program after generating model. Use this option to generate HMM model ONLY, which can be later applied with `--model`. Default: False")

# group for peak calling arguments
group_call = argparser_hmmratac.add_argument_group( "Peak calling arguments" )
group_call.add_argument( "-c", "--prescan-cutoff", dest = "prescan_cutoff", type = float,
help = "The fold change cutoff for prescanning candidate regions in the whole dataset. The we will use HMM to predict states on these candidate regions. Higher the prescan cutoff, fewer regions will be considered. Must > 1. Default: 1.2",
help = "The fold change cutoff for prescanning candidate regions in the whole dataset. Then we will use HMM to predict states on these candidate regions. Higher the prescan cutoff, fewer regions will be considered. Must > 1. Default: 1.2",
default = 1.2 )

group_call.add_argument( "--minlen", dest = "openregion_minlen", type = int,
@@ -861,9 +861,5 @@ if __name__ == '__main__':
main()
except KeyboardInterrupt:
sys.stderr.write("User interrupted me! ;-) Bye!\n")
sys.exit(0)
except MemoryError:
sys.exit( "MemoryError occurred. If your input file has a large number of contigs/chromosomes, decrease the buffer_size value by setting --buffer-size option." )
sys.exit(1)
except ValueError:
sys.exit(1)
sys.stderr.write( "MemoryError occurred. If your input file has a large number of contigs/chromosomes, decrease the buffer_size value by setting --buffer-size option." )