
Commit

Merge pull request #527 from macs3-project/release/macs3/3.0.0
Release/macs3/3.0.0b1
taoliu authored Oct 5, 2022
2 parents a8fc4d4 + 27e4cb3 commit f830bcb
Showing 6 changed files with 337 additions and 1,243 deletions.
42 changes: 16 additions & 26 deletions MACS3/Commands/hmmratac_cmd.py
@@ -1,4 +1,4 @@
# Time-stamp: <2022-09-23 01:21:12 taoliu>
# Time-stamp: <2022-10-04 16:58:30 Tao Liu>

"""Description: Main HMMR command
@@ -30,7 +30,7 @@
from MACS3.IO.Parser import BAMPEParser #BAMaccessor
from MACS3.Signal.HMMR_EM import HMMR_EM
from MACS3.Signal.HMMR_Signal_Processing import generate_weight_mapping, generate_digested_signals, extract_signals_from_regions
from MACS3.Signal.HMMR_HMM import hmm_training, hmm_predict, hmm_model_init
from MACS3.Signal.HMMR_HMM import hmm_training, hmm_predict, hmm_model_init, hmm_model_save
from MACS3.Signal.Region import Regions
from MACS3.Signal.BedGraph import bedGraphTrackI

@@ -63,7 +63,7 @@ def run( args ):
training_region_bedfile = os.path.join( options.outdir, options.name+"_training_regions.bed" )
training_datafile = os.path.join( options.outdir, options.name+"_training_data.txt" )
training_datalengthfile = os.path.join( options.outdir, options.name+"_training_lengths.txt" )
hmm_modelfile = os.path.join( options.outdir, options.name+"_model.txt" )
hmm_modelfile = os.path.join( options.outdir, options.name+"_model.json" )
open_state_bdgfile = os.path.join( options.outdir, options.name+"_open.bdg" )
nuc_state_bdgfile = os.path.join( options.outdir, options.name+"_nuc.bdg" )
bg_state_bdgfile = os.path.join( options.outdir, options.name+"_bg.bdg" )
@@ -217,7 +217,7 @@ def run( args ):
options.info( f"#4 Load Hidden Markov Model from given model file")
hmm_model, i_open_region, i_background_region, i_nucleosomal_region, options.hmm_binsize = hmm_model_init( options.hmm_file )
else:
options.info( f"#4 Train Hidden Markov Model with Gaussian Emission" )
options.info( f"#4 Train Hidden Markov Model with Multivariate Gaussian Emission" )

# extract signals within peak using the given binsize
options.info( f"# Extract signals in training regions with bin size of {options.hmm_binsize}")
@@ -238,32 +238,25 @@

options.info( f"# Use Baum-Welch algorithm to train the HMM")

hmm_model = hmm_training( training_data, training_data_lengths, random_seed = options.hmm_randomSeed )
hmm_model = hmm_training( training_data, training_data_lengths, random_seed = options.hmm_randomSeed, covar="full" )

options.info( f"# HMM converged: {hmm_model.monitor_.converged}")

# label hidden states
i_open_region = np.where(hmm_model.means_ == max(hmm_model.means_[0:3,0]))[0][0]
i_background_region = np.where(hmm_model.transmat_ == min(hmm_model.transmat_[0:3, i_open_region]))[0][0]
means_sum = np.sum( hmm_model.means_, axis=1 )

# first, the state with the highest overall emission is the open state
i_open_region = np.where( means_sum == max(means_sum) )[0][0]

# second, the state with lowest overall emission is the bg state
i_background_region = np.where( means_sum == min(means_sum) )[0][0]

# last one is the nuc state (note it may not be accurate though)
i_nucleosomal_region = list(set([0, 1, 2]) - set([i_open_region, i_background_region]))[0]

# write hmm into model file
options.info( f"# Write HMM parameters into {hmm_modelfile}")
f = open( hmm_modelfile, "w" )
f.write( str(hmm_model.startprob_)+"\n\n\n" )
f.write( str(hmm_model.transmat_ )+"\n\n\n" )
f.write( str(hmm_model.means_ )+"\n\n\n" )
f.write( str(hmm_model.covars_ )+"\n\n\n" )
f.write( str(hmm_model.n_features )+"\n\n\n" )
f.write( str(i_open_region )+"\n\n\n" )
f.write( str(i_background_region )+"\n\n\n" )
f.write( str(i_nucleosomal_region )+"\n\n\n" )
f.write( str(options.hmm_binsize )+"\n\n\n" )

#f.write( 'open region = state ' + str(i_open_region)+"\n" )
#f.write( 'nucleosomal region = state ' + str(i_nucleosomal_region)+"\n" )
#f.write( 'background region = state ' + str(i_background_region)+"\n" )
f.close()
options.info( f"# Write HMM parameters into JSON: {hmm_modelfile}")
hmm_model_save( hmm_modelfile, hmm_model, options.hmm_binsize, i_open_region, i_nucleosomal_region, i_background_region )

# Now tell users the parameters of the HMM
assignments = [ "", "", "" ]
@@ -289,7 +282,6 @@ def run( args ):
options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[1], hmm_model.means_[1]) )
options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[2], hmm_model.means_[2]) )

#options.info( f"# HMM Emissions (covar): {hmm_model.covars_}")

#############################################
# 5. Predict
@@ -438,8 +430,6 @@ def generate_states_path( candidate_bins, predicted_proba, binsize, i_open_regio
else:
start_pos = candidate_bins[l][1]-binsize



return ret_states_path

def save_accessible_regions( states_path, accessible_region_file, openregion_minlen ):
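
For reference, the state-labeling rule added above in this file (open = state with the highest summed emission means, background = lowest, nucleosomal = the remaining one) can be sketched in isolation as follows; the matrix below is a made-up 3-state x 4-track example, not output from this commit:

import numpy as np

# Hypothetical per-state emission means (3 states x 4 signal tracks).
means = np.array([[0.2, 0.1, 0.3, 0.2],
                  [2.5, 1.9, 3.1, 2.2],
                  [0.9, 0.7, 1.1, 0.8]])

means_sum = means.sum(axis=1)
i_open_region = int(np.argmax(means_sum))          # highest overall emission -> open state
i_background_region = int(np.argmin(means_sum))    # lowest overall emission -> background state
i_nucleosomal_region = ({0, 1, 2} - {i_open_region, i_background_region}).pop()  # remaining state -> nucleosomal
print(i_open_region, i_nucleosomal_region, i_background_region)   # -> 1 2 0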
4 changes: 2 additions & 2 deletions MACS3/Signal/BedGraph.pyx
@@ -1,6 +1,6 @@
# cython: language_level=3
# cython: profile=True
# Time-stamp: <2022-09-29 09:07:23 Tao Liu>
# Time-stamp: <2022-10-04 15:46:00 Tao Liu>

"""Module for BedGraph data class.
@@ -981,7 +981,7 @@ cdef class bedGraphTrackI:
int32_t pre_p, p1, p2, i
float32_t v1, v2
bytes chrom
object ret
list ret

assert isinstance(bdgTrack2,bedGraphTrackI), "not a bedGraphTrackI object"

70 changes: 33 additions & 37 deletions MACS3/Signal/HMMR_HMM.pyx
@@ -1,6 +1,6 @@
# cython: language_level=3
# cython: profile=True
# Time-stamp: <2022-09-30 16:13:22 Tao Liu>
# Time-stamp: <2022-10-04 15:14:15 Tao Liu>

"""Module description:
@@ -25,9 +25,6 @@ from cpython cimport bool
from hmmlearn import hmm
import json
# from hmmlearn cimport hmm
# from sklearn.cluster import KMeans
# from sklearn.cluster cimport KMeans


# ------------------------------------
# MACS3 modules
@@ -59,49 +56,48 @@ cdef inline float get_weighted_density( int x, float m, float v, w ):
# ------------------------------------


cpdef hmm_training( list training_data, list training_data_lengths, int n_states = 3, int random_seed = 12345, mixedmodel = False, covar = 'full' ):
cpdef hmm_training( list training_data, list training_data_lengths, int n_states = 3, int random_seed = 12345, covar = 'full' ):
# training data should be in array like format: X = np.array([[.5, .3, .1, .1], [.6, .4, 0, 0]])
# if we do not want init_prob to be updated through learning, set params = 'tmc' and init_prob = initial_state otherwise it will be overwritten
# according to base documentation, if init_prob not stated, it is set to be equally likely for any state (1/ # of components)
# if we have other known parameters, we should set these (ie: means_weights, covariance_type etc.)
rs = np.random.RandomState(np.random.MT19937(np.random.SeedSequence(random_seed)))
if mixedmodel:
hmm_model = hmm.GMMHMM( n_components = n_states, covariance_type = covar, random_state = rs, verbose = False )
else:
hmm_model = hmm.GaussianHMM( n_components= n_states, covariance_type = covar, random_state = rs, verbose = False )
hmm_model = hmm.GaussianHMM( n_components= n_states, covariance_type = covar, random_state = rs, verbose = False )
hmm_model = hmm_model.fit( training_data, training_data_lengths )

#hmm_model.transmat_ = np.around(hmm_model.transmat_, decimals = roundup)
#hmm_model.means_ = np.around(hmm_model.means_, decimals = roundup)
#hmm_model.covars_ = np.around(hmm_model.covars_, decimals = roundup)
assert hmm_model.n_features == 4
return hmm_model

cpdef hmm_predict( list signals, list lens, hmm_model ):
predictions = hmm_model.predict_proba( signals, lens )
#print( len(predictions), len(signals) )
#print( sum( lens ) )
#print( predictions, signals )
return predictions

cpdef hmm_model_init( model_file ):
f = open(model_file, 'r')
model_txt = f.read()
model_txt = model_txt.replace(' ', ' ').replace('[ ', '[').replace(' ', ',').replace('\n\n\n', ' $ ').replace('\n', '')
a,b,c,d,e,f,g,h,i = model_txt.split(" $ ")[0:9]
startprob = np.array(json.loads(a))
transmat = np.array(json.loads(b))
means = np.array(json.loads(c))
covars = np.array(json.loads(d))
n_features = int(e)
i_open_region = int(f)
i_background_region = int(g)
i_nucleosomal_region = int(h)
binsize = int(i)
cpdef void hmm_model_save( str model_file, object hmm_model, int hmm_binsize, int i_open_region, int i_nucleosomal_region, int i_background_region ):
if hmm_model.covariance_type == "diag":
covars = hmm_model.covars_.diagonal(axis1=1, axis2=2)
elif hmm_model.covariance_type == "full":
covars = hmm_model.covars_
else:
raise Exception(f"Unknown covariance type {hmm_model.covariance_type}")
with open( model_file, "w" ) as f:
json.dump( {"startprob":hmm_model.startprob_.tolist(),
"transmat":hmm_model.transmat_.tolist(),
"means":hmm_model.means_.tolist(),
"covars":covars.tolist(),
"covariance_type":hmm_model.covariance_type,
"n_features":int(hmm_model.n_features),
"i_open_region":int(i_open_region),
"i_background_region":int(i_background_region),
"i_nucleosomal_region":int(i_nucleosomal_region),
"hmm_binsize":int(hmm_binsize)}, f )

hmm_model = hmm.GaussianHMM( n_components=3, covariance_type='full' ) #change 3 to variable
hmm_model.startprob_ = startprob
hmm_model.transmat_ = transmat
hmm_model.means_ = means
hmm_model.covars_ = covars
hmm_model.n_features = n_features
return hmm_model, i_open_region, i_background_region, i_nucleosomal_region, binsize
cpdef list hmm_model_init( str model_file ):
with open( model_file ) as f:
m = json.load( f )
hmm_model = hmm.GaussianHMM( n_components=3, covariance_type=m["covariance_type"] )
hmm_model.startprob_ = np.array(m["startprob"])
hmm_model.transmat_ = np.array(m["transmat"])
hmm_model.means_ = np.array(m["means"])
hmm_model.covars_ = np.array(m["covars"])
hmm_model.covariance_type = m["covariance_type"]
hmm_model.n_features = m["n_features"]
return [ hmm_model, m["i_open_region"], m["i_background_region"], m["i_nucleosomal_region"], m["hmm_binsize"] ]
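
An illustrative round trip of the new JSON-based model persistence, assuming a compiled MACS3 install where these cpdef functions are importable; the toy data, file name, and state indices passed to hmm_model_save are placeholders:

import numpy as np
from MACS3.Signal.HMMR_HMM import hmm_training, hmm_model_save, hmm_model_init

# Toy training matrix: rows are bins, columns are the four digested signal tracks.
training_data = np.random.rand(200, 4).tolist()
training_lengths = [100, 100]            # two training regions of 100 bins each

model = hmm_training(training_data, training_lengths, random_seed=12345, covar="full")
# Argument order follows the signature above: file, model, binsize, open, nuc, bg indices.
hmm_model_save("toy_model.json", model, 10, 0, 1, 2)

# Reload later and skip Baum-Welch entirely, as the --model option does.
model2, i_open, i_bg, i_nuc, binsize = hmm_model_init("toy_model.json")
assert np.allclose(model.transmat_, model2.transmat_)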
14 changes: 5 additions & 9 deletions MACS3/Signal/HMMR_Signal_Processing.pyx
@@ -1,6 +1,6 @@
# cython: language_level=3
# cython: profile=True
# Time-stamp: <2022-09-29 14:58:12 Tao Liu>
# Time-stamp: <2022-10-04 16:14:23 Tao Liu>

"""Module description:
@@ -155,23 +155,19 @@ cpdef list extract_signals_from_regions( list signals, object regions, int binsi
assert len( extracted_data[0] ) == len( extracted_data[1] )
assert len( extracted_data[0] ) == len( extracted_data[2] )
assert len( extracted_data[0] ) == len( extracted_data[3] )
#nnn =len( extracted_len[0] )
#debug( f"{n} bins, {nn}, {nnn}" )
counter = 0
prev_c = extracted_len[0][0]
c = 0
for i in range( nn ):
ret_training_bins.append( extracted_positions[0][i] )
ret_training_data.append(
[ max( 0.0001, abs(round(extracted_data[0][i], 4))),
max( 0.0001, abs(round(extracted_data[1][i], 4))),
max( 0.0001, abs(round(extracted_data[2][i], 4))),
max( 0.0001, abs(round(extracted_data[3][i], 4))) ] )
[ max( 0.0001, extracted_data[0][i] ),
max( 0.0001, extracted_data[1][i] ),
max( 0.0001, extracted_data[2][i] ),
max( 0.0001, extracted_data[3][i] ) ] )
c = extracted_len[0][i]
#print(f"{extracted_positions[0][i]} {extracted_len[0][i]}")
if counter != 0 and c != prev_c:
ret_training_lengths.append( counter )
#print(f"### add a bin length {counter}")
counter = 0
prev_c = c
counter += 1
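
The change above only floors each extracted value at 0.0001 (instead of taking the absolute value and rounding), so every training value stays strictly positive for the Gaussian emissions; a minimal numpy illustration with made-up values:

import numpy as np

# Hypothetical per-bin values for the four signal tracks; zeros and negatives
# are floored to 0.0001, mirroring the max(0.0001, x) calls above.
raw = np.array([0.0, -0.3, 1.7, 0.02])
clamped = np.maximum(raw, 0.0001)
print(clamped)    # values: 0.0001, 0.0001, 1.7, 0.02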
12 changes: 4 additions & 8 deletions bin/macs3
@@ -1,5 +1,5 @@
#!/usr/bin/env python
# Time-stamp: <2022-09-17 11:06:32 Tao Liu>
# Time-stamp: <2022-10-04 13:48:15 Tao Liu>

"""Description: MACS v3 main executable.
@@ -805,14 +805,14 @@ def add_hmmratac_parser( subparsers ):
# help = "Size of the bins to split the genome into for Viterbi decoding. To save memory, the genome is split into WINDOW long bins and viterbi decoding occurs across each bin. Default = 25000000. Note: For machines with limited memory, it is recommended to reduce the size of the bins.",
# default = 25000000 )
group_hmm.add_argument( "--model", dest = "hmm_file", type = str, required = False,
help = "Binary model file (generated from previous HMMR run) to use instead of creating new one. When provided, HMM training will be skipped. Default: NA" )
help = "A JSON file generated from previous HMMRATAC run to use instead of creating new one. When provided, HMM training will be skipped. Default: NA" )
#group_hmm.add_argument( "--modelonly", dest = "hmm_modelonly", action = "store_true", default = False,
# help = "Stop the program after generating model. Use this option to generate HMM model ONLY, which can be later applied with `--model`. Default: False")

# group for peak calling arguments
group_call = argparser_hmmratac.add_argument_group( "Peak calling arguments" )
group_call.add_argument( "-c", "--prescan-cutoff", dest = "prescan_cutoff", type = float,
help = "The fold change cutoff for prescanning candidate regions in the whole dataset. The we will use HMM to predict states on these candidate regions. Higher the prescan cutoff, fewer regions will be considered. Must > 1. Default: 1.2",
help = "The fold change cutoff for prescanning candidate regions in the whole dataset. Then we will use HMM to predict states on these candidate regions. Higher the prescan cutoff, fewer regions will be considered. Must > 1. Default: 1.2",
default = 1.2 )

group_call.add_argument( "--minlen", dest = "openregion_minlen", type = int,
@@ -861,9 +861,5 @@ if __name__ == '__main__':
main()
except KeyboardInterrupt:
sys.stderr.write("User interrupted me! ;-) Bye!\n")
sys.exit(0)
except MemoryError:
sys.exit( "MemoryError occurred. If your input file has a large number of contigs/chromosomes, decrease the buffer_size value by setting --buffer-size option." )
sys.exit(1)
except ValueError:
sys.exit(1)
sys.stderr.write( "MemoryError occurred. If your input file has a large number of contigs/chromosomes, decrease the buffer_size value by setting --buffer-size option." )