Skip to content

Commit

Permalink
Merge pull request #635 from macs3-project/feat/macs3/hmm_poisson
Browse files Browse the repository at this point in the history
Feat/macs3/hmm poisson
  • Loading branch information
taoliu authored Mar 28, 2024
2 parents 27e3430 + 82917c4 commit 880ab61
Show file tree
Hide file tree
Showing 12 changed files with 3,619 additions and 53 deletions.
34 changes: 23 additions & 11 deletions MACS3/Commands/hmmratac_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,15 +274,16 @@ def run( args ):
# 4. Train HMM
#############################################
# if model file is provided, we skip this step
# include options.hmm_type and make it backwards compatible, if no hmm_type default is gaussian
if options.hmm_file:
options.info( f"#4 Load Hidden Markov Model from given model file")
hmm_model, i_open_region, i_background_region, i_nucleosomal_region, options.hmm_binsize = hmm_model_init( options.hmm_file )
hmm_model, i_open_region, i_background_region, i_nucleosomal_region, options.hmm_binsize, options.hmm_type = hmm_model_init( options.hmm_file )
else:
options.info( f"#4 Train Hidden Markov Model with Multivariate Gaussian Emission" )

# extract signals within peak using the given binsize
options.info( f"# Extract signals in training regions with bin size of {options.hmm_binsize}")
[ training_bins, training_data, training_data_lengths ] = extract_signals_from_regions( digested_atac_signals, training_regions, binsize = options.hmm_binsize )
[ training_bins, training_data, training_data_lengths ] = extract_signals_from_regions( digested_atac_signals, training_regions, binsize = options.hmm_binsize, hmm_type = options.hmm_type )

if options.save_train:
f = open( training_datafile, "w" )
Expand All @@ -299,12 +300,15 @@ def run( args ):

options.info( f"# Use Baum-Welch algorithm to train the HMM")

hmm_model = hmm_training( training_data, training_data_lengths, random_seed = options.hmm_randomSeed, covar="full" )
hmm_model = hmm_training( training_data, training_data_lengths, random_seed = options.hmm_randomSeed, hmm_type = options.hmm_type )

options.info( f"# HMM converged: {hmm_model.monitor_.converged}")

# label hidden states
means_sum = np.sum( hmm_model.means_, axis=1 )
if options.hmm_type == "gaussian":
means_sum = np.sum( hmm_model.means_, axis=1 )
if options.hmm_type == "poisson":
means_sum = np.sum( hmm_model.lambdas_, axis=1 )

# first, the state with the highest overall emission is the open state
i_open_region = np.where( means_sum == max(means_sum) )[0][0]
Expand All @@ -317,7 +321,7 @@ def run( args ):

# write hmm into model file
options.info( f"# Write HMM parameters into JSON: {hmm_modelfile}")
hmm_model_save( hmm_modelfile, hmm_model, options.hmm_binsize, i_open_region, i_nucleosomal_region, i_background_region )
hmm_model_save( hmm_modelfile, hmm_model, options.hmm_binsize, i_open_region, i_nucleosomal_region, i_background_region, options.hmm_type )

# if --modelonly option provided, exit script after hmm model is saved
if options.hmm_modelonly:
Expand All @@ -342,11 +346,19 @@ def run( args ):
options.info( "# {0:>10s}-> {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g}".format(assignments[0], hmm_model.transmat_[0]) )
options.info( "# {0:>10s}-> {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g}".format(assignments[1], hmm_model.transmat_[1]) )
options.info( "# {0:>10s}-> {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g}".format(assignments[2], hmm_model.transmat_[2]) )
options.info( f"# HMM Emissions (mean): ")
options.info( "# {0[0]:>10s} {0[1]:>10s} {0[2]:>10s} {0[3]:>10s}".format( ["short", "mono", "di", "tri"] ) )
options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[0], hmm_model.means_[0]) )
options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[1], hmm_model.means_[1]) )
options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[2], hmm_model.means_[2]) )

if options.hmm_type == 'gaussian':
options.info( f"# HMM Emissions (means): ")
options.info( "# {0[0]:>10s} {0[1]:>10s} {0[2]:>10s} {0[3]:>10s}".format( ["short", "mono", "di", "tri"] ) )
options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[0], hmm_model.means_[0]) )
options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[1], hmm_model.means_[1]) )
options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[2], hmm_model.means_[2]) )
if options.hmm_type == 'poisson':
options.info( f"# HMM Emissions (lambdas): ")
options.info( "# {0[0]:>10s} {0[1]:>10s} {0[2]:>10s} {0[3]:>10s}".format( ["short", "mono", "di", "tri"] ) )
options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[0], hmm_model.lambdas_[0]) )
options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[1], hmm_model.lambdas_[1]) )
options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[2], hmm_model.lambdas_[2]) )


#############################################
Expand Down Expand Up @@ -386,7 +398,7 @@ def run( args ):
n += 1
cr = candidate_regions.pop( options.decoding_steps )
options.info( "# decoding %d..." % ( n*options.decoding_steps ) )
[ cr_bins, cr_data, cr_data_lengths ] = extract_signals_from_regions( digested_atac_signals, cr, binsize = options.hmm_binsize )
[ cr_bins, cr_data, cr_data_lengths ] = extract_signals_from_regions( digested_atac_signals, cr, binsize = options.hmm_binsize, hmm_type = options.hmm_type )
#options.info( "# extracted signals in the candidate regions")
candidate_bins.extend( cr_bins )
#options.info( "# saving information regarding the candidate regions")
Expand Down
73 changes: 47 additions & 26 deletions MACS3/Signal/HMMR_HMM.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ import numpy as np
cimport numpy as np
from cpython cimport bool
import hmmlearn
from hmmlearn.hmm import GaussianHMM
from hmmlearn.hmm import GaussianHMM, PoissonHMM
from sklearn import cluster
import json
# from hmmlearn cimport hmm
Expand Down Expand Up @@ -57,13 +57,16 @@ cdef inline float get_weighted_density( int x, float m, float v, w ):
# ------------------------------------


cpdef hmm_training( list training_data, list training_data_lengths, int n_states = 3, int random_seed = 12345, covar = 'full' ):
cpdef hmm_training( list training_data, list training_data_lengths, int n_states = 3, int random_seed = 12345, covar = 'full', hmm_type = 'gaussian' ):
# training data should be in array like format: X = np.array([[.5, .3, .1, .1], [.6, .4, 0, 0]])
# if we do not want init_prob to be updated through learning, set params = 'tmc' and init_prob = initial_state otherwise it will be overwritten
# according to base documentation, if init_prob not stated, it is set to be equally likely for any state (1/ # of components)
# if we have other known parameters, we should set these (ie: means_weights, covariance_type etc.)
rs = np.random.RandomState(np.random.MT19937(np.random.SeedSequence(random_seed)))
hmm_model = GaussianHMM( n_components= n_states, covariance_type = covar, random_state = rs, verbose = False )
if hmm_type == "gaussian":
hmm_model = GaussianHMM( n_components= n_states, covariance_type = covar, random_state = rs, verbose = False )
if hmm_type == "poisson":
hmm_model = PoissonHMM( n_components= n_states, random_state = rs, verbose = False )
hmm_model = hmm_model.fit( training_data, training_data_lengths )
assert hmm_model.n_features == 4
return hmm_model
Expand All @@ -72,33 +75,51 @@ cpdef hmm_predict( list signals, list lens, hmm_model ):
predictions = hmm_model.predict_proba( signals, lens )
return predictions

cpdef void hmm_model_save( str model_file, object hmm_model, int hmm_binsize, int i_open_region, int i_nucleosomal_region, int i_background_region ):
if hmm_model.covariance_type == "diag":
covars = hmm_model.covars_.diagonal(axis1=1, axis2=2)
elif hmm_model.covariance_type == "full":
covars = hmm_model.covars_
else:
raise Exception(f"Unknown covariance type {hmm_model.covariance_type}")
with open( model_file, "w" ) as f:
json.dump( {"startprob":hmm_model.startprob_.tolist(),
"transmat":hmm_model.transmat_.tolist(),
"means":hmm_model.means_.tolist(),
"covars":covars.tolist(),
"covariance_type":hmm_model.covariance_type,
"n_features":int(hmm_model.n_features),
"i_open_region":int(i_open_region),
"i_background_region":int(i_background_region),
"i_nucleosomal_region":int(i_nucleosomal_region),
"hmm_binsize":int(hmm_binsize)}, f )
cpdef void hmm_model_save( str model_file, object hmm_model, int hmm_binsize, int i_open_region, int i_nucleosomal_region, int i_background_region, hmm_type ):
if hmm_type == "gaussian":
if hmm_model.covariance_type == "diag":
covars = hmm_model.covars_.diagonal(axis1=1, axis2=2)
elif hmm_model.covariance_type == "full":
covars = hmm_model.covars_
else:
raise Exception(f"Unknown covariance type {hmm_model.covariance_type}")
with open( model_file, "w" ) as f:
json.dump( {"startprob":hmm_model.startprob_.tolist(),
"transmat":hmm_model.transmat_.tolist(),
"means":hmm_model.means_.tolist(),
"covars":covars.tolist(),
"covariance_type":hmm_model.covariance_type,
"n_features":int(hmm_model.n_features),
"i_open_region":int(i_open_region),
"i_background_region":int(i_background_region),
"i_nucleosomal_region":int(i_nucleosomal_region),
"hmm_binsize":int(hmm_binsize),
"hmm_type":hmm_type}, f )
if hmm_type == "poisson":
with open( model_file, "w" ) as f:
json.dump( {"startprob":hmm_model.startprob_.tolist(),
"transmat":hmm_model.transmat_.tolist(),
"lambdas":hmm_model.lambdas_.tolist(),
"n_features":int(hmm_model.n_features),
"i_open_region":int(i_open_region),
"i_background_region":int(i_background_region),
"i_nucleosomal_region":int(i_nucleosomal_region),
"hmm_binsize":int(hmm_binsize),
"hmm_type":hmm_type}, f )

cpdef list hmm_model_init( str model_file ):
with open( model_file ) as f:
m = json.load( f )
hmm_model = GaussianHMM( n_components=3, covariance_type=m["covariance_type"] )
hmm_type = m.get("hmm_type", "gaussian") #if no model_type written, assume older file and use gaussian
if hmm_type == "gaussian":
hmm_model = GaussianHMM( n_components=3, covariance_type=m["covariance_type"] )
hmm_model.means_ = np.array(m["means"])
hmm_model.covars_ = np.array(m["covars"])
hmm_model.covariance_type = m["covariance_type"]
if hmm_type == "poisson":
hmm_model = PoissonHMM( n_components=3 )
hmm_model.lambdas_ = np.array(m["lambdas"])
hmm_model.startprob_ = np.array(m["startprob"])
hmm_model.transmat_ = np.array(m["transmat"])
hmm_model.means_ = np.array(m["means"])
hmm_model.covars_ = np.array(m["covars"])
hmm_model.covariance_type = m["covariance_type"]
hmm_model.n_features = m["n_features"]
return [ hmm_model, m["i_open_region"], m["i_background_region"], m["i_nucleosomal_region"], m["hmm_binsize"] ]
return [ hmm_model, m["i_open_region"], m["i_background_region"], m["i_nucleosomal_region"], m["hmm_binsize"], hmm_type ]
44 changes: 30 additions & 14 deletions MACS3/Signal/HMMR_Signal_Processing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ cpdef list generate_digested_signals( object petrack, list weight_mapping ):
ret_bedgraphs.append( bdg )
return ret_bedgraphs

cpdef list extract_signals_from_regions( list signals, object regions, int binsize = 10 ):
cpdef list extract_signals_from_regions( list signals, object regions, int binsize = 10, hmm_type = 'gaussian' ):
# we will take regions in peaks, create a bedGraphTrackI with
# binned regions in peaks, then let them overlap with signals to
# create a list (4) of value arrays.
Expand Down Expand Up @@ -168,19 +168,35 @@ cpdef list extract_signals_from_regions( list signals, object regions, int binsi
counter = 0
prev_c = extracted_len[0][0]
c = 0
for i in range( nn ):
ret_training_bins.append( extracted_positions[0][i] )
ret_training_data.append(
[ max( 0.0001, extracted_data[0][i] ),
max( 0.0001, extracted_data[1][i] ),
max( 0.0001, extracted_data[2][i] ),
max( 0.0001, extracted_data[3][i] ) ] )
c = extracted_len[0][i]
if counter != 0 and c != prev_c:
ret_training_lengths.append( counter )
counter = 0
prev_c = c
counter += 1
if hmm_type == "gaussian":
for i in range( nn ):
ret_training_bins.append( extracted_positions[0][i] )
ret_training_data.append(
[max( 0.0001, extracted_data[0][i] ),
max( 0.0001, extracted_data[1][i] ),
max( 0.0001, extracted_data[2][i] ),
max( 0.0001, extracted_data[3][i] ) ] )
c = extracted_len[0][i]
if counter != 0 and c != prev_c:
ret_training_lengths.append( counter )
counter = 0
prev_c = c
counter += 1
#poisson can only take int values as input
if hmm_type == "poisson":
for i in range( nn ):
ret_training_bins.append( extracted_positions[0][i] )
ret_training_data.append(
[ int(max( 0.0001, extracted_data[0][i] )),
int(max( 0.0001, extracted_data[1][i] )),
int(max( 0.0001, extracted_data[2][i] )),
int(max( 0.0001, extracted_data[3][i] )) ] )
c = extracted_len[0][i]
if counter != 0 and c != prev_c:
ret_training_lengths.append( counter )
counter = 0
prev_c = c
counter += 1
# last region
ret_training_lengths.append( counter )
assert sum(ret_training_lengths) == len(ret_training_data)
Expand Down
4 changes: 4 additions & 0 deletions MACS3/Utilities/OptValidator.py
Original file line number Diff line number Diff line change
Expand Up @@ -870,6 +870,10 @@ def opt_validate_hmmratac ( options ):
if options.hmm_modelonly:
options.argtxt += "# Program will stop after generating model, which can be later applied with '--model'. \n"

# hmm_modelType
if options.hmm_type:
options.argtxt += "# Use --hmm-type to select a Gaussian ('gaussian') or Poisson ('poisson') model for the hidden markov model in HMMRATAC. Default: 'gaussian'. \n"


# Peak Calling
if options.prescan_cutoff <= 1:
Expand Down
2 changes: 2 additions & 0 deletions bin/macs3
Original file line number Diff line number Diff line change
Expand Up @@ -973,6 +973,8 @@ plus an extra option for the HMM model file like `macs3 hmmratac
help = "A JSON file generated from previous HMMRATAC run to use instead of creating new one. When provided, HMM training will be skipped. Default: NA" )
group_hmm.add_argument( "--modelonly", dest = "hmm_modelonly", action = "store_true", default = False,
help = "Stop the program after generating model. Use this option to generate HMM model ONLY, which can be later applied with `--model`. Default: False")
group_hmm.add_argument( "--hmm-type", dest = "hmm_type", type = str, choices = ("gaussian", "poisson"), default = "gaussian",
help = "Use --hmm-type to select a Gaussian ('gaussian') or Poisson ('poisson') model for the hidden markov model in HMMRATAC. Default: 'gaussian'.")

# group for peak calling arguments
group_call = argparser_hmmratac.add_argument_group( "Peak calling/HMM decoding arguments" )
Expand Down
15 changes: 13 additions & 2 deletions test/cmdlinetest
Original file line number Diff line number Diff line change
Expand Up @@ -213,18 +213,29 @@ macs3 randsample -i $CHIPCONTIGS50K -n 100000 --seed 31415926 --outdir ${OUTPUTD
echo "15. hmmratac"
mkdir ${OUTPUTDIR_PREFIX}_run_hmmratac

macs3 hmmratac -i $ATACSEQBAM -n hmmratac_yeast500k --save-training-data --outdir ${OUTPUTDIR_PREFIX}_run_hmmratac &> ${OUTPUTDIR_PREFIX}_run_hmmratac/run_hmmratac.log
# check default (no hmm-type specified) this is technically the same as --hmm-type gaussian (can include if preferred)
# macs3 hmmratac -i $ATACSEQBAM -n hmmratac_yeast500k --outdir ${OUTPUTDIR_PREFIX}_run_hmmratac &> ${OUTPUTDIR_PREFIX}_run_hmmratac/run_hmmratac.log

# check both gaussian and poisson hmm models, save training data
macs3 hmmratac -i $ATACSEQBAM --hmm-type gaussian -n hmmratac_yeast500k --save-training-data --outdir ${OUTPUTDIR_PREFIX}_run_hmmratac &> ${OUTPUTDIR_PREFIX}_run_hmmratac/run_hmmratac.log
macs3 hmmratac -i $ATACSEQBAM --hmm-type poisson -n hmmratac_yeast500k_poisson --save-training-data --outdir ${OUTPUTDIR_PREFIX}_run_hmmratac &> ${OUTPUTDIR_PREFIX}_run_hmmratac/run_hmmratac_poisson.log

# check bedpe for both gaussian, poisson
macs3 hmmratac -i $ATACSEQBED -n hmmratac_yeast500k_bedpe -f BEDPE --outdir ${OUTPUTDIR_PREFIX}_run_hmmratac &> ${OUTPUTDIR_PREFIX}_run_hmmratac/run_hmmratac_bedpe.log
macs3 hmmratac -i $ATACSEQBED --hmm-type poisson -n hmmratac_yeast500k_bedpe_poisson -f BEDPE --outdir ${OUTPUTDIR_PREFIX}_run_hmmratac &> ${OUTPUTDIR_PREFIX}_run_hmmratac/run_hmmratac_bedpe_poisson.log

TRAINING_REGIONS_BED=${OUTPUTDIR_PREFIX}_run_hmmratac/hmmratac_yeast500k_training_regions.bed
HMM_MODEL=${OUTPUTDIR_PREFIX}_run_hmmratac/hmmratac_yeast500k_model.json
TRAINING_REGIONS_BED_POISSON=${OUTPUTDIR_PREFIX}_run_hmmratac/hmmratac_yeast500k_poisson_training_regions.bed
HMM_MODEL_POISSON=${OUTPUTDIR_PREFIX}_run_hmmratac/hmmratac_yeast500k_poisson_model.json

echo "15.1 hmmratac load training regions from bedfile"
macs3 hmmratac -i $ATACSEQBAM -n hmmratac_yeast500k_load_training_regions -t ${TRAINING_REGIONS_BED} --outdir ${OUTPUTDIR_PREFIX}_run_hmmratac &> ${OUTPUTDIR_PREFIX}_run_hmmratac/run_hmmratac_load_training_regions.log
macs3 hmmratac -i $ATACSEQBAM --hmm-type poisson -n hmmratac_yeast500k_poisson_load_training_regions -t ${TRAINING_REGIONS_BED_POISSON} --outdir ${OUTPUTDIR_PREFIX}_run_hmmratac &> ${OUTPUTDIR_PREFIX}_run_hmmratac/run_hmmratac_load_training_regions_poisson.log

echo "15.2 hmmratac load hmm model file"
# echo "15.2 hmmratac load hmm model file"
macs3 hmmratac -i $ATACSEQBAM -n hmmratac_yeast500k_load_hmm_model --model ${HMM_MODEL} --outdir ${OUTPUTDIR_PREFIX}_run_hmmratac &> ${OUTPUTDIR_PREFIX}_run_hmmratac/run_hmmratac_load_hmm_model.log
macs3 hmmratac -i $ATACSEQBAM --hmm-type poisson -n hmmratac_yeast500k_poisson_load_hmm_model --model ${HMM_MODEL_POISSON} --outdir ${OUTPUTDIR_PREFIX}_run_hmmratac &> ${OUTPUTDIR_PREFIX}_run_hmmratac/run_hmmratac_load_hmm_model_poisson.log

echo "16. search for errors or warnings in log files"
flag=0
Expand Down
Loading

0 comments on commit 880ab61

Please sign in to comment.