import numpy as np
from numpy import random
import os
import pandas as pd
from tqdm import tqdm
#from src.config import ROOT_DIR
from sklearn.metrics import roc_curve, average_precision_score, confusion_matrix, f1_score
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
import glob
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist
from pandarallel import pandarallel
from mplh.color_utils import get_colors
from mplh.fig_utils import legend_from_color
from mplh import cluster_help as ch
from src.simulations.utils.config import read_config_file, write_config_file
from dynamicTreeCut import cutreeHybrid
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage
from sklearn.model_selection import ParameterGrid
from src.simulations.utils.config import check_required
from src.simulations.analysis import Analysis as an
from .simulation import Simulation
from src.utils.utils import compare_arbitrary_labels


# Does this ruin running the MCMC? I don't think so, b/c that format is going
# to be put in after anyway
class FullSimulation:
""" Class that simulates cell growth for lineage tracing. Reads in a parameter file and runs a certain number of iterations based on the num_iterations parameter. :ivar n_iter: Number of iterations :type n_iter: int :ivar num_cells: Number of cells to sequence :type num_cells: int :ivar sim: Each index is a different iteration of the simulation. :type sim: pandas Series """def__init__(self, params_f):
        params = read_config_file(params_f)
        self.n_iter = params['num_iterations']
        self.num_cells = params['num_cells']
        self.params = params
        if 'n_clust' not in params:
            self.params['n_clust'] = None

        # Store the metrics with this
        self.metrics = dict()

        # Files to save
        #self.outdir = os.path.join(self.params['local_outdir'])
        self.outdir = self.params['local_outdir']
        self.data_outdir = os.path.join(self.params['data_outdir'])
        self.f_save_data = os.path.join(self.data_outdir,
                                        self.params['name'] + '.p')
        self.f_save = os.path.join(self.outdir, self.params['name'] + '.p')
        self.f_save_metrics = self.f_save_data.replace('.p', '.metrics.tsv')
        self.f_save_cluster = self.f_save_data.replace('.p', '.cluster.tsv')
        self.f_save_befaft = self.f_save_data.replace('.p', '.before_after.tsv')
        self.f_save_befaft_cl = self.f_save_data.replace('.p',
                                                         '.before_after_cl.tsv')
        self.f_save_rocs = self.f_save_data.replace('.p', '.rocs.p')
        return

    #for i in self.n_iter:
    def run(self):
        """
        Runs the simulation and stores it in the sim attribute. Will also
        pickle the objects and save.

        This can use pandarallel to parallelize the runs.
        :return:
        """
        # Parallelize df
        df = pd.Series(index=range(self.n_iter))
        df = df.apply(self.run_sim, args=(self.params,))
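        # The serial pandas apply above runs iterations one at a time; the
        # commented pandarallel lines below are the parallel alternative and
        # would use self.params['cpus'] workers.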
        #pandarallel.initialize(nb_workers=self.params['cpus'])
        #df = df.parallel_apply(self.run_sim, args=(self.params,))
        self.sim = df
        if "save_small" in self.params and self.params["save_small"] > 0:
            curr = self.sim[:self.params["save_small"]]
            for ind, val in curr.items():
                curr_f = self.f_save.replace(".p", "") + "_sim" + str(ind)
                val.to_csv(curr_f + ".csv")
        return

    @staticmethod
    def run_sim(x, params):
        """Run one iteration of the simulation.

        For a single iteration, it will initialize, grow, subsample,
        and merge the before-stimulus and after-stimulus variables.

        :param x: Placeholder variable
        :param params: The parameter dictionary to use
        :type params: dict
        :return:
        """
        s = Simulation(params)
        s.initialize()
        s.grow()
        s.subsample_new(to_delete=True)
        s.combine_init_growth()
        return s

    def run_metrics(self):
""" Get metrics performances and save. :return: """self.sim_performance_dominant(group='both')
self.stats_before_after()
self.cluster_befaft()
self.stats_cluster_befaft_dom()
self.estimate_growth_rates_from_cluster()
self.stats_before_after_clust()
self.kl_divergence()
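
    # After run_metrics(), self.metrics holds: 'prec_scores', 'dropout',
    # 'rocs', 'b_a_df', 'Dominant Cluster Confusion', 'Cluster F1 scores',
    # 'pred_growth_rates', 'pred_aft_count', 'pred_bef_count',
    # 'pred_clone_sizes', and 'b_a_clust_df'.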

    def flatten_sim(self):
        ## TODO
        # This will extract out the classes of df
        return

    def sim_performance_dominant(self, group='both'):
""" Will colect metrics that are averaged over the simulations. These are specifically for looking at the main, dominant clone, and what the allele-frequency of that clone variant is for each cell. :param group: {'init', 'growth', 'both'} This will indicate to group by :ivar dropout: Number of dominant clone cells that have 0 reads at the lineage variant position. :type dropout: list :ivar prec_scores: sklearn average precision score based on the allele frequencies seen in the dominant clone cells versus the non-clone cells. :type prec_scores: list :ivar rocs: ROC curves for each iteration based on allele frequencies. :return: """dropout= []
        rocs = []
        prec_scores = []
        for iter, s in enumerate(self.sim.values):
            # First get the dominant clone, which is indexed as 1
            mt_pos = s.clone_mt_dict[1]
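            # Hedged sketch for the TODO below (not the original behavior): if
            # clone_mt_dict ever stores a list of positions, one option is to
            # collapse it to a single column first, e.g.
            #   mt_pos = mt_pos[0] if isinstance(mt_pos, (list, tuple)) else mt_pos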
            # TODO account for mt_pos being a list not an int
            if group == 'init':
                clones = s.clone_cell
                cell_af = s.cell_af.loc[:, mt_pos]
            elif group == 'growth':
                clones = s.new_clone_cell
                cell_af = s.new_cell_af.loc[:, mt_pos]
            elif group == 'both':
                #clones = pd.concat((s.clone_cell, s.subsample_new_clone_cell)).reset_index(drop=True)
                #cell_af = pd.concat((s.cell_af.loc[:,mt_pos], s.subsample_new_cell_af.loc[:,mt_pos])).reset_index(drop=True)
                clones = s.combined_clones
                cell_af = s.combined_cell_af.loc[:, mt_pos]
            else:
                raise ValueError('group variable not properly set.')
            y_true = clones.values.copy()
            y_true[y_true != 1] = 0  # Set nondominant clones to 0
            rocs.append(roc_curve(y_true, cell_af))
            prec_scores.append(average_precision_score(y_true, cell_af))
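            # Dropout: fraction of dominant-clone cells with allele frequency 0
            # at the clone's lineage variant.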
            dropout.append((cell_af[clones == 1] == 0).sum() / cell_af[clones == 1].shape[0])
        self.dropout = dropout
        self.prec_scores = prec_scores
        self.rocs = rocs
        pd.DataFrame([prec_scores, dropout],
                     index=['Precision', 'Dropout']).to_csv(self.f_save_metrics, sep='\t')
        self.metrics['prec_scores'] = prec_scores
        self.metrics['dropout'] = dropout
        self.metrics['rocs'] = rocs
        pickle.dump(rocs, open(self.f_save_rocs, 'wb'))
        return

    def reduce_cells(self, cell_af):
        #self.sim
        return

    def stats_before_after(self, clone_id=1):
        b_a_df = pd.DataFrame(index=np.arange(0, len(self.sim)),
                              columns=["B", "A", "A/B"], dtype=str)
        for iter, s in enumerate(self.sim.values):
            b_clones = s.clone_cell
            a_clones = s.subsample_new_clone_cell
            b_a_df.at[iter, "B"] = (b_clones == clone_id).sum()
            b_a_df.at[iter, "A"] = (a_clones == clone_id).sum()
            # +1 pseudocounts keep the ratio defined when a clone has 0 cells
            b_a_df.at[iter, "A/B"] = (b_a_df.at[iter, "A"] + 1) / (b_a_df.at[iter, "B"] + 1)
        self.b_a_df = b_a_df
        b_a_df.to_csv(self.f_save_befaft, sep='\t')
        self.metrics['b_a_df'] = b_a_df
        return

    def cluster_befaft(self):
""" Loops through the simulations and for each, it clusters the cells. :ivar cluster_results: Cluster labels for each cell in each iteration. :type List of tuples, which is a list of a tuple, where the tuple is indexed by the cell and the value is the cell's cluster label """cluster_results= []
forsintqdm(self.sim.values):
cluster_results.append(an.cluster_kmeans(s.combined_cell_af,
n_clust=self.params['n_clust']))
# Add the cluster results to combined metas.combined_meta["cluster"] =cluster_results[-1]
# Bring the cluster labels and the clone labels into same# name spaces.combined_meta['cluster_clone'] =compare_arbitrary_labels(s.combined_meta['clone'],
s.combined_meta['cluster'])
self.cluster_results=cluster_resultsdefstats_cluster_befaft_dom(self, clone_id=1):
""" Confusion matrix for clustering the proper clone cells together. :param clone_id: Which clone to get metrics for :return: """b_a_df=pd.DataFrame(index=np.arange(len(self.sim)),
columns=["TN", "FP", "FN", "TP"], dtype=int)
f1_vals= []
forind, sinenumerate(self.sim.values):
y_true=s.combined_clonesy_true[y_true!=clone_id] =0y_pred=s.combined_meta['cluster_clone'].copy()
y_pred[y_pred!=clone_id] =0f1_vals.append(f1_score(y_true, y_pred))
# y_true, y_predtn, fp, fn, tp=confusion_matrix(y_true, y_pred).ravel()
b_a_df.loc[ind] = [tn, fp, fn, tp]
self.b_a_df_clust=b_a_dfself.metrics['Dominant Cluster Confusion'] =b_a_dfself.metrics['Cluster F1 scores'] =f1_valsreturndefestimate_growth_rates_from_known(self):
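        # Estimate per-clone growth using the clone labels already in
        # combined_meta (no clone_col override, unlike the cluster-based
        # version below).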
        all_growth_estimate = []
        all_clone_sizes = []
        for iter, s in enumerate(self.sim.values):
            growth_estimate, clone_sizes, _, _ = an.estimate_growth_rate(s.combined_meta)
            all_growth_estimate.append(growth_estimate)
            all_clone_sizes.append(clone_sizes)
        self.metrics['obs_growth_rates'] = all_growth_estimate
        self.metrics['obs_clone_sizes'] = all_clone_sizes
        return

    def estimate_growth_rates_from_cluster(self):
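        # Same estimate, but using the cluster-derived labels ('cluster_clone'
        # from cluster_befaft), and also recording the before/after cell counts
        # per clone.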
        all_growth_estimate = []
        all_clone_sizes = []
        all_bef_est = []
        all_aft_est = []
        for iter, s in enumerate(self.sim.values):
            growth_estimate, clone_sizes, aft_est, bef_est = an.estimate_growth_rate(
                s.combined_meta, clone_col="cluster_clone")
            all_growth_estimate.append(growth_estimate)
            all_bef_est.append(bef_est)
            all_aft_est.append(aft_est)
            all_clone_sizes.append(clone_sizes)
        self.metrics['pred_growth_rates'] = all_growth_estimate
        self.metrics['pred_aft_count'] = all_aft_est
        self.metrics['pred_bef_count'] = all_bef_est
        self.metrics['pred_clone_sizes'] = all_clone_sizes
        return

    def stats_before_after_clust(self, clone_id=1):
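        # Pull the cluster-based before ("B") and after ("A") counts and the
        # growth ratio ("A/B") for one clone out of the metrics computed by
        # estimate_growth_rates_from_cluster, and save them.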
        b_a_df = pd.DataFrame(index=np.arange(0, len(self.sim)),
                              columns=["A/B", "A", "B"], dtype=str)
        for iter, s in enumerate(self.sim.values):
            curr_pred_growth = self.metrics['pred_growth_rates'][iter]
            curr_pred_aft = self.metrics['pred_aft_count'][iter]
            #print('curr_pred_aft', curr_pred_aft)
            curr_pred_bef = self.metrics['pred_bef_count'][iter]
            b_a_df.at[iter, "A/B"] = curr_pred_growth.loc[clone_id]
            b_a_df.at[iter, "A"] = curr_pred_aft.loc[clone_id]
            b_a_df.at[iter, "B"] = curr_pred_bef.loc[clone_id]
        b_a_df.to_csv(self.f_save_befaft_cl, sep='\t')
        self.metrics['b_a_clust_df'] = b_a_df
        return

    def kl_divergence(self):
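        # NOTE: despite the name, no KL divergence is computed yet; this
        # currently re-estimates growth rates from the clone labels in
        # combined_meta and overwrites 'pred_growth_rates' and
        # 'pred_clone_sizes'.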
        all_growth_estimate = []
        all_clone_sizes = []
        for iter, s in enumerate(self.sim.values):
            growth_estimate, clone_sizes, _, _ = an.estimate_growth_rate(s.combined_meta)
            all_growth_estimate.append(growth_estimate)
            all_clone_sizes.append(clone_sizes)
        self.metrics['pred_growth_rates'] = all_growth_estimate
        self.metrics['pred_clone_sizes'] = all_clone_sizes

    def save(self, f_save=None):
        if f_save is None:
            f_save = self.f_save_data
        f = open(f_save, 'wb')
        pickle.dump(self.__dict__, f, 2)
        f.close()

    def load(self, f_save=None):
        #filename = self.params['filename']
        if f_save is None:
            f_save = self.f_save
        f = open(f_save, 'rb')
        tmp_dict = pickle.load(f)
        f.close()
        self.__dict__.update(tmp_dict)
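

# Minimal usage sketch (not part of the original module). It assumes a
# parameter file readable by read_config_file with the keys used in __init__
# (num_iterations, num_cells, local_outdir, data_outdir, name, ...); the path
# below is a hypothetical placeholder.
def example_run(params_f="parameters/example_simulation.yaml"):
    full_sim = FullSimulation(params_f)
    full_sim.run()          # simulate and store the iterations in full_sim.sim
    full_sim.run_metrics()  # fill full_sim.metrics and write the TSV/pickle outputs
    full_sim.save()         # pickle the object's __dict__ to f_save_data
    return full_sim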


def main():
    return


if __name__ == "__main__":
    main()
TODO: account for mt_pos being a list not an int
(Mito_Trace/src/simulations/fullsimulation.py, line 164 at commit ed20573)