Commit

Fix Issue #23 by refactoring code to save fullsimulation and only storing metrics in parametersweep
Isaac Shamie committed Oct 27, 2020
1 parent 9529088 commit 657ba0d
Showing 13 changed files with 386 additions and 110 deletions.
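At a glance, the refactor moves persistence of the full simulation into FullSimulation itself and leaves ParameterSweep holding only a small metrics dict per run. A minimal sketch of the intended pattern, as a hypothetical driver (the class and method names follow the diff below; the driver itself is not part of the commit):

    # hypothetical driver illustrating the refactored workflow
    sim = FullSimulation(params_f)  # params_f: one YAML file from the sweep
    sim.run()                       # run all iterations
    sim.run_metrics()               # precision, dropout, ROCs, before/after stats
    sim.save()                      # pickle the full object under data_outdir
    metrics = sim.metrics           # the sweep keeps only this small dict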
58 changes: 46 additions & 12 deletions src/simulations/fullsimulation.py
@@ -50,36 +50,53 @@ def __init__(self, params_f):
self.n_iter = params['num_iterations']
self.num_cells = params['num_cells']
self.params = params
self.f_save = os.path.join(self.params['local_outdir'], self.params['prefix']+'.p')

# Store the metrics for this simulation
self.metrics = dict()

# Files to save
self.outdir = os.path.join(self.params['local_outdir'])
self.data_outdir = os.path.join(self.params['data_outdir'])
self.f_save_data = os.path.join(self.data_outdir,
self.params['name'] + '.p')
self.f_save = os.path.join(self.outdir, self.params['name'] + '.p')
self.f_save_metrics = self.f_save.replace('.p', '.metrics.tsv')
self.f_save_cluster = self.f_save.replace('.p', '.cluster.tsv')
self.f_save_befaft = self.f_save.replace('.p', '.before_after.tsv')
self.f_save_rocs = self.f_save.replace('.p', '.rocs.p')


return
#for i in self.n_iter:

def run(self):
"""
Runs the simulation and stores it in sim attr.
Runs the simulation and stores it in sim attr. It will also pickle
the objects and save them.
This uses Pandarallel to parallelize the runs.
:return:
"""
# Parallelize df
df = pd.Series(index=range(self.n_iter))
#df = df.apply(self.run_sim, args=(self.params,))
pandarallel.initialize(nb_workers=self.params['cpus'])
df = df.parallel_apply(self.run_sim, args=(self.params,))
df = df.apply(self.run_sim, args=(self.params,))

#pandarallel.initialize(nb_workers=self.params['cpus'])
#df = df.parallel_apply(self.run_sim, args=(self.params,))

self.sim = df
#self.cluster_before_after()
self.sim_performance_dominant(group='both')
self.stats_before_after()
return

@staticmethod
def run_sim(x, params):
"""
For a simulation, it will initialize, grow, subsample,
"""Run iteration of simulation.
For a single iteration, it will initialize, grow, subsample,
and merge the before stimulus and after stimulus variables.
:param x: Placeholder variable
:param params: The parameter dictionary to use
:type params: dict
:return:
"""
s = Simulation(params)
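The commented-out pandarallel lines above follow that library's standard pattern: initialize a worker pool once, then swap apply for parallel_apply. A self-contained sketch, with a stand-in worker function in place of run_sim (names and values here are assumptions):

    import pandas as pd
    from pandarallel import pandarallel

    def run_one(x, params):
        # x is a placeholder value, as in run_sim; the real worker builds a Simulation
        return params['num_cells'] * 2

    pandarallel.initialize(nb_workers=4)
    runs = pd.Series(index=range(8), dtype=float)
    results = runs.parallel_apply(run_one, args=({'num_cells': 100},))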
@@ -89,6 +106,16 @@ def run_sim(x, params):
s.combine_init_growth()
return s

def run_metrics(self):
"""
Compute the performance metrics and save them.
:return:
"""
self.sim_performance_dominant(group='both')
self.stats_before_after()
# self.cluster_before_after()


def flatten_sim(self):
## TODO
# This will extract out the classes of df
@@ -146,6 +173,12 @@ def sim_performance_dominant(self, group='both'):
self.dropout = dropout
self.prec_scores = prec_scores
self.rocs = rocs
pd.DataFrame([prec_scores, dropout], index=['Precision', 'Dropout']).to_csv(self.f_save_metrics, sep='\t')
self.metrics['prec_scores'] = prec_scores
self.metrics['dropout'] = dropout
self.metrics['rocs'] = rocs
pickle.dump(rocs, open(self.f_save_rocs, 'wb'))

return
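Each run's metrics therefore land in two small files: a TSV with the precision and dropout rows, and a pickle with the ROC curves. Reading them back is just (the run file name here is an assumption):

    import pickle
    import pandas as pd

    # file names follow f_save_metrics / f_save_rocs above
    metrics = pd.read_csv('run0.metrics.tsv', sep='\t', index_col=0)
    with open('run0.rocs.p', 'rb') as fh:
        rocs = pickle.load(fh)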


@@ -163,6 +196,8 @@ def stats_before_after(self, clone_id=1):
b_a_df.at[iter, "After"] = (a_clones==clone_id).sum()
b_a_df.at[iter,"A/B"] = (b_a_df.at[iter, "After"]/b_a_df.at[iter, "Before"])
self.b_a_df = b_a_df
b_a_df.to_csv(self.f_save_befaft, sep='\t')
self.metrics['b_a_df'] = b_a_df
return
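stats_before_after writes one row per iteration with the clone's Before and After counts and their ratio; a toy illustration of the computation (the counts are assumptions):

    import pandas as pd

    # toy counts for three iterations (values are assumptions)
    b_a_df = pd.DataFrame({'Before': [10, 8, 12], 'After': [30, 4, 24]})
    b_a_df['A/B'] = b_a_df['After'] / b_a_df['Before']  # expansion of the clone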


@@ -207,10 +242,9 @@ def stats_cluster_before_after(self, clone_id=1):
return



def save(self, f_save=None):
if f_save is None:
f_save = self.f_save
f_save = self.f_save_data
f = open(f_save, 'wb')
pickle.dump(self.__dict__, f, 2)
f.close()
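Because save pickles the instance __dict__ rather than the object itself, restoring a run means loading the attribute dict back; a minimal sketch of the reader side (the helper name is an assumption):

    import pickle

    def load_full_simulation(f_save_data):
        # returns the attribute dict written by FullSimulation.save;
        # keys include 'sim', 'metrics', 'b_a_df', 'params', ...
        with open(f_save_data, 'rb') as fh:
            return pickle.load(fh)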
81 changes: 56 additions & 25 deletions src/simulations/parametersweep.py
@@ -60,29 +60,37 @@ def __init__(self, default_params_f, sweep_params_f):
:type str
"""
self.sweep_params_f = sweep_params_f
self.default_params_f = default_params_f
params = read_config_file(default_params_f)
sweep_params = read_config_file(sweep_params_f)
self.sweep_params_f = sweep_params_f
self.default_params_f = default_params_f
self.sweep_params = sweep_params
self.default_params = params

# Create the yaml files in the directory indicated by local_outdir and prefix in sweep_params_f
self.outdir = os.path.join(sweep_params["outdir"], sweep_params["prefix"])
if not os.path.exists(self.outdir):
os.makedirs(self.outdir)

self.f_save = \
os.path.join(self.outdir,self.sweep_params['prefix'] + '.p')
self.tmp_f_save = self.f_save.replace('.p', '') + '_tmp.p'

self.data_outdir = os.path.join(sweep_params['data_outdir'], sweep_params["prefix"])
if not os.path.exists(self.data_outdir):
os.makedirs(self.data_outdir)

# Create a grid.
# Loop through and set each parameter in the params grid.
self.params_df = pd.DataFrame(list(ParameterGrid(sweep_params['grid'])))
params_dict = dict()

for ind, val in self.params_df.iterrows():
f_name = os.path.join(self.outdir, str(ind)+'.yaml' )
# Create the name
params['name'] = str(ind)
params['data_outdir'] = self.data_outdir
params['local_outdir'] = self.sweep_params["outdir"]
params['prefix'] = self.sweep_params["prefix"]
for name, v in val.iteritems():
# Set the specific variables that need to be updated
if name == 'dominant_clone_sizes':
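The grid expansion above relies on sklearn's ParameterGrid, which enumerates every combination of the values in the YAML grid; a minimal sketch with assumed parameter names and values:

    import pandas as pd
    from sklearn.model_selection import ParameterGrid

    # grid values here are assumptions for illustration
    grid = {'het_err_rate': [0.01, 0.1], 'cov_constant': [10, 50]}
    params_df = pd.DataFrame(list(ParameterGrid(grid)))
    # -> 4 rows, one per combination; columns are the parameter names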
@@ -101,33 +109,54 @@ def __init__(self, default_params_f, sweep_params_f):
params_dict[f_name] = params
return


@staticmethod
def run_single_sweep(f, outdir):
print(f'Running file {f}')
params_f = os.path.join(outdir, str(f) + '.yaml')
sim = FullSimulation(params_f)
sim.run()
sim.run_metrics()
sim.save()
return sim.metrics

def run_sweep(self, subset=None):
"""
Loops through self.params_df and runs the simulation on that
parameter.
""" Loops through params_df and runs the simulation on that parameter.
Each run is saved to disk rather than stored in memory; only the
metrics are kept here.
:param subset: Which files to run the simulation on. If a list,
the files to run on; if an int, the number of files to randomly
choose; if None, run on all.
:type subset: int or None or list (default=None)
"""

params_df = self.params_df
if isinstance(subset, int):
params_df = params_df.sample(n=subset)

sweep_results = dict()
for f, val in params_df.iterrows():
params_f = os.path.join(self.outdir, str(f) +'.yaml')
print(f"Running with file: {f}")
sim = FullSimulation(params_f)
sim.run()
sweep_results[f] = sim

# Save temp file for last track.
self.sweep_results = sweep_results
self.save(f_save=self.tmp_f_save)
sweep_results_df = pd.Series(index=params_df.index, data=params_df.index)
###
#sweep_results_df = sweep_results_df.apply(self.run_single_sweep, args=(self.outdir,))
pandarallel.initialize(nb_workers=self.sweep_params['cpus'])
sweep_results_df = sweep_results_df.parallel_apply(self.run_single_sweep, args=(self.outdir,))
###

self.sweep_results = sweep_results_df

# sweep_results = dict()
#
# for f, val in params_df.iterrows():
# params_f = os.path.join(self.outdir, str(f) +'.yaml')
# print(f"Running with file: {f}")
# sim = FullSimulation(params_f)
# sim.run()
# sim.run_metrics()
# sim.save()
#
# # Only store the current metrics, not the entire simulation.
# sweep_results[f] = sim.metrics
# #self.save(f_save=self.tmp_f_save)
# self.sweep_results = sweep_results
return


Expand All @@ -149,15 +178,17 @@ def plot_sensitivity_and_dropout(self):

# Add these results to self.results, which has the meta information too
for ind, val in self.params_df.iterrows():
full_sim = self.sweep_results[ind]
dropout = full_sim.dropout
prec_scores = full_sim.prec_scores
rocs = full_sim.rocs
curr_results = self.sweep_results[ind]
dropout = curr_results['dropout']
prec_scores = curr_results['prec_scores']
rocs = curr_results['rocs']

metrics.at[ind, 'Avg. Precision'] = np.mean(prec_scores)
metrics.at[ind, '% dropout'] = np.mean(dropout)

self.metrics = metrics
"""df that contains simulation performance metric.
"""
# Seaborn Factorplot
g = sns.FacetGrid(data=metrics, col="het_err_rate", hue="cov_constant")
g.map(sns.scatterplot, "dominant_het", "Avg. Precision")
@@ -193,8 +224,8 @@ def plot_before_after_all_clones(self):
"""
df_full = []
for ind, val in self.params_df.iterrows():
full_sim = self.sweep_results[ind]
b_a_df = full_sim.b_a_df
curr_results = self.sweep_results[ind]
b_a_df = curr_results['b_a_df']
# Each element will be a list of the values.
s = self.params_df.loc[ind].copy()
curr = pd.DataFrame([s.copy()] * len(b_a_df))