Commit

Fix Issue #23 by refactoring code to save fullsimulation and only storing metrics in parametersweep
Isaac Shamie committed Oct 27, 2020
1 parent 9529088 commit 657ba0d
Showing 13 changed files with 386 additions and 110 deletions.
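At a glance, the refactor moves persistence of the full simulation into FullSimulation itself and leaves ParameterSweep holding only a small metrics dict per run. A minimal sketch of the intended pattern, as a hypothetical driver (the class and method names follow the diff below; the driver itself is not part of the commit):

    # hypothetical driver illustrating the refactored workflow
    sim = FullSimulation(params_f)  # params_f: one YAML file from the sweep
    sim.run()                       # run all iterations
    sim.run_metrics()               # precision, dropout, ROCs, before/after stats
    sim.save()                      # pickle the full object under data_outdir
    metrics = sim.metrics           # the sweep keeps only this small dict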
58 changes: 46 additions & 12 deletions src/simulations/fullsimulation.py
@@ -50,36 +50,53 @@ def __init__(self, params_f):
self.n_iter = params['num_iterations']
self.num_cells = params['num_cells']
self.params = params
self.f_save = os.path.join(self.params['local_outdir'], self.params['prefix']+'.p')

# Store the metrics for this simulation
self.metrics = dict()

# Files to save
self.outdir = os.path.join(self.params['local_outdir'])
self.data_outdir = os.path.join(self.params['data_outdir'])
self.f_save_data = os.path.join(self.data_outdir,
self.params['name'] + '.p')
self.f_save = os.path.join(self.outdir, self.params['name'] + '.p')
self.f_save_metrics = self.f_save.replace('.p', '.metrics.tsv')
self.f_save_cluster = self.f_save.replace('.p', '.cluster.tsv')
self.f_save_befaft = self.f_save.replace('.p', '.before_after.tsv')
self.f_save_rocs = self.f_save.replace('.p', '.rocs.p')


return
#for i in self.n_iter:

def run(self):
"""
Runs the simulation and stores it in sim attr.
Runs the simulation and stores it in sim attr. It will also pickle
the objects and save them.
This uses Pandarallel to parallelize the runs.
:return:
"""
# Parallelize df
df = pd.Series(index=range(self.n_iter))
#df = df.apply(self.run_sim, args=(self.params,))
pandarallel.initialize(nb_workers=self.params['cpus'])
df = df.parallel_apply(self.run_sim, args=(self.params,))
df = df.apply(self.run_sim, args=(self.params,))

#pandarallel.initialize(nb_workers=self.params['cpus'])
#df = df.parallel_apply(self.run_sim, args=(self.params,))

self.sim = df
#self.cluster_before_after()
self.sim_performance_dominant(group='both')
self.stats_before_after()
return

@staticmethod
def run_sim(x, params):
"""
For a simulation, it will initialize, grow, subsample,
"""Run iteration of simulation.
For a single iteration, it will initialize, grow, subsample,
and merge the before stimulus and after stimulus variables.
:param x: Placeholder variable
:param params: The parameter dictionary to use
:type params: dict
:return:
"""
s = Simulation(params)
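The commented-out pandarallel lines above follow that library's standard pattern: initialize a worker pool once, then swap apply for parallel_apply. A self-contained sketch, with a stand-in worker function in place of run_sim (names and values here are assumptions):

    import pandas as pd
    from pandarallel import pandarallel

    def run_one(x, params):
        # x is a placeholder value, as in run_sim; the real worker builds a Simulation
        return params['num_cells'] * 2

    pandarallel.initialize(nb_workers=4)
    runs = pd.Series(index=range(8), dtype=float)
    results = runs.parallel_apply(run_one, args=({'num_cells': 100},))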
@@ -89,6 +106,16 @@ def run_sim(x, params):
s.combine_init_growth()
return s

def run_metrics(self):
"""
Compute the performance metrics and save them.
:return:
"""
self.sim_performance_dominant(group='both')
self.stats_before_after()
# self.cluster_before_after()


def flatten_sim(self):
## TODO
# This will extract out the classes of df
@@ -146,6 +173,12 @@ def sim_performance_dominant(self, group='both'):
self.dropout = dropout
self.prec_scores = prec_scores
self.rocs = rocs
pd.DataFrame([prec_scores, dropout], index=['Precision', 'Dropout']).to_csv(self.f_save_metrics, sep='\t')
self.metrics['prec_scores'] = prec_scores
self.metrics['dropout'] = dropout
self.metrics['rocs'] = rocs
pickle.dump(rocs, open(self.f_save_rocs, 'wb'))

return
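Each run's metrics therefore land in two small files: a TSV with the precision and dropout rows, and a pickle with the ROC curves. Reading them back is just (the run file name here is an assumption):

    import pickle
    import pandas as pd

    # file names follow f_save_metrics / f_save_rocs above
    metrics = pd.read_csv('run0.metrics.tsv', sep='\t', index_col=0)
    with open('run0.rocs.p', 'rb') as fh:
        rocs = pickle.load(fh)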


@@ -163,6 +196,8 @@ def stats_before_after(self, clone_id=1):
b_a_df.at[iter, "After"] = (a_clones==clone_id).sum()
b_a_df.at[iter,"A/B"] = (b_a_df.at[iter, "After"]/b_a_df.at[iter, "Before"])
self.b_a_df = b_a_df
b_a_df.to_csv(self.f_save_befaft, sep='\t')
self.metrics['b_a_df'] = b_a_df
return
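stats_before_after writes one row per iteration with the clone's Before and After counts and their ratio; a toy illustration of the computation (the counts are assumptions):

    import pandas as pd

    # toy counts for three iterations (values are assumptions)
    b_a_df = pd.DataFrame({'Before': [10, 8, 12], 'After': [30, 4, 24]})
    b_a_df['A/B'] = b_a_df['After'] / b_a_df['Before']  # expansion of the clone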


@@ -207,10 +242,9 @@ def stats_cluster_before_after(self, clone_id=1):
return



def save(self, f_save=None):
if f_save is None:
f_save = self.f_save
f_save = self.f_save_data
f = open(f_save, 'wb')
pickle.dump(self.__dict__, f, 2)
f.close()
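Because save pickles the instance __dict__ rather than the object itself, restoring a run means loading the attribute dict back; a minimal sketch of the reader side (the helper name is an assumption):

    import pickle

    def load_full_simulation(f_save_data):
        # returns the attribute dict written by FullSimulation.save;
        # keys include 'sim', 'metrics', 'b_a_df', 'params', ...
        with open(f_save_data, 'rb') as fh:
            return pickle.load(fh)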
81 changes: 56 additions & 25 deletions src/simulations/parametersweep.py
@@ -60,29 +60,37 @@ def __init__(self, default_params_f, sweep_params_f):
:type str
"""
self.sweep_params_f = sweep_params_f
self.default_params_f = default_params_f
params = read_config_file(default_params_f)
sweep_params = read_config_file(sweep_params_f)
self.sweep_params_f = sweep_params_f
self.default_params_f = default_params_f
self.sweep_params = sweep_params
self.default_params = params

# Create the yaml files in the directory indicated by local_outdir and prefix in sweep_params_f
self.outdir = os.path.join(sweep_params["outdir"], sweep_params["prefix"])
if not os.path.exists(self.outdir):
os.makedirs(self.outdir)

self.f_save = \
os.path.join(self.outdir,self.sweep_params['prefix'] + '.p')
self.tmp_f_save = self.f_save.replace('.p', '') + '_tmp.p'

self.data_outdir = os.path.join(sweep_params['data_outdir'], sweep_params["prefix"])
if not os.path.exists(self.data_outdir):
os.makedirs(self.data_outdir)

# Create a grid.
# Loop through and set each parameter in the params grid.
self.params_df = pd.DataFrame(list(ParameterGrid(sweep_params['grid'])))
params_dict = dict()

for ind, val in self.params_df.iterrows():
f_name = os.path.join(self.outdir, str(ind)+'.yaml' )
# Create the name
params['name'] = str(ind)
params['data_outdir'] = self.data_outdir
params['local_outdir'] = self.sweep_params["outdir"]
params['prefix'] = self.sweep_params["prefix"]
for name, v in val.iteritems():
# Set the specific variables that need to be updated
if name == 'dominant_clone_sizes':
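The grid expansion above relies on sklearn's ParameterGrid, which enumerates every combination of the values in the YAML grid; a minimal sketch with assumed parameter names and values:

    import pandas as pd
    from sklearn.model_selection import ParameterGrid

    # grid values here are assumptions for illustration
    grid = {'het_err_rate': [0.01, 0.1], 'cov_constant': [10, 50]}
    params_df = pd.DataFrame(list(ParameterGrid(grid)))
    # -> 4 rows, one per combination; columns are the parameter names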
@@ -101,33 +109,54 @@ def __init__(self, default_params_f, sweep_params_f):
params_dict[f_name] = params
return


@staticmethod
def run_single_sweep(f, outdir):
print(f'Running file {f}')
params_f = os.path.join(outdir, str(f) + '.yaml')
sim = FullSimulation(params_f)
sim.run()
sim.run_metrics()
sim.save()
return sim.metrics

def run_sweep(self, subset=None):
"""
Loops through self.params_df and runs the simulation on that
parameter.
""" Loops through params_df and runs the simulation on that parameter.
Each run is saved to disk rather than stored in memory; only the
metrics are kept here.
:param subset: Which files to run the simulation on. If a list,
the files to run on; if an int, the number of files to randomly
choose; if None, run on all.
:type subset: int or None or list (default=None)
"""

params_df = self.params_df
if isinstance(subset, int):
params_df = params_df.sample(n=subset)

sweep_results = dict()
for f, val in params_df.iterrows():
params_f = os.path.join(self.outdir, str(f) +'.yaml')
print(f"Running with file: {f}")
sim = FullSimulation(params_f)
sim.run()
sweep_results[f] = sim

# Save temp file for last track.
self.sweep_results = sweep_results
self.save(f_save=self.tmp_f_save)
sweep_results_df = pd.Series(index=params_df.index, data=params_df.index)
###
#sweep_results_df = sweep_results_df.apply(self.run_single_sweep, args=(self.outdir,))
pandarallel.initialize(nb_workers=self.sweep_params['cpus'])
sweep_results_df = sweep_results_df.parallel_apply(self.run_single_sweep, args=(self.outdir,))
###

self.sweep_results = sweep_results_df

# sweep_results = dict()
#
# for f, val in params_df.iterrows():
# params_f = os.path.join(self.outdir, str(f) +'.yaml')
# print(f"Running with file: {f}")
# sim = FullSimulation(params_f)
# sim.run()
# sim.run_metrics()
# sim.save()
#
# # Only store the current metrics, not the entire simulation.
# sweep_results[f] = sim.metrics
# #self.save(f_save=self.tmp_f_save)
# self.sweep_results = sweep_results
return


Expand All @@ -149,15 +178,17 @@ def plot_sensitivity_and_dropout(self):

# Add these results to self.results, which has the meta information too
for ind, val in self.params_df.iterrows():
full_sim = self.sweep_results[ind]
dropout = full_sim.dropout
prec_scores = full_sim.prec_scores
rocs = full_sim.rocs
curr_results = self.sweep_results[ind]
dropout = curr_results['dropout']
prec_scores = curr_results['prec_scores']
rocs = curr_results['rocs']

metrics.at[ind, 'Avg. Precision'] = np.mean(prec_scores)
metrics.at[ind, '% dropout'] = np.mean(dropout)

self.metrics = metrics
"""df that contains simulation performance metric.
"""
# Seaborn Factorplot
g = sns.FacetGrid(data=metrics, col="het_err_rate", hue="cov_constant")
g.map(sns.scatterplot, "dominant_het", "Avg. Precision")
@@ -193,8 +224,8 @@ def plot_before_after_all_clones(self):
"""
df_full = []
for ind, val in self.params_df.iterrows():
full_sim = self.sweep_results[ind]
b_a_df = full_sim.b_a_df
curr_results = self.sweep_results[ind]
b_a_df = curr_results['b_a_df']
# Each element will be a list of the values.
s = self.params_df.loc[ind].copy()
curr = pd.DataFrame([s.copy()] * len(b_a_df))