Commit

Wrapper for sim. Check if saved already and if yes load. Cluster before and after violinplot.

Isaac Shamie committed Oct 26, 2020
1 parent 772b7e7 commit 1b49ccc
Showing 17 changed files with 294 additions and 37 deletions.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
42 changes: 42 additions & 0 deletions parameters/simulations/simple_v01.yaml
@@ -0,0 +1,42 @@
# One set of parameters for simulating cells and their lineages
local_outdir: 'simulation'
prefix: 'sweep_v01'
num_iterations: 1000
num_cells: 10000
num_mt_positions: 10

cpus: 36

initialize:
coverage:
type: 'constant' # {'poisson', 'constant', 'growth'}
cov_constant: 50
poisson_mu_per_cell: 50

clone_sizes: # Distribution of clone sizes. The first entry is the non-clone population. Length should be len(het)+1, and the entries should sum to 1
[0.5, 0.4, 0.1]

mt_clone_map:
# [(10), (16500)]
# [(10), (16500), (16500, 7)]

het: # Heterozygous values
[0.2, 0.2]
het_err_rate: 0.05


sequence_subsample: #10000

growth:
type: binomial
binomial:
rates: [0.1, 0.8, 0.01]
time_steps: 10
#type: poisson
poisson:
rates: [0.05, 1, 0.9]
mutant_af_sigma_noise: 0.05
non_mutants:
to_mutate: False
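
For orientation, this file is plain YAML; below is a minimal sketch of reading it, assuming PyYAML and touching only top-level keys that appear above (the nesting of the remaining keys is not reproduced here):

import yaml  # PyYAML

with open("parameters/simulations/simple_v01.yaml") as fh:
    params = yaml.safe_load(fh)

# A few of the top-level keys defined above.
print(params["num_iterations"])    # 1000
print(params["num_cells"])         # 10000
print(params["num_mt_positions"])  # 10
print(params["cpus"])              # 36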


Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# One set of parameters for simulating cells and their lineages
local_outdir: 'simulation'
outdir: 'simulation'
prefix: 'sweep_v01'
#num_iterations: 100
#num_cells: 10000
@@ -11,12 +11,3 @@ grid:
dominant_het: [0.02, 0.2, 0.5, 0.8]
dominant_growth: [0.1, 0.5, 0.8]
het_err_rate: [0.05, 0.15]

#4*4*4*2*2 = 256

grid2:
cov_constant: [5, 10, 100]
dominant_clone_sizes: [0.02,0.1,0.2, 0.4]
dominant_het: [0.02, 0.2, 0.5]
dominant_growth: [0.1, 0.5, 0.8]
het_err_rate: [0.05, 0.15]
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
# One set of parameters for simulating cells and their lineages
local_outdir: 'simulation'
outdir: 'simulation'
prefix: 'test_sweep'
#num_iterations: 100
#num_cells: 10000
#num_mt_positions: 10

grid:
cov_constant: [5, 100]
dominant_clone_sizes: [0.02,0.2, 0.4]
dominant_clone_sizes: [0.02,0.2]
dominant_het: [0.02, 0.2]
dominant_growth: [0.1, 0.5]
het_err_rate: [0.05, 0.15]
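
Each sweep grid above is expanded into one simulation per parameter combination; parametersweep.py below builds the table with sklearn's ParameterGrid. A minimal sketch using the test_sweep values listed above:

import pandas as pd
from sklearn.model_selection import ParameterGrid

grid = {"cov_constant": [5, 100],
        "dominant_clone_sizes": [0.02, 0.2],
        "dominant_het": [0.02, 0.2],
        "dominant_growth": [0.1, 0.5],
        "het_err_rate": [0.05, 0.15]}

params_df = pd.DataFrame(list(ParameterGrid(grid)))
print(params_df.shape)  # (32, 5): 2*2*2*2*2 parameter combinations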
61 changes: 50 additions & 11 deletions src/simulations/fullsimulation.py
@@ -5,7 +5,7 @@
import pandas as pd
from tqdm import tqdm
#from src.config import ROOT_DIR
from sklearn.metrics import roc_curve, average_precision_score
from sklearn.metrics import roc_curve, average_precision_score, confusion_matrix
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
@@ -14,7 +14,6 @@
from sklearn import metrics
from scipy.spatial.distance import cdist
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=32)

from mplh.color_utils import get_colors
from mplh.fig_utils import legend_from_color
@@ -29,11 +28,7 @@

from .simulation import Simulation

""" Run the simulation similar to in Extended Data Fig 3 from
Massively parallel single-cell mitochondrial DNA genotyping and chromatin profiling"""


# Could each variable be made its own class?
# Does this break running the MCMC? I don't think so, because that format is going to be put in afterward anyway
class FullSimulation:
"""
@@ -55,6 +50,7 @@ def __init__(self, params_f):
self.n_iter = params['num_iterations']
self.num_cells = params['num_cells']
self.params = params
self.f_save = os.path.join(self.params['local_outdir'], self.params['prefix']+'.p')
return
#for i in self.n_iter:

@@ -67,12 +63,14 @@ def run(self):
"""
# Parallelize df
df = pd.Series(index=range(self.n_iter))
df = df.apply(self.run_sim, args=(self.params,))
#df = df.parallel_apply(self.run_sim, args=(self.params,))
#df = df.apply(self.run_sim, args=(self.params,))
pandarallel.initialize(nb_workers=self.params['cpus'])
df = df.parallel_apply(self.run_sim, args=(self.params,))

self.sim = df
#self.cluster_before_after()
self.sim_performance_dominant(group='both')
self.stats_before_after()
return

@staticmethod
@@ -113,12 +111,14 @@ def sim_performance_dominant(self, group='both'):
:type prec_scores: list
:ivar rocs: ROC curves for each iteration based on allele
frequencies.
:return:
"""
dropout = []
rocs = []
prec_scores = []


for iter, s in enumerate(self.sim.values):
# First get the dominant clone, which is indexed as 1
mt_pos = s.clone_mt_dict[1]
@@ -154,6 +154,18 @@ def reduce_cells(self, cell_af):
return


def stats_before_after(self, clone_id=1):
b_a_df = pd.DataFrame(index=np.arange(0,len(self.sim)), columns=["Before", "After", "A/B"], dtype=str)
for iter, s in enumerate(self.sim.values):
b_clones = s.clone_cell
a_clones = s.subsample_new_clone_cell
b_a_df.at[iter, "Before"] = (b_clones == clone_id).sum()
b_a_df.at[iter, "After"] = (a_clones==clone_id).sum()
b_a_df.at[iter,"A/B"] = (b_a_df.at[iter, "After"]/b_a_df.at[iter, "Before"])
self.b_a_df = b_a_df
return


def cluster_before_after(self):
"""
Loops through the simulations and for each,
@@ -172,16 +184,43 @@ def cluster_before_after(self):
print(len(cluster_results[-1]))
self.cluster_results = cluster_results


def stats_cluster_before_after(self, clone_id=1):
"""
Confusion matrix for clustering the proper clone cells together.
:param clone_id: Which clone to get metrics for
:return:
"""


b_a_df = pd.DataFrame(index=np.arange(0, len(self.sim)),
columns=["TN", "FP", "FN", "TP"], dtype=int)
for ind, s in enumerate(self.sim.values):
y_true = s.combined_clones
y_true[y_true!=1] = 0
y_pred = self.cluster_results[ind]

# y_true, y_pred
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
b_a_df.loc[ind] = [tn, fp, fn, tp]
self.b_a_df = b_a_df
return



def save(self, f_save=None):
if f_save is None:
f_save = os.path.join(self.params['local_outdir'], self.params['prefix']+'.p')
f_save = self.f_save
f = open(f_save, 'wb')
pickle.dump(self.__dict__, f, 2)
f.close()

def load(self, filename):

def load(self, f_save=None):
#filename = self.params['filename']
f = open(filename, 'rb')
if f_save is None:
f_save = self.f_save
f = open(f_save, 'rb')
tmp_dict = pickle.load(f)
f.close()
self.__dict__.update(tmp_dict)
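
The save()/load() pair above pickles and restores the instance __dict__, with f_save defaulting to <local_outdir>/<prefix>.p. The "check if saved already and if yes load" wrapper mentioned in the commit message could look roughly like the sketch below; run_or_load is a hypothetical helper, not part of this commit, and the import path simply mirrors the repo layout:

import os
from src.simulations.fullsimulation import FullSimulation

def run_or_load(params_f):
    """Run a FullSimulation unless a pickled result already exists."""
    sim = FullSimulation(params_f)
    if os.path.exists(sim.f_save):  # already saved -> just reload it
        sim.load()
    else:
        sim.run()
        sim.save()
    return sim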
83 changes: 70 additions & 13 deletions src/simulations/parametersweep.py
@@ -6,6 +6,7 @@
import seaborn as sns
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=32)
import matplotlib.pyplot as plt

from mplh.color_utils import get_colors
from mplh.fig_utils import legend_from_color
@@ -67,10 +68,14 @@ def __init__(self, default_params_f, sweep_params_f):
self.default_params = params

# Create the yaml files in the directory indicated by outdir and prefix in sweep_params_f
self.outdir = os.path.join(sweep_params["local_outdir"], sweep_params["prefix"])
self.outdir = os.path.join(sweep_params["outdir"], sweep_params["prefix"])
if not os.path.exists(self.outdir):
os.makedirs(self.outdir)

self.f_save = \
os.path.join(self.outdir,self.sweep_params['prefix'] + '.p')
self.tmp_f_save = self.f_save.replace('.p', '') + '_tmp.p'

# Create a grid.
# Loop through and set each parameter in the params grid.
self.params_df = pd.DataFrame(list(ParameterGrid(sweep_params['grid'])))
@@ -119,7 +124,10 @@ def run_sweep(self, subset=None):
sim = FullSimulation(params_f)
sim.run()
sweep_results[f] = sim
self.sweep_results = sweep_results

# Save a temporary file to keep track of the latest results.
self.sweep_results = sweep_results
self.save(f_save=self.tmp_f_save)
return


@@ -162,46 +170,95 @@ def plot_sensitivity_and_dropout(self):
g.savefig(os.path.join(self.outdir, 'dropout.png'))
return None


def plot_ppv(self):
return


def cluster_before_after(self):
for f in self.sweep_results:
self.sweep_results[f].cluster()
self.sweep_results[f].cluster_before_after()
return


def plot_before_after_all_clones(self):
"""
Plot the growth of each clone.
Plot the growth of each clone. Violin plots with the x-axis
the dominant clone size, the y-axis the after/before ratio,
and the hue the dominant clone growth rate.
# Expand params_df to include each iteration, which will
# allow for violinplots. This will lead to
params_df.shape[0]*num_iterations rows.
:return:
"""
return
df_full = []
for ind, val in self.params_df.iterrows():
full_sim = self.sweep_results[ind]
b_a_df = full_sim.b_a_df
# Each element will be a list of the values.
s = self.params_df.loc[ind].copy()
curr = pd.DataFrame([s.copy()] * len(b_a_df))
curr["A/B"] = b_a_df["A/B"].values
df_full.append(curr)

df_full = pd.concat(df_full)
df_full = df_full.astype(
{'dominant_growth': str, 'dominant_clone_sizes': str,
'A/B': float})

sns.violinplot(data=df_full, y="A/B", hue="dominant_growth",
x="dominant_clone_sizes")
plt.savefig(os.path.join(self.outdir, 'growth_before_after.png'))
#g.savefig(os.path.join(self.outdir, 'precision.png'))
return None

def plot_before_after_true_growth(self):
"""
Plot the growth of cells in each clone before and after
Plot the growth of cells in each clone before and after.
Dotplot/heatmap, where each row is a het value, each column
is growth rate, and the value is dominant clone cells
growth (after/before). Can make multiple where each column is
coverage, each row is error rate.
:return:
"""
return

def plot_before_after_cluster_growth(self):
"""
Plot the growth of the predicted dominant clone based on clustering
results and picking the top clone.
Plot the growth of cells in each clone before and after.
Dotplot/heatmap, where each row is a het value, each column
is growth rate, and the value is dominant clone cell cluster
growth (after/before). Can make multiple where each column is
coverage, each row is error rate.
:return:
"""
return

def save(self):
f_save = os.path.join(self.outdir, self.sweep_params['prefix']+'.p')

def plot_error_cluster_growth(self):
"""
Plot the deviation of the growth rates for each pipeline run.
If A = true growth rate and B = predicted growth rate, then C = A - B.
:return:
"""

def delete_tmp(self):
if os.path.exists(self.tmp_f_save):
os.remove(self.tmp_f_save)

def save(self, f_save=None):
if f_save is None:
f_save = self.f_save
f = open(f_save, 'wb')
pickle.dump(self.__dict__, f, 2)
f.close()

def load(self, filename):
def load(self, f_save=None):
if f_save is None:
f_save = self.f_save
#filename = self.params['filename']
f = open(filename, 'rb')
f = open(f_save, 'rb')
tmp_dict = pickle.load(f)
f.close()
self.__dict__.update(tmp_dict)
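
As a usage note, a driver for ParameterSweep might look like the sketch below; the yaml paths are placeholders and the import path simply mirrors the repo layout:

from src.simulations.parametersweep import ParameterSweep

default_f = "parameters/simulations/simple_v01.yaml"  # default parameters (file added above)
sweep_f = "parameters/simulations/test_sweep.yaml"    # sweep grid (placeholder name)

sweep = ParameterSweep(default_f, sweep_f)
sweep.run_sweep()                     # a temporary pickle (<prefix>_tmp.p) is written along the way
sweep.plot_sensitivity_and_dropout()
sweep.plot_before_after_all_clones()  # violin plot of the after/before clone-size ratios
sweep.save()                          # final pickle at <outdir>/<prefix>.p
sweep.delete_tmp()                    # remove the temporary pickle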
7 changes: 6 additions & 1 deletion src/simulations/simulation.py
@@ -330,7 +330,7 @@ def combine_init_growth(self):
assert (combined_cell_af.shape[0] == self.cell_af.shape[0] +
self.subsample_new_cell_af.shape[0])
assert(combined_meta.shape[0] == clones.shape[0])
assert (combined_cell_af.shape[0] == clones.shape[0])
assert(combined_cell_af.shape[0] == clones.shape[0])
self.combined_meta = combined_meta
self.combined_clones = clones
self.combined_cell_af = combined_cell_af
@@ -361,7 +361,11 @@ def compare_before_after(self):
Creates a df that contains information on
the number of cells from each clone before as well as after.
:return:
df.at[ind, "Dominant Before"] = (full_sim.clone_cell == 1).sum()
df.at[ind, "Dominant After"] = (full_sim.subsample_new_clone_cell == 1).sum()
"""

return

def cluster_compare_before_after(self):
@@ -413,6 +417,7 @@ def cluster_kmeans(cell_af):




def main():
return

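
For context on the clustering step, cluster_kmeans and stats_cluster_before_after shown earlier amount to clustering the cell-by-MT-position allele-frequency matrix and scoring the labels with a confusion matrix. A minimal sketch on toy data (the KMeans settings here are illustrative, not necessarily those used in simulation.py):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix

rng = np.random.default_rng(0)
cell_af = rng.random((100, 10))             # 100 cells x 10 MT positions (toy data)
y_true = (np.arange(100) < 40).astype(int)  # 1 = dominant clone, 0 = all other cells

y_pred = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(cell_af)
# Cluster labels are arbitrary, so cluster 1 may need to be matched to the
# dominant clone (e.g. by overlap) before reading off the counts.
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print(tn, fp, fn, tp)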