diff --git "a/parameters/simulation/Icon\r" "b/parameters/simulations/Icon\r" similarity index 100% rename from "parameters/simulation/Icon\r" rename to "parameters/simulations/Icon\r" diff --git a/parameters/simulation/clones.yaml b/parameters/simulations/clones.yaml similarity index 100% rename from parameters/simulation/clones.yaml rename to parameters/simulations/clones.yaml diff --git a/parameters/simulation/mt_rates.yaml b/parameters/simulations/mt_rates.yaml similarity index 100% rename from parameters/simulation/mt_rates.yaml rename to parameters/simulations/mt_rates.yaml diff --git a/parameters/simulation/simple.yaml b/parameters/simulations/simple.yaml similarity index 100% rename from parameters/simulation/simple.yaml rename to parameters/simulations/simple.yaml diff --git a/parameters/simulations/simple_v01.yaml b/parameters/simulations/simple_v01.yaml new file mode 100644 index 00000000..85bf6f30 --- /dev/null +++ b/parameters/simulations/simple_v01.yaml @@ -0,0 +1,42 @@ +# One set of parameters for simulating cells and their lineages +local_outdir: 'simulation' +prefix: 'sweep_v01' +num_iterations: 1000 +num_cells: 10000 +num_mt_positions: 10 + +cpus: 36 + +initialize: + coverage: + type: 'constant' # {'poisson', 'constant', 'growth'} + cov_constant: 50 + poisson_mu_per_cell: 50 + + clone_sizes: #distribution of clone sizes. The first one is the non-clone population. 
Length should be len(het)+1, and it should sum to 1 + [0.5, 0.4, 0.1] + + mt_clone_map: + # [(10), (16500)] + # [(10), (16500), (16500, 7)] + +het: # Heterozygous values + [0.2, 0.2] +het_err_rate: 0.05 + + +sequence_subsample: #10000 + +growth: + type: binomial + binomial: + rates: [0.1, 0.8, 0.01] + time_steps: 10 + #type: poisson + poisson: + rates: [0.05, 1, 0.9] + mutant_af_sigma_noise: 0.05 + non_mutants: + to_mutate: False + + diff --git a/parameters/simulation/sweep_v01.yaml b/parameters/simulations/sweep_v01.yaml similarity index 59% rename from parameters/simulation/sweep_v01.yaml rename to parameters/simulations/sweep_v01.yaml index a40aa4ba..940e5023 100644 --- a/parameters/simulation/sweep_v01.yaml +++ b/parameters/simulations/sweep_v01.yaml @@ -1,5 +1,5 @@ # One set of parameters for simulating cells and their lineages -local_outdir: 'simulation' +outdir: 'simulation' prefix: 'sweep_v01' #num_iterations: 100 #num_cells: 10000 @@ -11,12 +11,3 @@ grid: dominant_het: [0.02, 0.2, 0.5, 0.8] dominant_growth: [0.1, 0.5, 0.8] het_err_rate: [0.05, 0.15] - -#4*4*4*2*2 = 256 - -grid2: - cov_constant: [5, 10, 100] - dominant_clone_sizes: [0.02,0.1,0.2, 0.4] - dominant_het: [0.02, 0.2, 0.5] - dominant_growth: [0.1, 0.5, 0.8] - het_err_rate: [0.05, 0.15] diff --git a/parameters/simulation/test_sweep.yaml b/parameters/simulations/test_sweep.yaml similarity index 79% rename from parameters/simulation/test_sweep.yaml rename to parameters/simulations/test_sweep.yaml index b292fe8c..3c968b63 100644 --- a/parameters/simulation/test_sweep.yaml +++ b/parameters/simulations/test_sweep.yaml @@ -1,5 +1,5 @@ # One set of parameters for simulating cells and their lineages -local_outdir: 'simulation' +outdir: 'simulation' prefix: 'test_sweep' #num_iterations: 100 #num_cells: 10000 @@ -7,7 +7,7 @@ prefix: 'test_sweep' grid: cov_constant: [5, 100] - dominant_clone_sizes: [0.02,0.2, 0.4] + dominant_clone_sizes: [0.02,0.2] dominant_het: [0.02, 0.2] dominant_growth: [0.1, 0.5] 
het_err_rate: [0.05, 0.15] \ No newline at end of file diff --git a/src/simulations/fullsimulation.py b/src/simulations/fullsimulation.py index f59a127e..144afaea 100644 --- a/src/simulations/fullsimulation.py +++ b/src/simulations/fullsimulation.py @@ -5,7 +5,7 @@ import pandas as pd from tqdm import tqdm #from src.config import ROOT_DIR -from sklearn.metrics import roc_curve, average_precision_score +from sklearn.metrics import roc_curve, average_precision_score, confusion_matrix import matplotlib.pyplot as plt import pickle import seaborn as sns @@ -14,7 +14,6 @@ from sklearn import metrics from scipy.spatial.distance import cdist from pandarallel import pandarallel -pandarallel.initialize(nb_workers=32) from mplh.color_utils import get_colors from mplh.fig_utils import legend_from_color @@ -29,11 +28,7 @@ from .simulation import Simulation -""" Run the simulation similar to in Extended Data Fig 3 from - Massively parallel single-cell mitochondrial DNA genotyping and chromatin profiling""" - -# I can make each variable a class? # Does this ruin running the MCMC? 
I don't think so, b/c that format is going to be put in after anyway class FullSimulation: """ @@ -55,6 +50,7 @@ def __init__(self, params_f): self.n_iter = params['num_iterations'] self.num_cells = params['num_cells'] self.params = params + self.f_save = os.path.join(self.params['local_outdir'], self.params['prefix']+'.p') return #for i in self.n_iter: @@ -67,12 +63,14 @@ def run(self): """ # Parallelize df df = pd.Series(index=range(self.n_iter)) - df = df.apply(self.run_sim, args=(self.params,)) - #df = df.parallel_apply(self.run_sim, args=(self.params,)) + #df = df.apply(self.run_sim, args=(self.params,)) + pandarallel.initialize(nb_workers=self.params['cpus']) + df = df.parallel_apply(self.run_sim, args=(self.params,)) self.sim = df #self.cluster_before_after() self.sim_performance_dominant(group='both') + self.stats_before_after() return @staticmethod @@ -113,12 +111,14 @@ def sim_performance_dominant(self, group='both'): :type prec_scores: list :ivar rocs: ROC curves for each iteration based on allele frequencies. 
+ :return: """ dropout = [] rocs = [] prec_scores = [] + for iter, s in enumerate(self.sim.values): # First get the dominant clone , which is indexed as 1 mt_pos = s.clone_mt_dict[1] @@ -154,6 +154,18 @@ def reduce_cells(self, cell_af): return + def stats_before_after(self, clone_id=1): + b_a_df = pd.DataFrame(index=np.arange(0,len(self.sim)), columns=["Before", "After", "A/B"], dtype=str) + for iter, s in enumerate(self.sim.values): + b_clones = s.clone_cell + a_clones = s.subsample_new_clone_cell + b_a_df.at[iter, "Before"] = (b_clones == clone_id).sum() + b_a_df.at[iter, "After"] = (a_clones==clone_id).sum() + b_a_df.at[iter,"A/B"] = (b_a_df.at[iter, "After"]/b_a_df.at[iter, "Before"]) + self.b_a_df = b_a_df + return + + def cluster_before_after(self): """ Loops through the simulations and for each, @@ -172,16 +184,43 @@ def cluster_before_after(self): print(len(cluster_results[-1])) self.cluster_results = cluster_results + + def stats_cluster_before_after(self, clone_id=1): + """ + Confusion matrix for clustering the proper clone cells together. 
+ :param clone_id: Which clone to get metrics for + :return: + """ + + + b_a_df = pd.DataFrame(index=len(self.sim), + columns=["TN", "FP", "FN", "TP"], dtype=int) + for ind, s in enumerate(self.sim.values): + y_true = s.combined_clones + y_true[y_true!=1] = 0 + y_pred = self.cluster_results[ind] + + # y_true, y_pred + tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel() + b_a_df.loc[ind] = [tn, fp, fn, tp] + self.b_a_df = b_a_df + return + + + def save(self, f_save=None): if f_save is None: - f_save = os.path.join(self.params['local_outdir'], self.params['prefix']+'.p') + f_save = self.f_save f = open(f_save, 'wb') pickle.dump(self.__dict__, f, 2) f.close() - def load(self, filename): + + def load(self, f_save=None): #filename = self.params['filename'] - f = open(filename, 'rb') + if f_save is None: + f_save = self.f_save + f = open(f_save, 'rb') tmp_dict = pickle.load(f) f.close() self.__dict__.update(tmp_dict) diff --git a/src/simulations/parametersweep.py b/src/simulations/parametersweep.py index 0aba9b1b..a9e13603 100644 --- a/src/simulations/parametersweep.py +++ b/src/simulations/parametersweep.py @@ -6,6 +6,7 @@ import seaborn as sns from pandarallel import pandarallel pandarallel.initialize(nb_workers=32) +import matplotlib.pyplot as plt from mplh.color_utils import get_colors from mplh.fig_utils import legend_from_color @@ -67,10 +68,14 @@ def __init__(self, default_params_f, sweep_params_f): self.default_params = params # Create the yaml files in the directory indicated by local_outdir and prefix in sweep_params_f - self.outdir = os.path.join(sweep_params["local_outdir"], sweep_params["prefix"]) + self.outdir = os.path.join(sweep_params["outdir"], sweep_params["prefix"]) if not os.path.exists(self.outdir): os.makedirs(self.outdir) + self.f_save = \ + os.path.join(self.outdir,self.sweep_params['prefix'] + '.p') + self.tmp_f_save = self.f_save.replace('.p', '') + '_tmp.p' + # Create a grid. 
# Loop through and set each parameter in the the params grid parameter. self.params_df = pd.DataFrame(list(ParameterGrid(sweep_params['grid']))) @@ -119,7 +124,10 @@ def run_sweep(self, subset=None): sim = FullSimulation(params_f) sim.run() sweep_results[f] = sim - self.sweep_results = sweep_results + + # Save temp file for last track. + self.sweep_results = sweep_results + self.save(f_save=self.tmp_f_save) return @@ -162,46 +170,95 @@ def plot_sensitivity_and_dropout(self): g.savefig(os.path.join(self.outdir, 'dropout.png')) return None - def plot_ppv(self): return + def cluster_before_after(self): for f in self.sweep_results: - self.sweep_results[f].cluster() + self.sweep_results[f].cluster_before_after() return + def plot_before_after_all_clones(self): """ - Plot the growth of each clone. + Plot the growth of each clone. Violin plots with x-axis being + the dominant clone size, the y-axis the + after/before ratio, the color the dominant clone growth rate. + + # Expand params_df to include each iteration, which will + # allow for violinplots. This will lead to + params_df.shape[0]*num_iterations rows. :return: """ - return + df_full = [] + for ind, val in self.params_df.iterrows(): + full_sim = self.sweep_results[ind] + b_a_df = full_sim.b_a_df + # Each element will be a list of the values. + s = self.params_df.loc[ind].copy() + curr = pd.DataFrame([s.copy()] * len(b_a_df)) + curr["A/B"] = b_a_df["A/B"].values + df_full.append(curr) + + df_full = pd.concat(df_full) + df_full = df_full.astype( + {'dominant_growth': str, 'dominant_clone_sizes': str, + 'A/B': float}) + + sns.violinplot(data=df_full, y="A/B", hue="dominant_growth", + x="dominant_clone_sizes") + plt.savefig(os.path.join(self.outdir, 'growth_before_after.png')) + #g.savefig(os.path.join(self.outdir, 'precision.png')) + return None def plot_before_after_true_growth(self): """ - Plot the growth of cells in each clone before and after + Plot the growth of cells in each clone before and after. 
+ Dotplot/heatmap, where each row is a het value, each column + is growth rate, and the value is dominant clone cells + growth (after/before). Can make multiple where each column is + coverage, each row is error rate. + :return: """ return def plot_before_after_cluster_growth(self): """ - Plot the growth of the predicted dominant clone based on clustering - results and picking the top clone. + Plot the growth of cells in each clone before and after. + Dotplot/heatmap, where each row is a het value, each column + is growth rate, and the value is dominant clone cell cluster + growth (after/before). Can make multiple where each column is + coverage, each row is error rate. :return: """ return - def save(self): - f_save = os.path.join(self.outdir, self.sweep_params['prefix']+'.p') + + def plot_error_cluster_growth(self): + """ + Plot the deviation of the growth rates for each pipeline run. + If A=true growth rate, B=predicted, (C = A-B) + :return: + """ + + def delete_tmp(self): + if os.path.exists(self.tmp_f_save): + os.remove(self.tmp_f_save) + + def save(self, f_save=None): + if f_save is None: + f_save = self.f_save f = open(f_save, 'wb') pickle.dump(self.__dict__, f, 2) f.close() - def load(self, filename): + def load(self, f_save=None): + if f_save is None: + f_save = self.f_save #filename = self.params['filename'] - f = open(filename, 'rb') + f = open(f_save, 'rb') tmp_dict = pickle.load(f) f.close() self.__dict__.update(tmp_dict) \ No newline at end of file diff --git a/src/simulations/simulation.py b/src/simulations/simulation.py index 1339d110..ac84e914 100644 --- a/src/simulations/simulation.py +++ b/src/simulations/simulation.py @@ -330,7 +330,7 @@ def combine_init_growth(self): assert (combined_cell_af.shape[0] == self.cell_af.shape[0] + self.subsample_new_cell_af.shape[0]) assert(combined_meta.shape[0] == clones.shape[0]) - assert (combined_cell_af.shape[0] == clones.shape[0]) + assert(combined_cell_af.shape[0] == clones.shape[0]) self.combined_meta = 
combined_meta self.combined_clones = clones self.combined_cell_af = combined_cell_af @@ -361,7 +361,11 @@ def compare_before_after(self): Creates a df that contains information on the number of cells from each clone before as well as after. :return: + df.at[ind, "Dominant Before"] = (full_sim.clone_cell == 1).sum() + df.at[ind, "Dominant After"] = (full_sim.subsample_new_clone_cell == 1).sum() + """ + return def cluster_compare_before_after(self): @@ -413,6 +417,7 @@ def cluster_kmeans(cell_af): + def main(): return diff --git a/src/simulations/wrapper_v01.py b/src/simulations/wrapper_v01.py new file mode 100644 index 00000000..02e20e92 --- /dev/null +++ b/src/simulations/wrapper_v01.py @@ -0,0 +1,30 @@ +""" +Running the first version of the lineage tracing simulation. +Files are simple_v01.yaml and sweep_v01.yaml. +1000 iterations, 10000 num_cells +""" + +from src.simulations import Simulation, ParameterSweep, FullSimulation +from src.simulations.utils.config import read_config_file +from src.config import RESULTS, ROOT_DIR +import time +from os.path import join +import os + +os.chdir(RESULTS) + +default_params_f = join(ROOT_DIR, 'parameters/simulations/simple_v01.yaml') +sweep_params_f = join(ROOT_DIR, 'parameters/simulations/sweep_v01.yaml') +sweep = ParameterSweep(default_params_f, sweep_params_f) + +if hasattr(sweep, 'f_save'): + if os.path.exists(sweep.f_save): + print('Already ran. 
Loading') + sweep.load() + else: + sweep.run_sweep() # Runs the simulation + sweep.save() +else: + print('no attr') + sweep.run_sweep() # Runs the simulation + sweep.save() diff --git "a/src/test/Icon\r" "b/src/tests/Icon\r" similarity index 100% rename from "src/test/Icon\r" rename to "src/tests/Icon\r" diff --git a/src/test/__init__.py b/src/tests/__init__.py similarity index 100% rename from src/test/__init__.py rename to src/tests/__init__.py diff --git a/src/tests/simulations/__init__.py b/src/tests/simulations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/tests/simulations/test_fullsimulation.py b/src/tests/simulations/test_fullsimulation.py new file mode 100644 index 00000000..5ab56e61 --- /dev/null +++ b/src/tests/simulations/test_fullsimulation.py @@ -0,0 +1,25 @@ +from src.simulations import Simulation, ParameterSweep, FullSimulation +from src.simulations.utils.config import read_config_file +from src.config import RESULTS, ROOT_DIR +import time +from os.path import join +import os +import unittest + + +os.chdir(RESULTS) +params = os.path.join(ROOT_DIR, 'parameters/simulations/simple.yaml') +#params = read_config_file(params) + +class TestSum(unittest.TestCase): + def test_iter(self): + t = time.time() + sim = FullSimulation(params) + sim.run() + print("time", time.time()-t) + sim.save() + return sim + +if __name__ == '__main__': + unittest.main() + diff --git a/src/tests/simulations/test_parametersweep.py b/src/tests/simulations/test_parametersweep.py new file mode 100644 index 00000000..c4fda990 --- /dev/null +++ b/src/tests/simulations/test_parametersweep.py @@ -0,0 +1,43 @@ +from src.simulations import Simulation, ParameterSweep, FullSimulation +from src.simulations.utils.config import read_config_file +from src.config import RESULTS, ROOT_DIR +import time +from os.path import join +import os +import unittest + + +os.chdir(RESULTS) +params = os.path.join(ROOT_DIR, 'parameters/simulations/simple.yaml') +#params = 
read_config_file(params) + + +class TestSum(unittest.TestCase): + def test_sweepparams(self): + default_params_f = join(ROOT_DIR, + 'parameters/simulations/simple.yaml') + sweep_params_f = join(ROOT_DIR, + 'parameters/simulations/test_sweep.yaml') + sweep = ParameterSweep(default_params_f, sweep_params_f) + if hasattr(sweep, 'f_save'): + if os.path.exists(sweep.f_save): + print('Already ran. Loading') + sweep.load() + else: + sweep.run_sweep() # Runs the simulation + sweep.save() + else: + print('no attr') + sweep.run_sweep() # Runs the simulation + sweep.save() + x = sweep.plot_before_after_all_clones() + self.assertTrue(x is None) + + x = sweep.plot_sensitivity_and_dropout() # plots results + self.assertTrue(x is None) + + +if __name__ == '__main__': + unittest.main() + + diff --git a/src/tests/simulations/test_simulation.py b/src/tests/simulations/test_simulation.py new file mode 100644 index 00000000..2f7789e2 --- /dev/null +++ b/src/tests/simulations/test_simulation.py @@ -0,0 +1,25 @@ +from src.simulations import Simulation +from src.simulations.utils.config import read_config_file +from src.config import RESULTS, ROOT_DIR +import time +from os.path import join +import os +import unittest + + +os.chdir(RESULTS) +params = os.path.join(ROOT_DIR, 'parameters/simulations/simple.yaml') +#params = read_config_file(params) + +class TestSum(unittest.TestCase): + def test_oneIter(self): + s = Simulation(params) + s.initialize() + s.grow() + s.subsample_new(to_delete=False) + s.save(f_save='simulation/simple_oneIter.p') + return + +if __name__ == '__main__': + unittest.main() +