Commit

Wrapper for sim. Check if saved already and if yes load. Cluster before and after violinplot.

Isaac Shamie committed Oct 26, 2020
1 parent 772b7e7 commit 1b49ccc
Showing 17 changed files with 294 additions and 37 deletions.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
42 changes: 42 additions & 0 deletions parameters/simulations/simple_v01.yaml
@@ -0,0 +1,42 @@
# One set of parameters for simulating cells and their lineages
local_outdir: 'simulation'
prefix: 'sweep_v01'
num_iterations: 1000
num_cells: 10000
num_mt_positions: 10

cpus: 36

initialize:
coverage:
type: 'constant' # {'poisson', 'constant', 'growth'}
cov_constant: 50
poisson_mu_per_cell: 50

clone_sizes: # Distribution of clone sizes. The first entry is the non-clone population. Length should be len(het)+1, and the entries should sum to 1
[0.5, 0.4, 0.1]

mt_clone_map:
# [(10), (16500)]
# [(10), (16500), (16500, 7)]

het: # Heterozygous values
[0.2, 0.2]
het_err_rate: 0.05


sequence_subsample: #10000

growth:
type: binomial
binomial:
rates: [0.1, 0.8, 0.01]
time_steps: 10
#type: poisson
poisson:
rates: [0.05, 1, 0.9]
mutant_af_sigma_noise: 0.05
non_mutants:
to_mutate: False
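
For orientation, this file is plain YAML; below is a minimal sketch of reading it, assuming PyYAML and touching only top-level keys that appear above (the nesting of the remaining keys is not reproduced here):

import yaml  # PyYAML

with open("parameters/simulations/simple_v01.yaml") as fh:
    params = yaml.safe_load(fh)

# A few of the top-level keys defined above.
print(params["num_iterations"])    # 1000
print(params["num_cells"])         # 10000
print(params["num_mt_positions"])  # 10
print(params["cpus"])              # 36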


Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# One set of parameters for simulating cells and their lineages
local_outdir: 'simulation'
outdir: 'simulation'
prefix: 'sweep_v01'
#num_iterations: 100
#num_cells: 10000
@@ -11,12 +11,3 @@ grid:
dominant_het: [0.02, 0.2, 0.5, 0.8]
dominant_growth: [0.1, 0.5, 0.8]
het_err_rate: [0.05, 0.15]

#4*4*4*2*2 = 256

grid2:
cov_constant: [5, 10, 100]
dominant_clone_sizes: [0.02,0.1,0.2, 0.4]
dominant_het: [0.02, 0.2, 0.5]
dominant_growth: [0.1, 0.5, 0.8]
het_err_rate: [0.05, 0.15]
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
# One set of parameters for simulating cells and their lineages
local_outdir: 'simulation'
outdir: 'simulation'
prefix: 'test_sweep'
#num_iterations: 100
#num_cells: 10000
#num_mt_positions: 10

grid:
cov_constant: [5, 100]
dominant_clone_sizes: [0.02,0.2, 0.4]
dominant_clone_sizes: [0.02,0.2]
dominant_het: [0.02, 0.2]
dominant_growth: [0.1, 0.5]
het_err_rate: [0.05, 0.15]
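
Each sweep grid above is expanded into one simulation per parameter combination; parametersweep.py below builds the table with sklearn's ParameterGrid. A minimal sketch using the test_sweep values listed above:

import pandas as pd
from sklearn.model_selection import ParameterGrid

grid = {"cov_constant": [5, 100],
        "dominant_clone_sizes": [0.02, 0.2],
        "dominant_het": [0.02, 0.2],
        "dominant_growth": [0.1, 0.5],
        "het_err_rate": [0.05, 0.15]}

params_df = pd.DataFrame(list(ParameterGrid(grid)))
print(params_df.shape)  # (32, 5): 2*2*2*2*2 parameter combinations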
61 changes: 50 additions & 11 deletions src/simulations/fullsimulation.py
@@ -5,7 +5,7 @@
import pandas as pd
from tqdm import tqdm
#from src.config import ROOT_DIR
from sklearn.metrics import roc_curve, average_precision_score
from sklearn.metrics import roc_curve, average_precision_score, confusion_matrix
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
@@ -14,7 +14,6 @@
from sklearn import metrics
from scipy.spatial.distance import cdist
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=32)

from mplh.color_utils import get_colors
from mplh.fig_utils import legend_from_color
@@ -29,11 +28,7 @@

from .simulation import Simulation

""" Run the simulation similar to in Extended Data Fig 3 from
Massively parallel single-cell mitochondrial DNA genotyping and chromatin profiling"""


# Could each variable be made its own class?
# Does this break running the MCMC? I don't think so, because that format is going to be put in afterward anyway
class FullSimulation:
"""
@@ -55,6 +50,7 @@ def __init__(self, params_f):
self.n_iter = params['num_iterations']
self.num_cells = params['num_cells']
self.params = params
self.f_save = os.path.join(self.params['local_outdir'], self.params['prefix']+'.p')
return
#for i in self.n_iter:

@@ -67,12 +63,14 @@ def run(self):
"""
# Parallelize df
df = pd.Series(index=range(self.n_iter))
df = df.apply(self.run_sim, args=(self.params,))
#df = df.parallel_apply(self.run_sim, args=(self.params,))
#df = df.apply(self.run_sim, args=(self.params,))
pandarallel.initialize(nb_workers=self.params['cpus'])
df = df.parallel_apply(self.run_sim, args=(self.params,))

self.sim = df
#self.cluster_before_after()
self.sim_performance_dominant(group='both')
self.stats_before_after()
return

@staticmethod
@@ -113,12 +111,14 @@ def sim_performance_dominant(self, group='both'):
:type prec_scores: list
:ivar rocs: ROC curves for each iteration based on allele
frequencies.
:return:
"""
dropout = []
rocs = []
prec_scores = []


for iter, s in enumerate(self.sim.values):
# First get the dominant clone, which is indexed as 1
mt_pos = s.clone_mt_dict[1]
@@ -154,6 +154,18 @@ def reduce_cells(self, cell_af):
return


def stats_before_after(self, clone_id=1):
b_a_df = pd.DataFrame(index=np.arange(0,len(self.sim)), columns=["Before", "After", "A/B"], dtype=str)
for iter, s in enumerate(self.sim.values):
b_clones = s.clone_cell
a_clones = s.subsample_new_clone_cell
b_a_df.at[iter, "Before"] = (b_clones == clone_id).sum()
b_a_df.at[iter, "After"] = (a_clones==clone_id).sum()
b_a_df.at[iter,"A/B"] = (b_a_df.at[iter, "After"]/b_a_df.at[iter, "Before"])
self.b_a_df = b_a_df
return


def cluster_before_after(self):
"""
Loops through the simulations and for each,
@@ -172,16 +184,43 @@ def cluster_before_after(self):
print(len(cluster_results[-1]))
self.cluster_results = cluster_results


def stats_cluster_before_after(self, clone_id=1):
"""
Confusion matrix for clustering the proper clone cells together.
:param clone_id: Which clone to get metrics for
:return:
"""


b_a_df = pd.DataFrame(index=np.arange(0, len(self.sim)),
columns=["TN", "FP", "FN", "TP"], dtype=int)
for ind, s in enumerate(self.sim.values):
y_true = s.combined_clones
y_true[y_true!=1] = 0
y_pred = self.cluster_results[ind]

# y_true, y_pred
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
b_a_df.loc[ind] = [tn, fp, fn, tp]
self.b_a_df = b_a_df
return



def save(self, f_save=None):
if f_save is None:
f_save = os.path.join(self.params['local_outdir'], self.params['prefix']+'.p')
f_save = self.f_save
f = open(f_save, 'wb')
pickle.dump(self.__dict__, f, 2)
f.close()

def load(self, filename):

def load(self, f_save=None):
#filename = self.params['filename']
f = open(filename, 'rb')
if f_save is None:
f_save = self.f_save
f = open(f_save, 'rb')
tmp_dict = pickle.load(f)
f.close()
self.__dict__.update(tmp_dict)
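
The save()/load() pair above pickles and restores the instance __dict__, with f_save defaulting to <local_outdir>/<prefix>.p. The "check if saved already and if yes load" wrapper mentioned in the commit message could look roughly like the sketch below; run_or_load is a hypothetical helper, not part of this commit, and the import path simply mirrors the repo layout:

import os
from src.simulations.fullsimulation import FullSimulation

def run_or_load(params_f):
    """Run a FullSimulation unless a pickled result already exists."""
    sim = FullSimulation(params_f)
    if os.path.exists(sim.f_save):  # already saved -> just reload it
        sim.load()
    else:
        sim.run()
        sim.save()
    return sim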
83 changes: 70 additions & 13 deletions src/simulations/parametersweep.py
@@ -6,6 +6,7 @@
import seaborn as sns
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=32)
import matplotlib.pyplot as plt

from mplh.color_utils import get_colors
from mplh.fig_utils import legend_from_color
@@ -67,10 +68,14 @@ def __init__(self, default_params_f, sweep_params_f):
self.default_params = params

# Create the yaml files in the directory indicated by outdir and prefix in sweep_params_f
self.outdir = os.path.join(sweep_params["local_outdir"], sweep_params["prefix"])
self.outdir = os.path.join(sweep_params["outdir"], sweep_params["prefix"])
if not os.path.exists(self.outdir):
os.makedirs(self.outdir)

self.f_save = \
os.path.join(self.outdir,self.sweep_params['prefix'] + '.p')
self.tmp_f_save = self.f_save.replace('.p', '') + '_tmp.p'

# Create a grid.
# Loop through and set each parameter in the params grid.
self.params_df = pd.DataFrame(list(ParameterGrid(sweep_params['grid'])))
@@ -119,7 +124,10 @@ def run_sweep(self, subset=None):
sim = FullSimulation(params_f)
sim.run()
sweep_results[f] = sim
self.sweep_results = sweep_results

# Save a temporary file to keep track of the latest results.
self.sweep_results = sweep_results
self.save(f_save=self.tmp_f_save)
return


@@ -162,46 +170,95 @@ def plot_sensitivity_and_dropout(self):
g.savefig(os.path.join(self.outdir, 'dropout.png'))
return None


def plot_ppv(self):
return


def cluster_before_after(self):
for f in self.sweep_results:
self.sweep_results[f].cluster()
self.sweep_results[f].cluster_before_after()
return


def plot_before_after_all_clones(self):
"""
Plot the growth of each clone.
Plot the growth of each clone. Violin plots with the x-axis
the dominant clone size, the y-axis the after/before ratio,
and the hue the dominant clone growth rate.
# Expand params_df to include each iteration, which will
# allow for violinplots. This will lead to
params_df.shape[0]*num_iterations rows.
:return:
"""
return
df_full = []
for ind, val in self.params_df.iterrows():
full_sim = self.sweep_results[ind]
b_a_df = full_sim.b_a_df
# Each element will be a list of the values.
s = self.params_df.loc[ind].copy()
curr = pd.DataFrame([s.copy()] * len(b_a_df))
curr["A/B"] = b_a_df["A/B"].values
df_full.append(curr)

df_full = pd.concat(df_full)
df_full = df_full.astype(
{'dominant_growth': str, 'dominant_clone_sizes': str,
'A/B': float})

sns.violinplot(data=df_full, y="A/B", hue="dominant_growth",
x="dominant_clone_sizes")
plt.savefig(os.path.join(self.outdir, 'growth_before_after.png'))
#g.savefig(os.path.join(self.outdir, 'precision.png'))
return None

def plot_before_after_true_growth(self):
"""
Plot the growth of cells in each clone before and after
Plot the growth of cells in each clone before and after.
Dotplot/heatmap, where each row is a het value, each column
is growth rate, and the value is dominant clone cells
growth (after/before). Can make multiple where each column is
coverage, each row is error rate.
:return:
"""
return

def plot_before_after_cluster_growth(self):
"""
Plot the growth of the predicted dominant clone based on clustering
results and picking the top clone.
Plot the growth of cells in each clone before and after.
Dotplot/heatmap, where each row is a het value, each column
is growth rate, and the value is dominant clone cell cluster
growth (after/before). Can make multiple where each column is
coverage, each row is error rate.
:return:
"""
return

def save(self):
f_save = os.path.join(self.outdir, self.sweep_params['prefix']+'.p')

def plot_error_cluster_growth(self):
"""
Plot the deviation of the growth rates for each pipeline run.
If A = true growth rate and B = predicted growth rate, then C = A - B.
:return:
"""

def delete_tmp(self):
if os.path.exists(self.tmp_f_save):
os.remove(self.tmp_f_save)

def save(self, f_save=None):
if f_save is None:
f_save = self.f_save
f = open(f_save, 'wb')
pickle.dump(self.__dict__, f, 2)
f.close()

def load(self, filename):
def load(self, f_save=None):
if f_save is None:
f_save = self.f_save
#filename = self.params['filename']
f = open(filename, 'rb')
f = open(f_save, 'rb')
tmp_dict = pickle.load(f)
f.close()
self.__dict__.update(tmp_dict)
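
As a usage note, a driver for ParameterSweep might look like the sketch below; the yaml paths are placeholders and the import path simply mirrors the repo layout:

from src.simulations.parametersweep import ParameterSweep

default_f = "parameters/simulations/simple_v01.yaml"  # default parameters (file added above)
sweep_f = "parameters/simulations/test_sweep.yaml"    # sweep grid (placeholder name)

sweep = ParameterSweep(default_f, sweep_f)
sweep.run_sweep()                     # a temporary pickle (<prefix>_tmp.p) is written along the way
sweep.plot_sensitivity_and_dropout()
sweep.plot_before_after_all_clones()  # violin plot of the after/before clone-size ratios
sweep.save()                          # final pickle at <outdir>/<prefix>.p
sweep.delete_tmp()                    # remove the temporary pickle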
7 changes: 6 additions & 1 deletion src/simulations/simulation.py
@@ -330,7 +330,7 @@ def combine_init_growth(self):
assert (combined_cell_af.shape[0] == self.cell_af.shape[0] +
self.subsample_new_cell_af.shape[0])
assert(combined_meta.shape[0] == clones.shape[0])
assert (combined_cell_af.shape[0] == clones.shape[0])
assert(combined_cell_af.shape[0] == clones.shape[0])
self.combined_meta = combined_meta
self.combined_clones = clones
self.combined_cell_af = combined_cell_af
@@ -361,7 +361,11 @@ def compare_before_after(self):
Creates a df that contains information on
the number of cells from each clone before as well as after.
:return:
df.at[ind, "Dominant Before"] = (full_sim.clone_cell == 1).sum()
df.at[ind, "Dominant After"] = (full_sim.subsample_new_clone_cell == 1).sum()
"""

return

def cluster_compare_before_after(self):
@@ -413,6 +417,7 @@ def cluster_kmeans(cell_af):




def main():
return

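
For context on the clustering step, cluster_kmeans and stats_cluster_before_after shown earlier amount to clustering the cell-by-MT-position allele-frequency matrix and scoring the labels with a confusion matrix. A minimal sketch on toy data (the KMeans settings here are illustrative, not necessarily those used in simulation.py):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix

rng = np.random.default_rng(0)
cell_af = rng.random((100, 10))             # 100 cells x 10 MT positions (toy data)
y_true = (np.arange(100) < 40).astype(int)  # 1 = dominant clone, 0 = all other cells

y_pred = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(cell_af)
# Cluster labels are arbitrary, so cluster 1 may need to be matched to the
# dominant clone (e.g. by overlap) before reading off the counts.
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print(tn, fp, fn, tp)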