From 99f99bf9939bf55ea50ffc0f5041d71df4a7583f Mon Sep 17 00:00:00 2001
From: Isaac Shamie
Date: Fri, 30 Oct 2020 02:11:25 +0000
Subject: [PATCH] Add #20 kmeans feature, selecting k by silhouette score

---
 src/simulations/analysis.py                   | 127 ++++
 src/simulations/fullsimulation.py             |   8 +-
 src/simulations/parametersweep.py             |  37 +-
 src/simulations/pipeline.py                   | 680 ------------------
 src/simulations/simulation.py                 |  71 --
 .../test_simulation_cluster_kmeans.py         |  30 +
 6 files changed, 171 insertions(+), 782 deletions(-)
 create mode 100644 src/simulations/analysis.py
 delete mode 100644 src/simulations/pipeline.py
 create mode 100644 src/tests/simulations/test_simulation_cluster_kmeans.py

diff --git a/src/simulations/analysis.py b/src/simulations/analysis.py
new file mode 100644
index 00000000..e5e62a27
--- /dev/null
+++ b/src/simulations/analysis.py
@@ -0,0 +1,127 @@
+#import pymc3 as pm
+import numpy as np
+from numpy import random
+import os
+import pandas as pd
+from tqdm import tqdm
+#from src.config import ROOT_DIR
+import matplotlib.pyplot as plt
+import pickle
+import seaborn as sns
+import glob
+from sklearn.cluster import KMeans
+from sklearn.metrics import silhouette_score
+from scipy.spatial.distance import cdist
+from pandarallel import pandarallel
+pandarallel.initialize(nb_workers=32)
+
+from mplh.color_utils import get_colors
+from mplh.fig_utils import legend_from_color
+from mplh import cluster_help as ch
+from src.simulations.utils.config import read_config_file, write_config_file
+
+from dynamicTreeCut import cutreeHybrid
+from scipy.spatial.distance import pdist
+from scipy.cluster.hierarchy import linkage
+from sklearn.model_selection import ParameterGrid
+from src.simulations.utils.config import check_required
+
+
+class Analysis:
+    """Analysis of lineage-tracing experiments.
+
+    Handles both the supervised setting (simulations, where the clone
+    labels are known) and the unsupervised setting (experimental data,
+    where they are not).
+
+    :ivar params: analysis parameters
+    :type params: dict
+    """
+
+    def __init__(self):
+        return
+
+    def calculate_average_clone_mt(self):
+        return
+
+    @staticmethod
+    def cluster(cell_af):
+        """Dynamic-tree (hybrid) clustering of the rows of cell_af.
+
+        Args:
+            cell_af: cell-by-MT-position allele-frequency DataFrame.
+
+        Returns:
+            Cluster label for each row of cell_af.
+        """
+        distances = pdist(cell_af, "euclidean")
+        link = linkage(distances, "average")
+        clusters = cutreeHybrid(link, distances)['labels']
+        return clusters
+
+    @staticmethod
+    def cluster_kmeans(X, min_c=2, max_c=10):
+        """K-means clustering, choosing the number of clusters that
+        maximizes the average silhouette score over range(min_c, max_c).
+
+        Args:
+            X: cell-by-MT-position allele-frequency matrix.
+            min_c: smallest number of clusters to try.
+            max_c: try cluster numbers up to, but not including, max_c.
+
+        Returns:
+            Cluster label for each row of X.
+        """
+        best_n_clusters = -1
+        sil_score_max = -1  # silhouette scores lie in [-1, 1]
+        for n_clusters in range(min_c, max_c):
+            model = KMeans(n_clusters=n_clusters, init='k-means++',
+                           max_iter=100, n_init=1)
+            labels = model.fit_predict(X)
+            sil_score = silhouette_score(X, labels)
+            print(
+                "The average silhouette score for %i clusters is %0.2f" % (
+                n_clusters, sil_score))
+            if sil_score > sil_score_max:
+                sil_score_max = sil_score
+                best_n_clusters = n_clusters
+
+        assert(best_n_clusters >= 0)
+        # Refit with the winning number of clusters.
+        model = KMeans(n_clusters=best_n_clusters, init='k-means++',
+                       max_iter=100, n_init=1)
+        labels = model.fit_predict(X)
+
+        return labels
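+
+    # Usage sketch (illustrative only; the random matrix below is a
+    # hypothetical stand-in for a cell-by-position allele-frequency table):
+    #
+    #   >>> import numpy as np
+    #   >>> X = np.random.rand(60, 10)
+    #   >>> labels = Analysis.cluster_kmeans(X, min_c=2, max_c=6)
+    #   >>> len(labels) == 60
+    #   True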
+
+    def compare_before_after(self):
+        """Creates a df with the number of cells in each clone before and
+        after growth. For example, the dominant clone gives
+        "Dominant Before" = (full_sim.clone_cell == 1).sum() and
+        "Dominant After" = (full_sim.subsample_new_clone_cell == 1).sum().
+
+        :return: the before/after count DataFrame
+        """
+
+        return
+
+    def cluster_compare_before_after(self):
+        """Compares how well clustering groups cells of the same clone,
+        before and after growth.
+
+        :return:
+        """
+        return
+
+    @staticmethod
+    def plot_cluster(cell_af, cell_meta=None, mt_meta=None, f_save=None):
+        """Clustered heatmap of the cell-by-position AF matrix.
+
+        Args:
+            cell_af: cell-by-MT-position allele-frequency DataFrame.
+            cell_meta: optional per-cell (row) annotations.
+            mt_meta: optional per-position (column) annotations.
+            f_save: optional path to save the figure to.
+        """
+        ch.plot_cluster(cell_af, row_meta=cell_meta, col_meta=mt_meta,
+                        fsave=f_save, to_col_clust=False, to_z=True)
+
+
+def main():
+    return
+
+
+if __name__ == "__main__":
+    main()

diff --git a/src/simulations/fullsimulation.py b/src/simulations/fullsimulation.py
index 7d27ba3f..21b9b574 100644
--- a/src/simulations/fullsimulation.py
+++ b/src/simulations/fullsimulation.py
@@ -60,11 +60,11 @@ def __init__(self, params_f):
         self.f_save_data = os.path.join(self.data_outdir,
                                         self.params['name'] + '.p')
         self.f_save = os.path.join(self.outdir, self.params['name'] + '.p')
-        self.f_save_metrics = self.f_save.replace('.p', '.metrics.tsv')
-        self.f_save_cluster = self.f_save.replace('.p', '.cluster.tsv')
-        self.f_save_befaft = self.f_save.replace('.p', '.before_after.tsv')
-        self.f_save_rocs = self.f_save.replace('.p', '.rocs.p')
+        self.f_save_metrics = self.f_save_data.replace('.p', '.metrics.tsv')
+        self.f_save_cluster = self.f_save_data.replace('.p', '.cluster.tsv')
+        self.f_save_befaft = self.f_save_data.replace('.p', '.before_after.tsv')
+        self.f_save_rocs = self.f_save_data.replace('.p', '.rocs.p')
         return

         #for i in self.n_iter:
diff --git a/src/simulations/parametersweep.py b/src/simulations/parametersweep.py
index d2fefc82..13b11915 100644
--- a/src/simulations/parametersweep.py
+++ b/src/simulations/parametersweep.py
@@ -5,7 +5,6 @@
 import pickle
 import seaborn as sns
 from pandarallel import pandarallel
-pandarallel.initialize(nb_workers=32)
 import matplotlib.pyplot as plt

 from mplh.color_utils import get_colors
@@ -19,13 +18,8 @@
 from sklearn.model_selection import ParameterGrid
 from src.simulations.utils.config import check_required

-
 from .fullsimulation import FullSimulation

-""" Run the simulation similar to in Extended Data Fig 3 from
-    Massively parallel single-cell mitochondrial DNA genotyping and chromatin profiling"""
-
-
 def replace_item(obj, key, replace_value):
     # https://stackoverflow.com/a/45335542
     for k, v in obj.items():
@@ -66,14 +60,16 @@ def __init__(self, default_params_f, sweep_params_f):
         self.default_params_f = default_params_f
         self.sweep_params = sweep_params
         self.default_params = params
-
+        # Whether each sweep run should pickle its full simulation to disk.
+        self.save_sim = sweep_params.get('save_sim', True)
         # Create the yaml files in the directory indicated by local_outdir and prefix in sweep_params_f
         self.outdir = os.path.join(sweep_params["outdir"], sweep_params["prefix"])
         if not os.path.exists(self.outdir):
             os.makedirs(self.outdir)
         self.f_save = \
             os.path.join(self.outdir, self.sweep_params['prefix'] + '.p')
-        self.tmp_f_save = self.f_save.replace('.p', '') + '_tmp.p'
+
+        #self.tmp_f_save = self.f_save.replace('.p', '') + '_tmp.p'
         self.data_outdir = os.path.join(sweep_params['data_outdir'], sweep_params["prefix"])
         if not os.path.exists(self.data_outdir):
             os.makedirs(self.data_outdir)
@@ -89,7 +85,7 @@ def __init__(self, default_params_f, sweep_params_f):
             # Create the name
             params['name'] = str(ind)
             params['data_outdir'] = self.data_outdir
-            params['local_outdir'] = self.sweep_params["outdir"]
+            params['local_outdir'] = self.outdir
             params['prefix'] = 
self.sweep_params["prefix"] for name, v in val.iteritems(): # Set the specific variables that need to be updated @@ -111,13 +107,15 @@ def __init__(self, default_params_f, sweep_params_f): @staticmethod - def run_single_sweep(f, outdir): + def run_single_sweep(f, outdir, save=True): print(f'Running file {f}') params_f = os.path.join(outdir, str(f) + '.yaml') sim = FullSimulation(params_f) sim.run() sim.run_metrics() - sim.save() + + if save: + sim.save() return sim.metrics def run_sweep(self, subset=None): @@ -138,25 +136,10 @@ def run_sweep(self, subset=None): ### #sweep_results_df = sweep_results_df.apply(self.run_single_sweep, args=(self.outdir,)) pandarallel.initialize(nb_workers=self.sweep_params['cpus']) - sweep_results_df = sweep_results_df.parallel_apply(self.run_single_sweep, args=(self.outdir,)) + sweep_results_df = sweep_results_df.parallel_apply(self.run_single_sweep, args=(self.outdir, self.save_sim)) ### self.sweep_results = sweep_results_df - - # sweep_results = dict() - # - # for f, val in params_df.iterrows(): - # params_f = os.path.join(self.outdir, str(f) +'.yaml') - # print(f"Running with file: {f}") - # sim = FullSimulation(params_f) - # sim.run() - # sim.run_metrics() - # sim.save() - # - # # Only store the current metrics, not the entire simulation. - # sweep_results[f] = sim.metrics - # #self.save(f_save=self.tmp_f_save) - # self.sweep_results = sweep_results return diff --git a/src/simulations/pipeline.py b/src/simulations/pipeline.py deleted file mode 100644 index 4cd5b82a..00000000 --- a/src/simulations/pipeline.py +++ /dev/null @@ -1,680 +0,0 @@ -#import pymc3 as pm -import numpy as np -from numpy import random -import os -import pandas as pd -from tqdm import tqdm -#from src.config import ROOT_DIR -from sklearn.metrics import roc_curve, average_precision_score -from scipy import interp -import matplotlib.pyplot as plt -import pickle -import seaborn as sns -import glob -from sklearn.cluster import KMeans -from sklearn import metrics -from scipy.spatial.distance import cdist -from pandarallel import pandarallel -pandarallel.initialize(nb_workers=32) - -from mplh.color_utils import get_colors -from mplh.fig_utils import legend_from_color -from mplh import cluster_help as ch -from src.simulations.utils.config import read_config_file, write_config_file - -from dynamicTreeCut import cutreeHybrid -from scipy.spatial.distance import pdist -from scipy.cluster.hierarchy import linkage -from sklearn.model_selection import ParameterGrid -from src.simulations.utils.config import check_required - -""" Run the simulation similar to in Extended Data Fig 3 from - Massively parallel single-cell mitochondrial DNA genotyping and chromatin profiling""" - - -def replace_item(obj, key, replace_value): - # https: // stackoverflow.com / a / 45335542 - for k, v in obj.items(): - if isinstance(v, dict): - obj[k] = replace_item(v, key, replace_value) - if key in obj: - obj[key] = replace_value - return obj - - -class ParameterSweep: - """ - A class for running a lineage simulation with varying parameters. - - :ivar outdir: Directory to save all output to - :ivar sweep_params: The hyperparameters dictionary used - :ivar default_params: The baseline parameters used. - :ivar metrics: pd DataFrame that contains output metrics - """ - def __init__(self, default_params_f, sweep_params_f): - """ - - Initialize ParameterSweep creates the pipeline yaml files to be - run with run_sweep. 
- :param default_params_f: File that contains the baseline - parameters for the simulation that all runs will use. - Grid is the key in the parameter file that contains the - hyperparameters. - :type default_params_f: str - :param sweep_params_f: File that contains the hyperparameters to - do a grid of all parameters to run the pipeline on. - :type str - - :attributes: - - - """ - self.sweep_params_f = sweep_params_f - self.default_params_f = default_params_f - params = read_config_file(default_params_f) - sweep_params = read_config_file(sweep_params_f) - self.sweep_params = sweep_params - self.default_params = params - - # Create the yaml files in the directory indicated by local_outdir and prefix in sweep_params_f - self.outdir = os.path.join(sweep_params["local_outdir"], sweep_params["prefix"]) - if not os.path.exists(self.outdir): - os.makedirs(self.outdir) - - # Create a grid. - # Loop through and set each parameter in the the params grid parameter. - self.params_df = pd.DataFrame(list(ParameterGrid(sweep_params['grid']))) - params_dict = dict() - - for ind, val in self.params_df.iterrows(): - f_name = os.path.join(self.outdir, str(ind)+'.yaml' ) - for name, v in val.iteritems(): - # Set the specific variables that need to be updated - if name == 'dominant_clone_sizes': - params["initialize"]["clone_sizes"][1] = v - # Set the non-clone such that all clones sum to 1 - params["initialize"]["clone_sizes"][0] = 1-sum(params["initialize"]["clone_sizes"][1:]) - assert(np.abs(sum(params["initialize"]["clone_sizes"]) - 1)<0.0001) - elif name == 'dominant_het': - params["het"][0] = v - elif name == 'dominant_growth': - params["growth"]["binomial"]["rates"][1] = v - else: # These parameters assumed to have the same name - # In both files - params = replace_item(params, name, v) - write_config_file(f_name, params) - params_dict[f_name] = params - return - - def run_sweep(self, subset=None): - """ - Loops through self.params_df and runs the simulation on that - parameter. - - :param subset: Which files to run the simulation. If list, - list of files to run on, if int, number of files to randomly - choose. If None, run on all. - :type subset: int or None or list (default=None) - - """ - - params_df = self.params_df - if isinstance(subset, int): - params_df = params_df.sample(n=subset) - - sweep_results = dict() - for f, val in params_df.iterrows(): - params_f = os.path.join(self.outdir, str(f) +'.yaml') - print(f"Running with file: {f}") - sim = FullSimulation(params_f) - sim.run() - sweep_results[f] = sim - self.sweep_results = sweep_results - return - - - def plot_sensitivity_and_dropout(self): - """ - Scatterplots of the average precision score and dropout - against variant heteroplasmy. - - - Additional simulation parameters used for groupings are - coverage (color), and error rate (column). - Uses sklearn's average_precision_score to get precision. - For dropout, estimates how many times the counts are 0 in the - clone mitochondrial variant. - """ - - metrics = self.params_df.copy() - metrics['Avg. Precision'] = -1 - metrics['% dropout'] = -1 - - # Add these results to self.results, which has the meta information too - for ind, val in self.params_df.iterrows(): - full_sim = self.sweep_results[ind] - dropout = full_sim.dropout - prec_scores = full_sim.prec_scores - rocs = full_sim.rocs - metrics.at[ind, 'Avg. 
Precision'] = np.mean(prec_scores) - metrics.at[ind, '% dropout'] = np.mean(dropout) - - self.metrics = metrics - # colors, _ = get_colors("categorical", names=coverages, - # n_colors=len(coverages)) - # Seaborn Factorplot - g = sns.FacetGrid(data=metrics, col="het_err_rate", hue="cov_constant") - g.map_dataframe(sns.scatterplot, x="dominant_het", y="Avg. Precision") - g.add_legend() - g.savefig(os.path.join(self.outdir,'precision.png')) - - - g = sns.FacetGrid(data=metrics, col="het_err_rate", hue="cov_constant") - g.map_dataframe(sns.scatterplot, x="dominant_het", y="% dropout") - g.add_legend() - g.savefig(os.path.join(self.outdir, 'dropout.png')) - return - - - def plot_ppv(self): - return - - def cluster_before_after(self): - for f in self.sweep_results: - self.sweep_results[f].cluster() - return - - - def save(self): - f_save = os.path.join(self.outdir, self.sweep_params['prefix']+'.p') - f = open(f_save, 'wb') - pickle.dump(self.__dict__, f, 2) - f.close() - - def load(self, filename): - #filename = self.params['filename'] - f = open(filename, 'rb') - tmp_dict = pickle.load(f) - f.close() - self.__dict__.update(tmp_dict) - - - -# I can make each variable a class? -# Does this ruin running the MCMC? I don't think so, b/c that format is going to be put in after anyway -class FullSimulation: - def __init__(self, params_f): - params = read_config_file(params_f) - self.n_iter = params['num_iterations'] - self.num_cells = params['num_cells'] - self.params = params - return - #for i in self.n_iter: - - def run(self): - # Parallelize df - df = pd.Series(index=range(self.n_iter)) - df = df.apply(self.run_sim, args=(self.params,)) - #df = df.parallel_apply(self.run_sim, args=(self.params,)) - - self.sim = df - #self.cluster_before_after() - self.sim_performance_dominant(group='both') - return - - @staticmethod - def run_sim(x, params): - s = Simulation(params) - s.initialize() - s.grow() - s.subsample_new(to_delete=True) - s.combine_init_growth() - return s - - def flatten_sim(self): - ## TODO - # This will extract out the classes of df - return - - def sim_performance_dominant(self, group='both'): - """ - Will average metrics over simulations. 
- :param group: {'init', 'growth', 'both'} This will indicate to group by - :return: - """ - dropout = [] - rocs = [] - prec_scores = [] - - for iter, s in enumerate(self.sim.values): - # First get the dominant clone , which is indexed as 1 - mt_pos = s.clone_mt_dict[1] - # TODO account for mt_pos being a list not an int - if group == 'init': - clones = s.clone_cell - cell_af = s.cell_af.loc[:,mt_pos] - elif group == 'growth': - clones = s.new_clone_cell - cell_af = s.new_cell_af.loc[:,mt_pos] - elif group == 'both': - #clones = pd.concat((s.clone_cell, s.subsample_new_clone_cell)).reset_index(drop=True) - #cell_af = pd.concat((s.cell_af.loc[:,mt_pos], s.subsample_new_cell_af.loc[:,mt_pos])).reset_index(drop=True) - clones = s.combined_clones - cell_af = s.combined_cell_af.loc[:,mt_pos] - else: - raise ValueError('group variable not properly set.') - - y_true = clones.values.copy() - y_true[y_true != 1] = 0 # Set nondominant clones to 0 - rocs.append(roc_curve(y_true, cell_af)) - prec_scores.append(average_precision_score(y_true, cell_af)) - dropout.append((cell_af[clones==1]==0).sum()/cell_af.shape[0]) - - - - self.dropout = dropout - self.prec_scores = prec_scores - self.rocs = rocs - return - - - def reduce_cells(self, cell_af): - #self.sim - return - - - def cluster_before_after(self): - cluster_results = [] - print('clustering') - for s in tqdm(self.sim.values): - cluster_results.append(s.cluster(s.combined_cell_af)) - print(len(cluster_results[-1])) - self.cluster_results = cluster_results - return - - - - - def save(self, f_save=None): - if f_save is None: - f_save = os.path.join(self.params['local_outdir'], self.params['prefix']+'.p') - f = open(f_save, 'wb') - pickle.dump(self.__dict__, f, 2) - f.close() - - def load(self, filename): - #filename = self.params['filename'] - f = open(filename, 'rb') - tmp_dict = pickle.load(f) - f.close() - self.__dict__.update(tmp_dict) - - -class Simulation: - def __init__(self, params_f): - if isinstance(params_f, str): - params = read_config_file(params_f) - else: - params = params_f - - self.params = params - check_required(params, ['initialize', 'num_cells', 'num_mt_positions', 'prefix']) - self.prefix = params['prefix'] - self.num_mt_positions = params['num_mt_positions'] - self.num_cells = params['num_cells'] - if not os.path.exists(params['local_outdir']): - os.mkdir(params['local_outdir']) - #should be external method - def initialize(self): - self.init_clone_dict() - self.init_cell_coverage() - self.init_cell_af() - #self.init_clone_mt() - - #should be external method - def grow(self): - p = self.params - type = p["growth"]["type"] - if type == "poisson": - self.grow_poisson(p['growth']['poisson']) - elif type == "binomial": - self.grow_binomial(p['growth']['binomial']) - return - - # Static Method - @staticmethod - def clone_counts_to_cell_series(clone_counts): - clone_counts = np.array(clone_counts) - num_cells = clone_counts.sum() - clone_cell = -1 * np.ones(shape=[num_cells, ]) - - - clone_cell[:clone_counts[0]] = 0 - for ind, val in enumerate(clone_counts[1:]): - start = clone_counts[:ind + 1].sum() - end = clone_counts[:ind + 1].sum() + val - # starts at sum(clone_counts[i-1]) ends at clone_counts[i].sum() - clone_cell[start:end] = ind + 1 - - clone_cell = pd.Series(clone_cell, dtype=int) - return clone_cell - - def init_clone_dict(self): - ### Add in potential to overwrite the values - - # Gets the clone dictionary. Should also have clone to mt dict. 
- clones = self.params['initialize']['clone_sizes'] - num_cells = self.num_cells - - # Option 1: List of fraction of size of each clone. 0s are nonclone size, listed first - if type(clones) == list: - #clone_cell = pd.Series(index=range(num_cells)) - clone_counts = np.random.multinomial(num_cells, clones) - clone_cell = self.clone_counts_to_cell_series(clone_counts) - self.clone_cell = clone_cell - # Option 2: 1 clone. ID'd as 1 - elif type(clones) == int: #One number for dominant clone. the others are not. - clone_cell = np.zeros(shape=[num_cells,]) - clone_cell[:num_cells] = 1 - clone_cell = clone_cell[::-1] - clone_cell = pd.Series(clone_cell, dtype=int) - self.clone_cell = clone_cell - - # Option 3 To ADD, beta binomial and more complex distributions - - self.num_clones = len(set(clone_cell.values))-1 # Remove the non-clone - return clone_cell - - - def init_cell_coverage(self): - """ - There are different modes to the coverage, either a constant or through a distribution. - :return: - """ - p = self.params['initialize']['coverage'] - type = p['type'] - - num_cells = self.num_cells - num_pos = self.num_mt_positions - c = np.zeros([num_cells, num_pos]) - - if type == 'constant': - c[:, :] = p['cov_constant'] - elif type == "poisson": - # Get the number of coverage per cell based on poisson (should be reads) - mu_cov_per_cell = p['mu_cov_per_cell'] - num_reads_per_cell = random.poisson(lam=mu_cov_per_cell, - size=num_cells) - - # Number of reads at each position, based on the average for each cell - for i in num_cells: - c[i, :] = random.poisson(num_reads_per_cell[i], - size=num_pos) - self.cells_mt_coverage = c - return c - - - def init_cell_af(self): - """ - Initialize the cell-by-mtPos af dataframe. Unless a clone:mt dict was provided, - the first N MT positions will be the clone AFs. - Creates self.clone_mt_dict and self.cell_af""" - - p = self.params['initialize'] - - hets = self.params['het'] - q = self.params['het_err_rate'] - clone_df = self.clone_cell - num_clones = self.num_clones - n_cells = self.num_cells - n_mt = self.num_mt_positions - - # Output - cell_af = pd.DataFrame(np.zeros(shape=[n_cells, n_mt])) - - - if 'mt_clone_map' in p and p['mt_clone_map'] is not None: - self.clone_mt_dict = p['mt_clone_map'] - else: - # Each clone points to a mt position - self.clone_mt_dict = dict() - for i in range(1,num_clones+1): - self.clone_mt_dict[i] = i - - # TODO Add the MT clone map so it can contain multiple mutants in lineages - - # If there is a heteroplasmy table in params, it is list of mutant heteroplasmy AFs. - # If not, will randomly draw based on number of clones - if type(hets) == list: - if (len(hets) != num_clones): - print('here') - assert(len(hets) == num_clones) - - ## Loop through each clone, - ## Generate the AF for the clone and non-clones using coverage for each cell - ## Fill in cell_by_af for that position. 
- for ind in range(1, num_clones+1): - # Generate AF: (clone_df == ind).sum() - n_dom_cells = (clone_df==ind).sum() - het = hets[ind-1] - - curr_mt = self.clone_mt_dict[ind] - - - if p['coverage']['type'] == 'constant': - c = p['coverage']['cov_constant'] - - af_i = random.binomial(c, het, - n_dom_cells) / c - af_j = random.binomial(c, q, - n_cells - n_dom_cells) / c - - # Update the dom_cells and non_dom for the current MT - cell_af.loc[np.flatnonzero(clone_df == ind), curr_mt] = af_i - cell_af.loc[np.flatnonzero(clone_df != ind), curr_mt] = af_j - - # Each cell and position has it's own coverage value, so need to update each - else: - c = self.cells_mt_coverage - #Get the cells coverage for the mt position - curr_mt_cov= c[:, curr_mt] - - # Get cell indicies for the clones and nonclones - curr_clone_inds = np.flatnonzero(clone_df==ind) - curr_nonclone_inds = np.flatnonzero(clone_df!=ind) - for cell in curr_clone_inds: - # Get one value for curr_mt and cell based on coverage - cell_af.loc[cell, curr_mt] = random.binomial(curr_mt_cov[cell], het) - for cell in curr_nonclone_inds: - cell_af.loc[cell, curr_mt] = random.binomial(curr_mt_cov[cell], q) - # Loop through each coverage - #for c in n_dom_cells: - - ##### - # TODO - # Add noise to the other non-lineage positions - ##### - self.cell_af = cell_af - return - - - def init_clone_mt(self): - p = self.params - if p["initialize"]['type'] == 'growth': - ## TODO - # Create a phylogeny and then get the averages of the mutants - self.average_clone_mt() - # If not growth, should aready be there. - return - - def average_clone_mt(self): - return - - def extract_clone_cells(self, clone_id): - ids = np.flatnonzero(self.clone_cell == clone_id) - return ids - - def simulate_expand_cells_af(self, af, growth_inds, sigma): - """ - Given a cell-by-af vector, expand the AF - :param af: - :param growth: - :param sigma: - :return: - """ - - new_af = af.iloc[growth_inds].copy() + random.normal(0, sigma, size=af.iloc[growth_inds].shape) - new_af.index = np.arange(af.index[-1]+1, af.index[-1]+1+new_af.shape[0]) - new_af = pd.concat((af,new_af), axis=0) - #new_af = np.append(af, np.concatenate(new_af)) - - return new_af - - def grow_binomial(self, p): - timesteps = p["time_steps"] - rates = p["rates"] - - sigma = self.params['growth']["mutant_af_sigma_noise"] - cell_af = self.cell_af - clone_mt_dict = self.clone_mt_dict - - num_clones = self.num_clones+1 - new_dict = {} - for curr_clone in range(num_clones): - curr_rate = rates[curr_clone] - ids = self.extract_clone_cells(curr_clone) - new_cells = cell_af.loc[ids].copy() - for i in range(timesteps): - # Simulate growth for each clone separately. - growth_inds = np.flatnonzero(random.binomial(1, curr_rate, size=new_cells.shape[0])) - #new_ids = - new_cells = self.simulate_expand_cells_af(new_cells, growth_inds, sigma) - - new_dict[curr_clone] = new_cells - # Create list of cells - - ####TODO - ## new_lineage_mutants chances. 
This will see if a mutation will change - - - ####TODO - ## Add death + stimulation rate as well as growth - # Save the new cell clones df and cell af - clone_counts = [i.shape[0] for i in new_dict.values()] - self.new_clone_cell = self.clone_counts_to_cell_series(clone_counts) - - self.new_cell_af = pd.DataFrame(new_dict[0]) - for clone in range(1, self.num_clones+1): - self.new_cell_af = pd.concat((self.new_cell_af, new_dict[clone]),axis=0).reset_index(drop=True) - return - - - def grow_poisson(self): - # TODO growth of poisson refactor - return - - - def subsample_new(self, to_delete=False): - new_cell_af = self.new_cell_af - p = self.params - if 'sequence_subsample' in p and p['sequence_subsample'] is not None: - self.subsample_new_cell_af = new_cell_af.sample(n=self.params['sequence_subsample']) - else: - self.subsample_new_cell_af = new_cell_af.sample(n=self.num_cells) - - self.subsample_new_clone_cell = self.new_clone_cell.loc[ - self.subsample_new_cell_af.index] - - if to_delete: - self.new_cell_af = None - self.new_clone_cell = None - - - def combine_init_growth(self): - clones = pd.concat( - (self.clone_cell, self.subsample_new_clone_cell)).reset_index( - drop=True) - combined_cell_af = self.cell_af.append(self.subsample_new_cell_af).reset_index(drop=True) - - combined_meta = np.concatenate((np.ones(shape=[self.cell_af.shape[0],]), np.zeros(shape=[self.subsample_new_cell_af.shape[0]]))) - combined_meta = pd.Series(combined_meta, name='After Growth', dtype=int) - assert(combined_meta.shape[0] == self.cell_af.shape[0]+self.subsample_new_cell_af.shape[0]) - assert (combined_cell_af.shape[0] == self.cell_af.shape[0] + - self.subsample_new_cell_af.shape[0]) - assert(combined_meta.shape[0] == clones.shape[0]) - assert (combined_cell_af.shape[0] == clones.shape[0]) - self.combined_meta = combined_meta - self.combined_clones = clones - self.combined_cell_af = combined_cell_af - return - - def save(self, f_save=None): - if f_save is None: - f_save = os.path.join(self.params['local_outdir'], self.params['prefix']+'.p') - f = open(f_save, 'wb') - pickle.dump(self.__dict__, f, 2) - f.close() - - def save_to_mgatk_format(self): - """ - Converts into the proper files needed for mgatk. 
(i.e variant and coverage files) - :return: - """ - - def load(self): - filename = self.params['filename'] - f = open(filename, 'rb') - tmp_dict = pickle.load(f) - f.close() - self.__dict__.update(tmp_dict) - - def compare_before_after(self): - return - - @staticmethod - def plot_cluster(cell_af, cell_meta=None, mt_meta=None, f_save=None): - ch.plot_cluster(cell_af, row_meta=cell_meta, col_meta=mt_meta, - fsave=f_save, to_col_clust=False, to_z=True) - - @staticmethod - def cluster(cell_af): - """ - Dynamic tree clustering of the rows of cell_af - :param cell_af: - :return: - """ - distances = pdist(cell_af, "euclidean") - link = linkage(distances, "average") - clusters = cutreeHybrid(link, distances)['labels'] - return clusters - - @staticmethod - def cluster_kmeans(cell_af): - distortions = [] - inertias = [] - mapping1 = {} - mapping2 = {} - K = range(1, 10) - - for k in K: - # Building and fitting the model - kmeanModel = KMeans(n_clusters=k).fit(cell_af) - kmeanModel.fit(cell_af) - - distortions.append(sum( - np.min(cdist(cell_af, kmeanModel.cluster_centers_, 'euclidean'), - axis=1)) / cell_af.shape[0]) - inertias.append(kmeanModel.inertia_) - - mapping1[k] = sum( - np.min(cdist(cell_af, kmeanModel.cluster_centers_, 'euclidean'), - axis=1)) / cell_af.shape[0] - mapping2[k] = kmeanModel.inertia_ - - -def main(): - return - - -if "__name__" == "__main__": - main() \ No newline at end of file diff --git a/src/simulations/simulation.py b/src/simulations/simulation.py index f6c81d73..43ea84f5 100644 --- a/src/simulations/simulation.py +++ b/src/simulations/simulation.py @@ -3,27 +3,8 @@ from numpy import random import os import pandas as pd -from tqdm import tqdm -#from src.config import ROOT_DIR -import matplotlib.pyplot as plt import pickle -import seaborn as sns -import glob -from sklearn.cluster import KMeans -from sklearn import metrics -from scipy.spatial.distance import cdist -from pandarallel import pandarallel -pandarallel.initialize(nb_workers=32) - -from mplh.color_utils import get_colors -from mplh.fig_utils import legend_from_color -from mplh import cluster_help as ch from src.simulations.utils.config import read_config_file, write_config_file - -from dynamicTreeCut import cutreeHybrid -from scipy.spatial.distance import pdist -from scipy.cluster.hierarchy import linkage -from sklearn.model_selection import ParameterGrid from src.simulations.utils.config import check_required @@ -397,58 +378,6 @@ def cluster_compare_before_after(self): """ return - @staticmethod - def plot_cluster(cell_af, cell_meta=None, mt_meta=None, f_save=None): - """ - Args: - cell_af: - cell_meta: - mt_meta: - f_save: - """ - ch.plot_cluster(cell_af, row_meta=cell_meta, col_meta=mt_meta, - fsave=f_save, to_col_clust=False, to_z=True) - - @staticmethod - def cluster(cell_af): - """Dynamic tree clustering of the rows of cell_af :param cell_af: - :return: - - Args: - cell_af: - """ - distances = pdist(cell_af, "euclidean") - link = linkage(distances, "average") - clusters = cutreeHybrid(link, distances)['labels'] - return clusters - - @staticmethod - def cluster_kmeans(cell_af): - """ - Args: - cell_af: - """ - distortions = [] - inertias = [] - mapping1 = {} - mapping2 = {} - K = range(1, 10) - for k in K: - # Building and fitting the model - kmeanModel = KMeans(n_clusters=k).fit(cell_af) - kmeanModel.fit(cell_af) - - distortions.append(sum( - np.min(cdist(cell_af, kmeanModel.cluster_centers_, 'euclidean'), - axis=1)) / cell_af.shape[0]) - inertias.append(kmeanModel.inertia_) - - mapping1[k] = sum( - 
np.min(cdist(cell_af, kmeanModel.cluster_centers_, 'euclidean'),
-                   axis=1)) / cell_af.shape[0]
-            mapping2[k] = kmeanModel.inertia_
-
-
 def main():
     return

diff --git a/src/tests/simulations/test_simulation_cluster_kmeans.py b/src/tests/simulations/test_simulation_cluster_kmeans.py
new file mode 100644
index 00000000..43c3a1c4
--- /dev/null
+++ b/src/tests/simulations/test_simulation_cluster_kmeans.py
@@ -0,0 +1,30 @@
+from src.simulations import Simulation
+from src.simulations.analysis import Analysis
+from src.simulations.utils.config import read_config_file
+from src.config import RESULTS, ROOT_DIR
+import os
+import unittest
+
+
+os.chdir(RESULTS)
+params = os.path.join(ROOT_DIR, 'parameters/simulations/test_simple.yaml')
+#params = read_config_file(params)
+
+class TestClusterKmeans(unittest.TestCase):
+    def test_oneIter(self):
+        s = Simulation(params)
+        s.initialize()
+        s.grow()
+        s.subsample_new(to_delete=False)
+
+        an = Analysis()
+        labels = an.cluster_kmeans(s.subsample_new_cell_af)
+
+        # k-means assigns exactly one label per subsampled cell.
+        self.assertEqual(len(labels), s.subsample_new_cell_af.shape[0])
+
+if __name__ == '__main__':
+    unittest.main()
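+
+# A possible follow-up check (a sketch only, not part of this patch's tested
+# behavior): compare the recovered labels against the known clone assignments
+# with sklearn's adjusted_rand_score, which is near 0 for random labelings
+# and 1 for perfect recovery.
+#
+#   from sklearn.metrics import adjusted_rand_score
+#   ari = adjusted_rand_score(s.subsample_new_clone_cell, labels)
+#   print("ARI vs. true clones: %0.2f" % ari)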