Skip to content

Commit

Permalink
Add #20 kmeans feature with elbow method
Browse files Browse the repository at this point in the history
  • Loading branch information
Isaac Shamie committed Oct 30, 2020
1 parent 795e9d1 commit 99f99bf
Show file tree
Hide file tree
Showing 6 changed files with 171 additions and 782 deletions.
127 changes: 127 additions & 0 deletions src/simulations/analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#import pymc3 as pm
import numpy as np
from numpy import random
import os
import pandas as pd
from tqdm import tqdm
#from src.config import ROOT_DIR
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
import glob
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=32)

from mplh.color_utils import get_colors
from mplh.fig_utils import legend_from_color
from mplh import cluster_help as ch
from src.simulations.utils.config import read_config_file, write_config_file

from dynamicTreeCut import cutreeHybrid
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage
from sklearn.model_selection import ParameterGrid
from src.simulations.utils.config import check_required


class Analysis:
    """Analysis helpers for lineage tracing experiments.

    Meant to cover both the supervised case (simulations, where the clones
    are known) and the unsupervised, experimental case.  The instance
    methods are currently placeholders that return ``None``; the working
    functionality lives in the static clustering and plotting helpers.
    """

    def __init__(self):
        return

    def calculate_average_clone_mt(self):
        """Placeholder, not implemented yet; returns ``None``."""
        return

    @staticmethod
    def cluster(cell_af):
        """Cluster the rows of ``cell_af`` with a dynamic tree cut.

        Pairwise Euclidean distances between the rows are joined with
        average linkage, and the resulting tree is cut by ``cutreeHybrid``.

        Args:
            cell_af: Matrix-like object whose rows are the observations to
                cluster.

        Returns:
            The ``'labels'`` entry of the ``cutreeHybrid`` result.
        """
        distances = pdist(cell_af, "euclidean")
        link = linkage(distances, "average")
        clusters = cutreeHybrid(link, distances)['labels']
        return clusters

    @staticmethod
    def cluster_kmeans(X, min_c=2, max_c=10):
        """Cluster the rows of ``X`` with k-means, picking k by silhouette.

        One k-means model is fitted for every ``k`` in
        ``range(min_c, max_c)`` and the labelling with the highest average
        silhouette score is returned.  On ties the smallest ``k`` wins.

        Args:
            X: Sample-by-feature matrix accepted by ``KMeans.fit_predict``.
            min_c: Smallest number of clusters to try (at least 2, because
                the silhouette score is undefined for a single cluster).
            max_c: Upper bound on the number of clusters, exclusive.  It is
                additionally capped so that ``k`` never exceeds
                ``n_samples - 1``, the largest value the silhouette score
                accepts.

        Returns:
            The cluster labels of the best-scoring k-means fit.

        Raises:
            ValueError: If ``min_c < 2`` or if no candidate ``k`` remains
                once ``max_c`` and the number of samples are taken into
                account.
        """
        if min_c < 2:
            raise ValueError("min_c must be at least 2, got %r" % (min_c,))

        n_samples = np.shape(X)[0]
        # silhouette_score needs 2 <= k <= n_samples - 1; range() excludes
        # its stop value, so capping the stop at n_samples enforces that.
        upper = min(max_c, n_samples)
        if min_c >= upper:
            raise ValueError(
                "No cluster count to try: min_c=%r, max_c=%r, n_samples=%r"
                % (min_c, max_c, n_samples))

        best_labels = None
        sil_score_max = -1  # this is the minimum possible score
        for n_clusters in range(min_c, upper):
            model = KMeans(n_clusters=n_clusters, init='k-means++',
                           max_iter=100, n_init=1)
            labels = model.fit_predict(X)
            sil_score = silhouette_score(X, labels)
            print(
                "The average silhouette score for %i clusters is %0.2f" % (
                    n_clusters, sil_score))
            if best_labels is None or sil_score > sil_score_max:
                sil_score_max = sil_score
                # Keep the labelling that was actually scored.  Fitting a
                # fresh model afterwards would start from a new random
                # initialisation and could return a different, worse
                # clustering than the one the score refers to.
                best_labels = labels

        return best_labels

    def compare_before_after(self):
        """Placeholder, not implemented yet; returns ``None``.

        Intended to build a DataFrame with the number of cells from each
        clone before as well as after.  The original sketch was::

            df.at[ind, "Dominant Before"] = (full_sim.clone_cell == 1).sum()
            df.at[ind, "Dominant After"] = (
                full_sim.subsample_new_clone_cell == 1).sum()
        """
        return

    def cluster_compare_before_after(self):
        """Placeholder, not implemented yet; returns ``None``.

        Intended to compare how well clustering groups the same clones
        together.
        """
        return

    @staticmethod
    def plot_cluster(cell_af, cell_meta=None, mt_meta=None, f_save=None):
        """Plot ``cell_af`` through ``mplh.cluster_help.plot_cluster``.

        Thin wrapper that forwards its arguments with ``to_col_clust=False``
        and ``to_z=True``.

        Args:
            cell_af: Data to plot; passed through unchanged.
            cell_meta: Passed as ``row_meta``.
            mt_meta: Passed as ``col_meta``.
            f_save: Passed as ``fsave``.
        """
        ch.plot_cluster(cell_af, row_meta=cell_meta, col_meta=mt_meta,
                        fsave=f_save, to_col_clust=False, to_z=True)


def main():
    """Script entry point; currently does nothing and returns ``None``."""
    return None


# Compare the module's __name__ variable (not a quoted string literal) so
# that main() actually runs when this file is executed as a script.
if __name__ == "__main__":
    main()

8 changes: 4 additions & 4 deletions src/simulations/fullsimulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,11 +60,11 @@ def __init__(self, params_f):
self.f_save_data = os.path.join(self.data_outdir,
self.params['name'] + '.p')
self.f_save = os.path.join(self.outdir, self.params['name'] + '.p')
self.f_save_metrics = self.f_save.replace('.p', '.metrics.tsv')
self.f_save_cluster = self.f_save.replace('.p', '.cluster.tsv')
self.f_save_befaft = self.f_save.replace('.p', '.before_after.tsv')
self.f_save_rocs = self.f_save.replace('.p', '.rocs.p')

self.f_save_metrics = self.f_save_data.replace('.p', '.metrics.tsv')
self.f_save_cluster = self.f_save_data.replace('.p', '.cluster.tsv')
self.f_save_befaft = self.f_save_data.replace('.p', '.before_after.tsv')
self.f_save_rocs = self.f_save_data.replace('.p', '.rocs.p')

return
#for i in self.n_iter:
Expand Down
37 changes: 10 additions & 27 deletions src/simulations/parametersweep.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import pickle
import seaborn as sns
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=32)
import matplotlib.pyplot as plt

from mplh.color_utils import get_colors
Expand All @@ -19,13 +18,8 @@
from sklearn.model_selection import ParameterGrid
from src.simulations.utils.config import check_required


from .fullsimulation import FullSimulation

""" Run the simulation similar to in Extended Data Fig 3 from
Massively parallel single-cell mitochondrial DNA genotyping and chromatin profiling"""


def replace_item(obj, key, replace_value):
# https: // stackoverflow.com / a / 45335542
for k, v in obj.items():
Expand Down Expand Up @@ -66,14 +60,16 @@ def __init__(self, default_params_f, sweep_params_f):
self.default_params_f = default_params_f
self.sweep_params = sweep_params
self.default_params = params

self.save_sim = sweep_params['save_sim']
print(self.save_sim)
# Create the yaml files in the directory indicated by local_outdir and prefix in sweep_params_f
self.outdir = os.path.join(sweep_params["outdir"], sweep_params["prefix"])
if not os.path.exists(self.outdir):
os.makedirs(self.outdir)
self.f_save = \
os.path.join(self.outdir,self.sweep_params['prefix'] + '.p')
self.tmp_f_save = self.f_save.replace('.p', '') + '_tmp.p'

#self.tmp_f_save = self.f_save.replace('.p', '') + '_tmp.p'

self.data_outdir = os.path.join(sweep_params['data_outdir'], sweep_params["prefix"])
if not os.path.exists(self.data_outdir):
Expand All @@ -89,7 +85,7 @@ def __init__(self, default_params_f, sweep_params_f):
# Create the name
params['name'] = str(ind)
params['data_outdir'] = self.data_outdir
params['local_outdir'] = self.sweep_params["outdir"]
params['local_outdir'] = self.outdir
params['prefix'] = self.sweep_params["prefix"]
for name, v in val.iteritems():
# Set the specific variables that need to be updated
Expand All @@ -111,13 +107,15 @@ def __init__(self, default_params_f, sweep_params_f):


@staticmethod
def run_single_sweep(f, outdir):
def run_single_sweep(f, outdir, save=True):
print(f'Running file {f}')
params_f = os.path.join(outdir, str(f) + '.yaml')
sim = FullSimulation(params_f)
sim.run()
sim.run_metrics()
sim.save()

if save:
sim.save()
return sim.metrics

def run_sweep(self, subset=None):
Expand All @@ -138,25 +136,10 @@ def run_sweep(self, subset=None):
###
#sweep_results_df = sweep_results_df.apply(self.run_single_sweep, args=(self.outdir,))
pandarallel.initialize(nb_workers=self.sweep_params['cpus'])
sweep_results_df = sweep_results_df.parallel_apply(self.run_single_sweep, args=(self.outdir,))
sweep_results_df = sweep_results_df.parallel_apply(self.run_single_sweep, args=(self.outdir, self.save_sim))
###

self.sweep_results = sweep_results_df

# sweep_results = dict()
#
# for f, val in params_df.iterrows():
# params_f = os.path.join(self.outdir, str(f) +'.yaml')
# print(f"Running with file: {f}")
# sim = FullSimulation(params_f)
# sim.run()
# sim.run_metrics()
# sim.save()
#
# # Only store the current metrics, not the entire simulation.
# sweep_results[f] = sim.metrics
# #self.save(f_save=self.tmp_f_save)
# self.sweep_results = sweep_results
return


Expand Down
Loading

0 comments on commit 99f99bf

Please sign in to comment.