Add #20 kmeans feature with elbow method

LewisLabUCSD · Oct 30, 2020 · 99f99bf · 99f99bf
1 parent 795e9d1
commit 99f99bf
Show file tree

Hide file tree

Showing 6 changed files with 171 additions and 782 deletions.
diff --git a/src/simulations/analysis.py b/src/simulations/analysis.py
@@ -0,0 +1,127 @@
+#import pymc3 as pm
+import numpy as np
+from numpy import random
+import os
+import pandas as pd
+from tqdm import tqdm
+#from src.config import ROOT_DIR
+import matplotlib.pyplot as plt
+import pickle
+import seaborn as sns
+import glob
+from sklearn.cluster import KMeans
+from sklearn.metrics import silhouette_score
+from scipy.spatial.distance import cdist
+from pandarallel import pandarallel
+pandarallel.initialize(nb_workers=32)
+
+from mplh.color_utils import get_colors
+from mplh.fig_utils import legend_from_color
+from mplh import cluster_help as ch
+from src.simulations.utils.config import read_config_file, write_config_file
+
+from dynamicTreeCut import cutreeHybrid
+from scipy.spatial.distance import pdist
+from scipy.cluster.hierarchy import linkage
+from sklearn.model_selection import ParameterGrid
+from src.simulations.utils.config import check_required
+
+
+class Analysis:
+    """Analysis of lineage tracing experiments.
+
+    Can look at both supervised (from simulation), where the clones are
+    known, and unsupervised, which is the experimental side. Several
+    methods are still placeholders that only return None.
+    """
+
+    def __init__(self):
+        """No state is set up yet."""
+        return
+
+    def calculate_average_clone_mt(self):
+        """Placeholder: not implemented yet, returns None."""
+        return
+
+    @staticmethod
+    def cluster(cell_af):
+        """Dynamic tree clustering of the rows of cell_af.
+
+        Args:
+            cell_af: observations to cluster, one per row.
+
+        Returns:
+            The 'labels' entry of the cutreeHybrid result.
+        """
+        distances = pdist(cell_af, "euclidean")
+        link = linkage(distances, "average")
+        return cutreeHybrid(link, distances)['labels']
+
+    @staticmethod
+    def cluster_kmeans(X, min_c=2, max_c=10):
+        """Cluster X with k-means, choosing k by best silhouette score.
+
+        Args:
+            X: samples to cluster, passed to KMeans.fit_predict.
+            min_c: smallest number of clusters tried (inclusive).
+            max_c: upper bound on the number of clusters (exclusive).
+
+        Returns:
+            Labels from the fit that reached the highest silhouette score.
+
+        Raises:
+            ValueError: if range(min_c, max_c) is empty.
+        """
+        if min_c >= max_c:
+            raise ValueError(f"empty range: min_c={min_c}, max_c={max_c}")
+
+        best_labels = None
+        sil_score_max = float("-inf")  # any real score (>= -1) beats this
+        for n_clusters in range(min_c, max_c):
+            model = KMeans(n_clusters=n_clusters, init='k-means++',
+                           max_iter=100, n_init=1)
+            labels = model.fit_predict(X)
+            sil_score = silhouette_score(X, labels)
+            print(
+                "The average silhouette score for %i clusters is %0.2f" % (
+                n_clusters, sil_score))
+            if sil_score > sil_score_max:
+                sil_score_max = sil_score
+                # Keep the scored labels: an unseeded refit (n_init=1)
+                # may not reproduce the clustering that earned the score.
+                best_labels = labels
+
+        return best_labels
+
+    def compare_before_after(self):
+        """Placeholder for a df of per-clone cell counts before and after.
+
+        Notes kept from the original stub:
+        df.at[ind, "Dominant Before"] = (full_sim.clone_cell == 1).sum()
+        df.at[ind, "Dominant After"]
+        = (full_sim.subsample_new_clone_cell == 1).sum()
+        """
+        return
+
+    def cluster_compare_before_after(self):
+        """Placeholder: compare how well clustering regroups each clone."""
+        return
+
+    @staticmethod
+    def plot_cluster(cell_af, cell_meta=None, mt_meta=None, f_save=None):
+        """Plot cell_af through mplh's cluster_help.plot_cluster.
+
+        cell_meta / mt_meta go in as row / column metadata, f_save as fsave.
+        """
+        ch.plot_cluster(cell_af, row_meta=cell_meta, col_meta=mt_meta,
+                        fsave=f_save, to_col_clust=False, to_z=True)
+
+
+
+def main():
+    """Script entry point; currently does nothing."""
+
+
+if __name__ == "__main__":
+    main()
+
diff --git a/src/simulations/fullsimulation.py b/src/simulations/fullsimulation.py
@@ -60,11 +60,11 @@ def __init__(self, params_f):
         self.f_save_data = os.path.join(self.data_outdir,
                                    self.params['name'] + '.p')
         self.f_save = os.path.join(self.outdir, self.params['name'] + '.p')
-        self.f_save_metrics = self.f_save.replace('.p', '.metrics.tsv')
-        self.f_save_cluster = self.f_save.replace('.p', '.cluster.tsv')
-        self.f_save_befaft = self.f_save.replace('.p', '.before_after.tsv')
-        self.f_save_rocs = self.f_save.replace('.p', '.rocs.p')
 
+        self.f_save_metrics = self.f_save_data.replace('.p', '.metrics.tsv')
+        self.f_save_cluster = self.f_save_data.replace('.p', '.cluster.tsv')
+        self.f_save_befaft = self.f_save_data.replace('.p', '.before_after.tsv')
+        self.f_save_rocs = self.f_save_data.replace('.p', '.rocs.p')
 
         return
         #for i in self.n_iter:

diff --git a/src/simulations/parametersweep.py b/src/simulations/parametersweep.py
@@ -5,7 +5,6 @@
 import pickle
 import seaborn as sns
 from pandarallel import pandarallel
-pandarallel.initialize(nb_workers=32)
 import matplotlib.pyplot as plt
 
 from mplh.color_utils import get_colors
@@ -19,13 +18,8 @@
 from sklearn.model_selection import ParameterGrid
 from src.simulations.utils.config import check_required
 
-
 from .fullsimulation import FullSimulation
 
-""" Run the simulation similar to in Extended Data Fig 3 from 
-    Massively parallel single-cell mitochondrial DNA genotyping and chromatin profiling"""
-
-
 def replace_item(obj, key, replace_value):
     # https: // stackoverflow.com / a / 45335542
     for k, v in obj.items():
@@ -66,14 +60,16 @@ def __init__(self, default_params_f, sweep_params_f):
         self.default_params_f = default_params_f
         self.sweep_params = sweep_params
         self.default_params = params
-
+        self.save_sim = sweep_params['save_sim']
+        print(self.save_sim)
         # Create the yaml files in the directory indicated by local_outdir and prefix in sweep_params_f
         self.outdir = os.path.join(sweep_params["outdir"], sweep_params["prefix"])
         if not os.path.exists(self.outdir):
             os.makedirs(self.outdir)
         self.f_save = \
             os.path.join(self.outdir,self.sweep_params['prefix'] + '.p')
-        self.tmp_f_save = self.f_save.replace('.p', '') + '_tmp.p'
+
+        #self.tmp_f_save = self.f_save.replace('.p', '') + '_tmp.p'
 
         self.data_outdir = os.path.join(sweep_params['data_outdir'], sweep_params["prefix"])
         if not os.path.exists(self.data_outdir):
@@ -89,7 +85,7 @@ def __init__(self, default_params_f, sweep_params_f):
             # Create the name
             params['name'] = str(ind)
             params['data_outdir'] = self.data_outdir
-            params['local_outdir'] = self.sweep_params["outdir"]
+            params['local_outdir'] = self.outdir
             params['prefix'] = self.sweep_params["prefix"]
             for name, v in val.iteritems():
                 # Set the specific variables that need to be updated
@@ -111,13 +107,15 @@ def __init__(self, default_params_f, sweep_params_f):
 
 
     @staticmethod
-    def run_single_sweep(f, outdir):
+    def run_single_sweep(f, outdir, save=True):
         print(f'Running file {f}')
         params_f = os.path.join(outdir, str(f) + '.yaml')
         sim = FullSimulation(params_f)
         sim.run()
         sim.run_metrics()
-        sim.save()
+
+        if save:
+            sim.save()
         return sim.metrics
 
     def run_sweep(self, subset=None):
@@ -138,25 +136,10 @@ def run_sweep(self, subset=None):
         ###
         #sweep_results_df = sweep_results_df.apply(self.run_single_sweep, args=(self.outdir,))
         pandarallel.initialize(nb_workers=self.sweep_params['cpus'])
-        sweep_results_df = sweep_results_df.parallel_apply(self.run_single_sweep, args=(self.outdir,))
+        sweep_results_df = sweep_results_df.parallel_apply(self.run_single_sweep, args=(self.outdir, self.save_sim))
         ###
 
         self.sweep_results = sweep_results_df
-
-        # sweep_results = dict()
-        #
-        # for f, val in params_df.iterrows():
-        #     params_f = os.path.join(self.outdir, str(f) +'.yaml')
-        #     print(f"Running with file: {f}")
-        #     sim = FullSimulation(params_f)
-        #     sim.run()
-        #     sim.run_metrics()
-        #     sim.save()
-        #
-        #     # Only store the current metrics, not the entire simulation.
-        #     sweep_results[f] = sim.metrics
-        #     #self.save(f_save=self.tmp_f_save)
-        # self.sweep_results = sweep_results
         return