Update #20 kmeans and automated elbow method (with option to pre-defi…

…ne which cluster for speed)
LewisLabUCSD · Nov 1, 2020 · bcf8449 · bcf8449
1 parent 89eb0b1
commit bcf8449
Show file tree

Hide file tree

Showing 28 changed files with 709 additions and 157 deletions.
diff --git a/Analysis/simulations/binomial_growth/results/sweep_v02/._dropout.png b/Analysis/simulations/binomial_growth/results/sweep_v02/._dropout.png
diff --git a/Analysis/simulations/binomial_growth/results/sweep_v02/._growth_before_after.png b/Analysis/simulations/binomial_growth/results/sweep_v02/._growth_before_after.png
diff --git a/Analysis/simulations/binomial_growth/results/sweep_v02/._precision.png b/Analysis/simulations/binomial_growth/results/sweep_v02/._precision.png
diff --git a/docs/commands.rst b/docs/commands.rst
@@ -7,4 +7,10 @@ The Makefile contains the central entry points for common tasks related to this
 .. autoclass:: src.simulations.parametersweep.ParameterSweep
    :members: 
 
+.. autoclass:: src.simulations.fullsimulation.FullSimulation
+
+
 .. autoclass:: src.simulations.simulation.Simulation
+
+.. autoclass:: src.simulations.analysis.Analysis
+    :members:
diff --git a/docs/conf.py b/docs/conf.py
@@ -21,7 +21,7 @@
 print('and here', os.path.abspath('./'))
 
 sys.path.insert(0, os.path.abspath('../'))
-sys.path.insert(0, os.path.abspath('./'))
+#sys.path.insert(0, os.path.abspath('./'))
 
 # -- General configuration -----------------------------------------------------
 

diff --git a/docs/manuscript_simuations.md b/docs/manuscript_simuations.md
@@ -0,0 +1,14 @@
+## Introduction
+In order to determine if we have a sufficient number of reads, cells, and power to detect CHIP expansion, we generated simulated data based on mitochondrial mutation rate, cell division rates, and clone sizes. 
+We hope that this analysis will help answer two questions: 
+a) Is the mutation rate of the MTs high enough to detect different clones? 
+b) Can we properly distinguish clonal expansion in certain dominant CHIP clones from noise that may occur in the non-CHIP donor samples (unless other clones do expand, in which case we would want to learn about that as well)?
+
+
+## Methods
+
+
+## Results
+
+
+## Future Directions
diff --git a/docs/scripts/rule_description.py b/docs/scripts/rule_description.py
@@ -0,0 +1,96 @@
+"""
+This script auto-generates the snakemake workflow docs.
+Adapted from https://github.com/vanheeringen-lab/seq2science/blob/master/docs/scripts/rule_description.py
+"""
+import yaml
+import os
+import re
+
+
+from src.config import ROOT_DIR
+os.chdir(ROOT_DIR)
+
+# Header of the generated Markdown page. The per-rule sections are appended to
+# this string further down, and the result is written to
+# docs/workflow/all_rules.md. The trailing backslashes inside the literal join
+# the wrapped source lines into one paragraph.
+final_md = (
+"""\
+# Per rule explanation
+
+This is an automatically generated list of all supported rules, their docstrings, and command. At the start of each \
+workflow run a list is printed of which rules will be run. And while the workflow is running it prints which rules are \
+being started and finished. This page is here to give an explanation to the user about what each rule does, and for \
+developers to find what is, and isn't yet supported.
+
+"""
+)
+
+# Directory and file name of the Snakefile to document. They are concatenated
+# as-is when the file is opened, and resolved against the working directory
+# (set to ROOT_DIR by the os.chdir call above).
+path = "workflows/"
+rules_file = "simulation.snakefile"
+
+def get_dirty_docstrings(string):
+    """Extract the raw docstring of every documented rule in Snakefile text.
+
+    A docstring is the triple-double-quoted block that directly follows a
+    ``rule <name>:`` header, with only whitespace in between.
+
+    :param string: full text of a Snakefile.
+    :return: dict mapping rule name to the unprocessed text between the triple
+        quotes (indentation and surrounding newlines included). Rules without
+        a docstring are left out. If a name occurs twice, the last match wins.
+    """
+    # Raw literal: "\s" and "\S" are regex classes, not string escapes.
+    # Only whitespace may separate the header from the opening quotes. A lazy
+    # "anything" gap here lets a rule that has no docstring swallow the next
+    # rule's docstring, which also hides that next rule from the result.
+    splitter = re.compile(r'rule (.*):\s*"""([\s\S]*?)"""', re.MULTILINE)
+    return {match.group(1): match.group(2) for match in splitter.finditer(string)}
+
+
+def cleanup_docstring(dirty):
+    """Remove source indentation from raw rule docstrings.
+
+    The indentation width is measured on the second line of each docstring
+    (index 1 after splitting on newlines).
+
+    :param dirty: dict mapping rule name to raw docstring text.
+    :return: new dict with the same keys and the cleaned docstrings, with
+        leading and trailing newlines stripped.
+    """
+    clean = {}
+    for rule, docstring in dirty.items():
+        lines = docstring.split("\n")
+        # A one-line docstring has no second line; treat it as unindented
+        # instead of raising IndexError.
+        firstline = lines[1] if len(lines) > 1 else ""
+
+        indentation = len(firstline) - len(firstline.lstrip())
+        # NOTE(review): both replacements hit every run of that many spaces,
+        # not only the ones at the start of a line.
+        docstring = docstring.replace(" " * indentation, "")
+        # Presumably clears what is left on lines indented one level (4 spaces)
+        # less, such as the line with the closing quotes. It is a no-op when
+        # indentation <= 4, because the search string is then empty.
+        docstring = docstring.replace(" " * (indentation - 4), "")
+        clean[rule] = docstring.strip("\n")
+
+    return clean
+
+
+def cleanup_shell(dirty):
+    """Remove source indentation from raw rule shell commands.
+
+    The indentation width is measured on the second line of each command
+    (index 1 after splitting on newlines). That many spaces are removed once
+    per line, at the first place they occur.
+
+    :param dirty: dict mapping rule name to raw shell command text.
+    :return: new dict with the same keys and the dedented commands, with
+        leading and trailing newlines stripped.
+    """
+    clean = {}
+    for rule, shell in dirty.items():
+        lines = shell.split("\n")
+        # A one-line command has no second line; treat it as unindented
+        # instead of raising IndexError.
+        firstline = lines[1] if len(lines) > 1 else ""
+
+        indentation = len(firstline) - len(firstline.lstrip())
+        prefix = " " * indentation
+        command = "\n".join(line.replace(prefix, "", 1) for line in lines)
+        clean[rule] = command.strip("\n")
+
+    return clean
+
+
+def get_dirty_shell(string):
+    """Extract the raw shell command of every rule in Snakefile text.
+
+    For each ``rule <name>:`` header, captures the text inside the first
+    triple-double-quoted block that follows a ``shell:`` keyword. The single
+    character right after the opening quotes (normally a newline) is skipped.
+
+    :param string: full text of a Snakefile.
+    :return: dict mapping rule name to the unprocessed command text. If a
+        name occurs twice, the last match wins.
+    """
+    # Raw literal: "\s" and "\S" are regex classes, not string escapes.
+    splitter = re.compile(
+        r'rule (.*):[\s\S]*?shell:[\s\S]*?"""[\s\S]([\s\S]*?)"""', re.MULTILINE
+    )
+    shell_cmds = {}
+    # Searching chunk by chunk keeps a lazy match from running past two blank
+    # lines; presumably rules are separated that way in the Snakefile.
+    for substring in string.split("\n\n\n"):
+        for match in splitter.finditer(substring):
+            shell_cmds[match.group(1)] = match.group(2)
+    return shell_cmds
+
+
+# Per-rule docstrings and shell commands parsed from the Snakefile.
+all_rules_doc = {}
+all_rules_shell = {}
+
+with open(path + rules_file, 'r') as file:
+    text = file.read()
+all_rules_shell.update(cleanup_shell(get_dirty_shell(text)))
+all_rules_doc.update(cleanup_docstring(get_dirty_docstrings(text)))
+
+# One "#### <rule>" section per rule that has a docstring, in sorted order.
+# The shell command, when one was found, goes in a fenced block below it.
+sections = []
+for rule in sorted(all_rules_doc):
+    sections.append(f"#### {rule}\n{all_rules_doc[rule]}\n")
+    if rule in all_rules_shell:
+        sections.append(f"```\n{all_rules_shell[rule]}\n```\n")
+    sections.append("\n")
+final_md += "".join(sections)
+
+with open("docs/workflow/all_rules.md", "w") as text_file:
+    text_file.write(final_md)
diff --git a/docs/simuations.md b/docs/simuations.md
diff --git a/docs/simulations/simple_v01.md b/docs/simulations/simple_v01.md
diff --git a/docs/workflow/._all_rules.md b/docs/workflow/._all_rules.md
diff --git a/docs/workflow/all_rules.md b/docs/workflow/all_rules.md
@@ -0,0 +1,33 @@
+# Per rule explanation
+
+This is an automatically generated list of all supported rules, their docstrings, and command. At the start of each workflow run a list is printed of which rules will be run. And while the workflow is running it prints which rules are being started and finished. This page is here to give an explanation to the user about what each rule does, and for developers to find what is, and isn't yet supported.
+
+#### all
+ Call variants using one of a few types. 
+
+:type type = {'thresh', 'GMM', 'mgatk'}
+Modes: 
+thresh: Sets a simple threshold for number of cells and reads a
+variant needs to be called a variant. 
+GMM: Gaussian mixture model for quality of the variants. 
+mgatk: This will use strand concordance, allele quality, and 
+number of reads to determine the correct threshold. This only
+works with information on aligned read strands in scATAC. 
+
+Include a model for known variants, such as transversion
+
+#### model_call_lineages
+ Looks at performance for cell-clone relationship
+and MTvariant-clone relationship
+
+If a tree and not cluster labels are given, will assume MT lineages
+are based on variants, 
+
+#### model_lineage_growth_estimate
+ Will estimate the growth rate based on the number of cells in
+the before and after experiments.
+
+#### performance_growth_estimate
+ Compare the growth of the largest clones between the disease and
+non-disease samples. 
+
diff --git a/notebooks/mgatk_test/parameters/2020_09_07_mt_v01.yaml b/notebooks/mgatk_test/parameters/2020_09_07_mt_v01.yaml
@@ -0,0 +1,4 @@
+bam_f: /data2/isshamie/mito_lineage/data/processed/2020_09_07_Croker_mito/CD34_mt_fiveprime/outs/possorted_genome_bam.bam
+bam_dir: /data2/isshamie/mito_lineage/data/processed/2020_09_07_Croker_mito/CD34_mt_fiveprime/outs/
+outdir: results/
+feature_barcodes: /data2/isshamie/mito_lineage/data/processed/2020_09_07_Croker_mito/CD34_mt_fiveprime/outs/filtered_feature_bc_matrix/barcodes.tsv