From c02b7f1ce0a2d4c0e124bb5126e5604363b90296 Mon Sep 17 00:00:00 2001 From: Josiah Seaman Date: Thu, 22 Aug 2019 16:37:57 +0100 Subject: [PATCH] #23 WIP: Converting Haploblocker, automating transitions simplifies the code. But zoom filtering is becoming cumbersome. --- Graph/models.py | 15 +++----- HaploBlocker/haplonetwork.py | 41 ++++++++++++--------- HaploBlocker/tests.py | 71 +++++++++++++++++++----------------- 3 files changed, 66 insertions(+), 61 deletions(-) diff --git a/Graph/models.py b/Graph/models.py index 70ae0b6..6e18101 100644 --- a/Graph/models.py +++ b/Graph/models.py @@ -105,19 +105,16 @@ def __hash__(self): def to_gfa(self, segment_id: int): return '\t'.join(['S', str(segment_id), self.seq]) - @property - def specimens(self): - return self.nodetraversal_set + def specimens(self, zoom_level) -> Set[int]: + return self.nodetraversal_set.filter(path_zoom=zoom_level).values_list('path_id', flat=True) - @property - def upstream(self) -> Set[int]: - traverses = self.nodetraversal_set.all() # values_list('node__id', flat=True) + def upstream(self, zoom_level) -> Set[int]: + traverses = self.nodetraversal_set.filter(path_zoom=zoom_level) #.values_list('node_id', flat=True) # Node.objects.filter(id__in=traverses).values_list('id', flat=True) return set(t.upstream_id() for t in traverses) - @property - def downstream(self) -> Set[int]: - traverses = self.nodetraversal_set.all() + def downstream(self, zoom_level) -> Set[int]: + traverses = self.nodetraversal_set.filter(path_zoom=zoom_level).all() return set(t.downstream_id() for t in traverses) def __repr__(self): diff --git a/HaploBlocker/haplonetwork.py b/HaploBlocker/haplonetwork.py index 15355aa..a536648 100644 --- a/HaploBlocker/haplonetwork.py +++ b/HaploBlocker/haplonetwork.py @@ -7,7 +7,7 @@ import numpy as np from collections import defaultdict from copy import copy -from Graph.models import Node +from Graph.models import Node, Path BLOCK_SIZE = 20 FILTER_THRESHOLD = 4 @@ -33,7 +33,7 @@ def
signature(individual, start_locus): return tuple(individual[start_locus: start_locus + BLOCK_SIZE]) -def get_unique_signatures(individuals, start_locus, current_graph): +def nodes_from_unique_signatures(individuals, start_locus, current_graph): """A signature is a series of BLOCK_SIZE SNPs inside of a locus. We want to know how many unique signatures are present inside of one locus. A Node is created for each unique signature found. @@ -44,41 +44,46 @@ def get_unique_signatures(individuals, start_locus, current_graph): for individual in individuals: sig = signature(individual, start_locus) if sig not in unique_blocks: - unique_blocks[sig] = Node(name=f'{len(unique_blocks)}:{start_locus // BLOCK_SIZE}-{start_locus // BLOCK_SIZE}', + unique_blocks[sig] = Node.objects.create( # saves to Database + name=f'{len(unique_blocks)}:{start_locus // BLOCK_SIZE}-{start_locus // BLOCK_SIZE}', seq=''.join(str(x) for x in sig), graph=current_graph) return unique_blocks -def get_all_signatures(alleles, individuals, current_graph): - unique_signatures = [] - for locus_start in range(0, len(alleles) - BLOCK_SIZE, BLOCK_SIZE): # discards remainder - sig = get_unique_signatures(individuals, locus_start, current_graph) - unique_signatures.append(sig) - return unique_signatures +def build_all_slices(alleles, individuals, current_graph): + """Each item in this list is a slice, representing all the possible states for one locus. + Inside a slice is a set of Nodes, one for each unique 'signature' or sequence state. + Paths that all have the same state in this slice all reference the same Node object.""" + slices = [] + for slice_start in range(0, len(alleles) - BLOCK_SIZE, BLOCK_SIZE): # discards remainder + nodes = nodes_from_unique_signatures(individuals, slice_start, current_graph) + slices.append(nodes) + return slices -def build_individuals(individuals, unique_signatures): - """Describes an individual as a list of Nodes that individual visits. 
- simplified_individuals is a list of loci which contain a list of Nodes which each contain specimen +def build_paths(individuals, unique_signatures): + """Describes an individual as a Path (list of Nodes) that the individual visits (NodeTraversals). + accessions is a list of loci which contain a list of Nodes which each contain specimen build nodes: [0] first 4 are the 4 starting signatures in window 0. Nodes represent a collection of individuals with the same signature at that locus For each node list which individuals are present at that node""" - simplified_individuals = [] + # TODO: It may be more performant to merge build_all_slices and build_paths so that global lists are never stored + accessions = [] for i_specimen, specimen in enumerate(individuals): - my_simplification = [] + my_path = Path.objects.create(accession=str(i_specimen)) for w, window in enumerate(unique_signatures): # the length of the genome sig = signature(specimen, w * BLOCK_SIZE) - my_simplification.append(unique_signatures[w][sig]) - simplified_individuals.append(my_simplification) - return simplified_individuals + my_path.append_node(unique_signatures[w][sig], '+') + accessions.append(my_path) + return accessions def populate_transitions(simplified_individuals): """ List transition rates from one node to all other upstream and downstream. This method populates Node.specimens and begins the process of side-effecting Nodes. 
- To rebuild a fresh Graph copy, you must start at get_all_signatures() + To rebuild a fresh Graph copy, you must start at build_all_slices() :param simplified_individuals: """ for i, indiv in enumerate(simplified_individuals): diff --git a/HaploBlocker/tests.py b/HaploBlocker/tests.py index 665d84c..f023385 100644 --- a/HaploBlocker/tests.py +++ b/HaploBlocker/tests.py @@ -2,14 +2,14 @@ import Graph.utils import Graph.views -from Graph.models import GraphGenome +from Graph.models import GraphGenome, Path from vgbrowser.settings import BASE_DIR import unittest import os # Create your tests here. # from HaploBlocker.models import Node, Path, Edge from HaploBlocker.haplonetwork import Node, split_one_group -from HaploBlocker.haplonetwork import read_data, get_all_signatures, build_individuals, get_unique_signatures, \ +from HaploBlocker.haplonetwork import read_data, build_all_slices, build_paths, nodes_from_unique_signatures, \ populate_transitions, simple_merge, neglect_nodes, split_groups # @@ -27,21 +27,20 @@ class HaploTest(unittest.TestCase): @classmethod def setUpClass(self) -> None: """Reads the input data file once. Tests that need a fresh graph must - call create_nodes()""" + call create_graph()""" print(os.getcwd()) self.alleles, self.individuals = read_data(os.path.join(BASE_DIR, "test_data/KE_chromo10.txt")) - def create_nodes(self, graph_name): - """Tests that need a fresh graph must call create_nodes() FIRST! + def create_graph(self, graph_name): + """Tests that need a fresh graph must call create_graph() FIRST! Graph summarization works by side effecting Node objects. Tests can not run independently with order dependent side effects. This method is slow, so don't use it unless you need it. 
:param graph_name: """ graph = GraphGenome.objects.create(name=graph_name) - self.unique_signatures = get_all_signatures(self.alleles, self.individuals, graph) - self.simplified_individuals = build_individuals(self.individuals, self.unique_signatures) - # G = build_graph(simplified_individuals) - populate_transitions(self.simplified_individuals) + slices = build_all_slices(self.alleles, self.individuals, graph) + self.paths = build_paths(self.individuals, slices) + return graph def test_read(self): @@ -56,29 +55,30 @@ def test_build_individuals(self): def internal_build_individuals(self, alleles, individuals): graph = GraphGenome.objects.create(name='internal_build_individuals') - unique_signatures = get_all_signatures(alleles, individuals, graph) + unique_signatures = build_all_slices(alleles, individuals, graph) peek = repr(list(unique_signatures[21].values())) assert peek == '[N0:21-21(00002202022222220202), N1:21-21(00202202022222220202), N2:21-21(00022200000000000000), N3:21-21(00000000000000000000), N4:21-21(00002200000000000000), N5:21-21(00022202022220020002), N6:21-21(02000000000000000000), N7:21-21(00002202222220020022)]', peek - simplified_individuals = build_individuals(individuals, unique_signatures) + simplified_individuals = build_paths(individuals, unique_signatures) peek = repr(simplified_individuals[500][:100]) assert peek == '[N2:0-0(00000000000000000000), N2:1-1(00000000000000000000), N2:2-2(00000000000000000000), N2:3-3(00000000000000000000), N2:4-4(00000000000000000000), N2:5-5(00000000000000000000), N3:6-6(00000000000000000000), N3:7-7(00000000000000000000), N3:8-8(00000000000000000000), N2:9-9(00000000000000000000), N0:10-10(00000000000000000000), N1:11-11(00000000000000000000), N2:12-12(00000000000000000000), N2:13-13(00000000000000000000), N2:14-14(00000000000000000000), N2:15-15(00000000000000000000), N3:16-16(00000000000000000000), N3:17-17(00000000000000000000), N4:18-18(00000000000000000000), N3:19-19(00000000000000000000), 
N5:20-20(00000000000000000000), N3:21-21(00000000000000000000), N3:22-22(00000000000000000000), N10:23-23(00200000000000000000), N4:24-24(00002200222220002000), N3:25-25(02000222220002020222), N4:26-26(20022000002002220002), N3:27-27(22222202020222220000), N1:28-28(00000000000000000000), N1:29-29(00000000000000000022), N4:30-30(00002222222000002200), N3:31-31(00022222202000000000), N21:32-32(00000020202200022020), N1:33-33(02202220022020222000), N1:34-34(00020002000202222002), N1:35-35(22220002220022200022), N1:36-36(22222200000000000000), N1:37-37(00202002222220000200), N1:38-38(00000200000202022200), N1:39-39(02202000202202220000), N1:40-40(00020222200020000020), N1:41-41(20220020022200022200), N1:42-42(00000000000000000000), N1:43-43(00000000000000000000), N1:44-44(00000000000000000000), N1:45-45(00000000000000000000), N1:46-46(00000002220220020020), N1:47-47(00202220222220222202), N1:48-48(00000000000000000002), N1:49-49(20002200000002220022), N1:50-50(22020002002020202022), N1:51-51(02202222220222202000), N1:52-52(20000020000000000000), N1:53-53(00000000000000000000), N1:54-54(00000000000000000000), N1:55-55(00220220000200000220), N1:56-56(20000000202022022020), N1:57-57(20222022022202222220), N1:58-58(22022202222222020200), N1:59-59(22202200202220202220), N1:60-60(22020022220200022022), N1:61-61(20202220000220000220), N1:62-62(00022002000000000000), N1:63-63(00000220000000000000), N1:64-64(00000000000220200000), N1:65-65(00022020200000020022), N1:66-66(20020222222020200020), N1:67-67(00000000000000202222), N1:68-68(22222222000202222202), N1:69-69(22022222020020000022), N1:70-70(00002002220022222200), N1:71-71(22002020020202000000), N1:72-72(00022202000202220020), N1:73-73(22000000000000200020), N1:74-74(22220222220200202202), N1:75-75(00022202222200000000), N1:76-76(00000220220200200022), N1:77-77(02200202020020200000), N0:78-78(00002000000000000000), N0:79-79(00000000000000000000), N1:80-80(00000000000022220000), N1:81-81(00000000000000000000), 
N1:82-82(00022220200202202202), N1:83-83(20202222200202202202), N1:84-84(00000020000000000000), N1:85-85(00222022020000000002), N1:86-86(22020222020222222000), N1:87-87(00022222002020222022), N1:88-88(00002222000000000200), N1:89-89(00000000000000220022), N1:90-90(22020202200020222220), N1:91-91(00002000002220002222), N1:92-92(22200000000000000000), N1:93-93(00000000000000000000), N1:94-94(00202022200202222222), N1:95-95(22222202202020222222), N1:96-96(00222220200202222020), N1:97-97(22002202220222222022), N0:98-98(20222222222222020220), N0:99-99(20222222220222222002)]', peek assert len(simplified_individuals) == 501 and len(simplified_individuals[60]) == 1638 def test_get_unique_signatures(self): - unique_blocks = get_unique_signatures(self.individuals, 0) + graph = GraphGenome.objects.create(name='test_get_unique_signatures') + unique_blocks = nodes_from_unique_signatures(self.individuals, 0, graph) assert len(unique_blocks) == 4 - assert unique_blocks.__repr__() == '{(0, 2, 0, 0, 2, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 0, 0): N0(0, 0), ' \ - '(0, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2): N1(0, 0), ' \ - '(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0): N2(0, 0), ' \ - '(2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2): N3(0, 0)}' + peek = repr(list(unique_blocks.values())) + assert peek == \ + '[N0:0-0(02002020002000220000), N1:0-0(00220202220222002222), ' \ + 'N2:0-0(00000000000000000000), N3:0-0(20220202220222002222)]', peek @unittest.skip def test_no_duplicate_nodes(self): - self.create_nodes('test') + graph = self.create_graph('test') unique_nodes = set() duplicates_found = 0 - for locus in self.simplified_individuals: + for locus in self.paths: for node in locus: # assert isinstance(node, Node) if node in unique_nodes: # If two nodes have the same __hash__ they'll be "in" @@ -89,13 +89,17 @@ def test_no_duplicate_nodes(self): assert duplicates_found == 0, f"Found {duplicates_found} duplicated nodes in the graph" - 
def _test_simple_merge(self, all_nodes): - assert len(all_nodes) == 7180 - summary1 = simple_merge(all_nodes) - assert len(summary1) == 3690 - return summary1 + def _test_simple_merge(self, graph, zoom_level): + # these tests could be made independent of test_workflow, but it would be slower + assert graph.highest_zoom_level() == zoom_level + assert len(graph) == 7180 + status = simple_merge(graph) + assert Path.objects.filter(graph=graph, zoom=zoom_level + 1).count() == \ + Path.objects.filter(graph=graph, zoom=zoom_level + 0).count() + assert Path.objects.filter(graph=graph, zoom=zoom_level+1).count() == 3690 + return status - def _test_neglect_nodes(self, all_nodes): + def _test_neglect_nodes(self, all_nodes, zoom_level): summary2 = neglect_nodes(all_nodes) assert len(summary2) == 2854 unchanged = neglect_nodes(summary2, 0) @@ -126,25 +130,24 @@ def test_split_one_group(self): assert new_node in g.node('96').upstream and g.node('95') in g.node('96').upstream assert g.node('94') not in g.node('96').upstream - def _test_split_groups(self, all_nodes): - summary3 = split_groups(all_nodes) + def _test_split_groups(self, graph, zoom_level): + summary3 = split_groups(graph) assert len(summary3) > 10 return summary3 def test_workflow(self): - self.create_nodes('test') - all_nodes = [node for window in self.unique_signatures for node in window.values()] # think about referencing and deletion - summary1 = self._test_simple_merge(all_nodes) - summary2 = self._test_neglect_nodes(summary1) - summary3 = self._test_split_groups(summary2) + graph = self.create_graph('test') + summary1 = self._test_simple_merge(graph, 0) + summary2 = self._test_neglect_nodes(graph, 1) + summary3 = self._test_split_groups(graph, 2) assert len(summary1) > len(summary2) > len(summary3), "Each summarization should result in less nodes" - summary4 = simple_merge(summary3) + summary4 = simple_merge(summary3, 3) bad = summary3[2] print(bad.details()) - # test_signatures = get_all_signatures(alleles,
individuals) - # test_individuals = build_individuals(individuals, test_signatures) + # test_signatures = build_all_slices(alleles, individuals) + # test_individuals = build_paths(individuals, test_signatures) # populate_transitions(test_individuals) # no return val # # test1 = test_simple_merge(test_signatures)