Skip to content
This repository has been archived by the owner on Mar 20, 2020. It is now read-only.

Commit

Permalink
#26 Simple_merge test cases improved. internal_build_individuals() is…
Browse files Browse the repository at this point in the history
… working.
  • Loading branch information
josiahseaman committed Aug 27, 2019
1 parent bc49253 commit 59c18fd
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 22 deletions.
4 changes: 3 additions & 1 deletion Graph/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,8 @@ def append_node_to_path(self, node_id, strand, path_name, zoom) -> None:
def node(self, node_name):
    """Look up the single Node with the given name belonging to this graph."""
    return Node.objects.get(graph=self, name=node_name)

def highest_zoom_level(self):
    """Return the largest zoom value among this graph's ZoomLevels.

    NOTE(review): if the graph has no ZoomLevels, ``first()`` returns None
    and this raises AttributeError — confirm callers guarantee at least one.
    """
    # .all() before order_by() is redundant in Django; the chain is equivalent.
    top_level = self.zoomlevel_set.order_by('-zoom').first()
    return top_level.zoom


class Node(models.Model):
Expand Down Expand Up @@ -192,7 +194,7 @@ def to_gfa(self, segment_id: int):
return '\t'.join(['S', str(segment_id), self.seq])

def specimens(self, zoom_level) -> List[int]:
    """Return the ids of every Path that traverses this node at `zoom_level`.

    Bug fix: Django's QuerySet method is ``values_list``, not ``value_list``;
    the previous spelling raised AttributeError at runtime.
    """
    return list(self.nodetraversal_set
                .filter(path_zoom=zoom_level)
                .values_list('path_id', flat=True))

def upstream_ids(self, zoom_level) -> Set[int]:
"""Returns the node ids that are upstream of this node."""
Expand Down
36 changes: 23 additions & 13 deletions HaploBlocker/haplonetwork.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,8 @@ def build_paths(individuals, unique_signatures, graph):
for i_specimen, specimen in enumerate(individuals):
my_path = Path.objects.create(accession=str(i_specimen), graph=graph)
my_sigs = [unique_signatures[w][signature(specimen, w * BLOCK_SIZE)] for w in range(len(unique_signatures))]
traverses = [NodeTraversal(node=sig, path=my_path, strand='+') for sig in my_sigs]
NodeTraversal.objects.bulk_create(traverses, 1000)
# for w, window in enumerate(unique_signatures): # the length of the genome
# sig = signature(specimen, w * BLOCK_SIZE)
# my_path.append_node(unique_signatures[w][sig], '+')
traverses = [NodeTraversal(node=sig, path=my_path, strand='+', order=i) for i, sig in enumerate(my_sigs)]
NodeTraversal.objects.bulk_create(traverses, 100)
accessions.append(my_path)
print(f"Done building {len(accessions)}Paths")
return accessions
Expand Down Expand Up @@ -142,8 +139,7 @@ def simple_merge(current_level: ZoomLevel) -> ZoomLevel:
"""
#TODO: Paths start fully populated with redundant NodeTraversals. Editing NodeTraversals,
# moves to newly created Nodes. Global bool for whether or not a particular path was modified.
zoom = current_level.zoom
next_level = ZoomLevel.objects.create(graph=current_level.graph, zoom=zoom + 1)
next_level, zoom = prep_next_summary_layer(current_level)
for my_node in current_level.nodes():
# only one Node Downstream, no matter the number of specimens
if len(my_node.downstream_ids(zoom)) == 1:
Expand All @@ -167,6 +163,14 @@ def simple_merge(current_level: ZoomLevel) -> ZoomLevel:
return next_level


def prep_next_summary_layer(current_level):
    """Create and return the ZoomLevel one step above `current_level`.

    Returns a (next_level, zoom) tuple where `zoom` is the zoom of
    `current_level`, i.e. one below the newly created level.
    Only the topmost existing layer may be summarized.
    """
    graph = current_level.graph
    zoom = current_level.zoom
    assert graph.highest_zoom_level() == zoom, \
        "You should only be summarizing the topmost layer"
    next_level = ZoomLevel.objects.create(graph=graph, zoom=zoom + 1)
    return next_level, zoom


def delete_node(node, cutoff):
"""Changes references to this node to add to references to Node.NOTHING"""
if cutoff < 1:
Expand All @@ -183,6 +187,9 @@ def neglect_nodes(all_nodes, deletion_cutoff=FILTER_THRESHOLD):
"""Deletes nodes if they have too few specimens supporting them defined by
:param deletion_cutoff
:returns a new list of nodes lacking the pruned nodes in all_nodes"""

# next_level, zoom = prep_next_summary_layer(current_level)

nodes_to_delete = set()
for node in all_nodes:
if len(node.specimens) <= deletion_cutoff:
Expand All @@ -197,20 +204,20 @@ def split_one_group(prev_node, anchor, next_node):
""" Called when up.specimens == down.specimens
Comment: That is actually the case we want to split up to obtain longer blocks later
Extension of full windows will take care of potential loss of information later"""
my_specimens = copy(anchor.specimens) # important to copy or side effects occur
my_specimens = anchor.specimens() # important to copy or side effects occur
if not prev_node.is_nothing(): # normal case
my_specimens = my_specimens.intersection(prev_node.specimens)
my_specimens = my_specimens.intersection(prev_node.specimens())
if not next_node.is_nothing(): # normal case
my_specimens = my_specimens.intersection(next_node.specimens)
my_specimens = my_specimens.intersection(next_node.specimens())
if prev_node.is_nothing() and next_node.is_nothing(): # exceptional: both are nothing node
my_specimens = copy(anchor.specimens)
my_specimens = copy(anchor.specimens())
# removing all specimens that transition to nothing
for n in anchor.downstream.keys():
if n.is_nothing(): # remove dead leads
my_specimens -= n.specimens
my_specimens -= n.specimens()
for n in anchor.upstream.keys():
if n.is_nothing(): # remove dead leads
my_specimens -= n.specimens
my_specimens -= n.specimens()

my_upstream, my_downstream = copy(prev_node.upstream), copy(next_node.downstream)
if prev_node.is_nothing(): # Rare case
Expand Down Expand Up @@ -257,6 +264,9 @@ def split_groups(all_nodes: List[Node]):
TODO: Ideally, the database would retain some record of how many nucleotides are shared between
the two new haplotype nodes."""
new_graph = list(all_nodes)
# next_level, zoom = prep_next_summary_layer(current_level)


for node in all_nodes:
# check if all transition upstream match with one of my downstream nodes
if len(node.specimens) > 0:
Expand Down
20 changes: 12 additions & 8 deletions HaploBlocker/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,13 @@ def internal_build_individuals(self, alleles, individuals):
peek = repr(list(unique_signatures[21].values()))
assert peek == '[N0:21-21(00002202022222220202), N1:21-21(00202202022222220202), N2:21-21(00022200000000000000), N3:21-21(00000000000000000000), N4:21-21(00002200000000000000), N5:21-21(00022202022220020002), N6:21-21(02000000000000000000), N7:21-21(00002202222220020022)]', peek
simplified_individuals = build_paths(individuals, unique_signatures, graph)
peek = repr(simplified_individuals[500][:100])
assert peek == '[N2:0-0(00000000000000000000), N2:1-1(00000000000000000000), N2:2-2(00000000000000000000), N2:3-3(00000000000000000000), N2:4-4(00000000000000000000), N2:5-5(00000000000000000000), N3:6-6(00000000000000000000), N3:7-7(00000000000000000000), N3:8-8(00000000000000000000), N2:9-9(00000000000000000000), N0:10-10(00000000000000000000), N1:11-11(00000000000000000000), N2:12-12(00000000000000000000), N2:13-13(00000000000000000000), N2:14-14(00000000000000000000), N2:15-15(00000000000000000000), N3:16-16(00000000000000000000), N3:17-17(00000000000000000000), N4:18-18(00000000000000000000), N3:19-19(00000000000000000000), N5:20-20(00000000000000000000), N3:21-21(00000000000000000000), N3:22-22(00000000000000000000), N10:23-23(00200000000000000000), N4:24-24(00002200222220002000), N3:25-25(02000222220002020222), N4:26-26(20022000002002220002), N3:27-27(22222202020222220000), N1:28-28(00000000000000000000), N1:29-29(00000000000000000022), N4:30-30(00002222222000002200), N3:31-31(00022222202000000000), N21:32-32(00000020202200022020), N1:33-33(02202220022020222000), N1:34-34(00020002000202222002), N1:35-35(22220002220022200022), N1:36-36(22222200000000000000), N1:37-37(00202002222220000200), N1:38-38(00000200000202022200), N1:39-39(02202000202202220000), N1:40-40(00020222200020000020), N1:41-41(20220020022200022200), N1:42-42(00000000000000000000), N1:43-43(00000000000000000000), N1:44-44(00000000000000000000), N1:45-45(00000000000000000000), N1:46-46(00000002220220020020), N1:47-47(00202220222220222202), N1:48-48(00000000000000000002), N1:49-49(20002200000002220022), N1:50-50(22020002002020202022), N1:51-51(02202222220222202000), N1:52-52(20000020000000000000), N1:53-53(00000000000000000000), N1:54-54(00000000000000000000), N1:55-55(00220220000200000220), N1:56-56(20000000202022022020), N1:57-57(20222022022202222220), N1:58-58(22022202222222020200), N1:59-59(22202200202220202220), N1:60-60(22020022220200022022), N1:61-61(20202220000220000220), 
N1:62-62(00022002000000000000), N1:63-63(00000220000000000000), N1:64-64(00000000000220200000), N1:65-65(00022020200000020022), N1:66-66(20020222222020200020), N1:67-67(00000000000000202222), N1:68-68(22222222000202222202), N1:69-69(22022222020020000022), N1:70-70(00002002220022222200), N1:71-71(22002020020202000000), N1:72-72(00022202000202220020), N1:73-73(22000000000000200020), N1:74-74(22220222220200202202), N1:75-75(00022202222200000000), N1:76-76(00000220220200200022), N1:77-77(02200202020020200000), N0:78-78(00002000000000000000), N0:79-79(00000000000000000000), N1:80-80(00000000000022220000), N1:81-81(00000000000000000000), N1:82-82(00022220200202202202), N1:83-83(20202222200202202202), N1:84-84(00000020000000000000), N1:85-85(00222022020000000002), N1:86-86(22020222020222222000), N1:87-87(00022222002020222022), N1:88-88(00002222000000000200), N1:89-89(00000000000000220022), N1:90-90(22020202200020222220), N1:91-91(00002000002220002222), N1:92-92(22200000000000000000), N1:93-93(00000000000000000000), N1:94-94(00202022200202222222), N1:95-95(22222202202020222222), N1:96-96(00222220200202222020), N1:97-97(22002202220222222022), N0:98-98(20222222222222020220), N0:99-99(20222222220222222002)]', peek
assert len(simplified_individuals) == 501 and len(simplified_individuals[60]) == 1638
traverses = simplified_individuals[500].nodes.filter(order__lt=100) # [:100]
nodes = [t.node for t in traverses.prefetch_related('node').all()]
peek = repr(nodes)
self.maxDiff = None # tells the debugger to show the whole thing
self.assertEqual(peek, '[N2:0-0(00000000000000000000), N2:1-1(00000000000000000000), N2:2-2(00000000000000000000), N2:3-3(00000000000000000000), N2:4-4(00000000000000000000), N2:5-5(00000000000000000000), N3:6-6(00000000000000000000), N3:7-7(00000000000000000000), N3:8-8(00000000000000000000), N2:9-9(00000000000000000000), N0:10-10(00000000000000000000), N1:11-11(00000000000000000000), N2:12-12(00000000000000000000), N2:13-13(00000000000000000000), N2:14-14(00000000000000000000), N2:15-15(00000000000000000000), N3:16-16(00000000000000000000), N3:17-17(00000000000000000000), N4:18-18(00000000000000000000), N3:19-19(00000000000000000000), N5:20-20(00000000000000000000), N3:21-21(00000000000000000000), N3:22-22(00000000000000000000), N10:23-23(00200000000000000000), N4:24-24(00002200222220002000), N3:25-25(02000222220002020222), N4:26-26(20022000002002220002), N3:27-27(22222202020222220000), N1:28-28(00000000000000000000), N1:29-29(00000000000000000022), N4:30-30(00002222222000002200), N3:31-31(00022222202000000000), N21:32-32(00000020202200022020), N1:33-33(02202220022020222000), N1:34-34(00020002000202222002), N1:35-35(22220002220022200022), N1:36-36(22222200000000000000), N1:37-37(00202002222220000200), N1:38-38(00000200000202022200), N1:39-39(02202000202202220000), N1:40-40(00020222200020000020), N1:41-41(20220020022200022200), N1:42-42(00000000000000000000), N1:43-43(00000000000000000000), N1:44-44(00000000000000000000), N1:45-45(00000000000000000000), N1:46-46(00000002220220020020), N1:47-47(00202220222220222202), N1:48-48(00000000000000000002), N1:49-49(20002200000002220022), N1:50-50(22020002002020202022), N1:51-51(02202222220222202000), N1:52-52(20000020000000000000), N1:53-53(00000000000000000000), N1:54-54(00000000000000000000), N1:55-55(00220220000200000220), N1:56-56(20000000202022022020), N1:57-57(20222022022202222220), N1:58-58(22022202222222020200), N1:59-59(22202200202220202220), N1:60-60(22020022220200022022), N1:61-61(20202220000220000220), 
N1:62-62(00022002000000000000), N1:63-63(00000220000000000000), N1:64-64(00000000000220200000), N1:65-65(00022020200000020022), N1:66-66(20020222222020200020), N1:67-67(00000000000000202222), N1:68-68(22222222000202222202), N1:69-69(22022222020020000022), N1:70-70(00002002220022222200), N1:71-71(22002020020202000000), N1:72-72(00022202000202220020), N1:73-73(22000000000000200020), N1:74-74(22220222220200202202), N1:75-75(00022202222200000000), N1:76-76(00000220220200200022), N1:77-77(02200202020020200000), N0:78-78(00002000000000000000), N0:79-79(00000000000000000000), N1:80-80(00000000000022220000), N1:81-81(00000000000000000000), N1:82-82(00022220200202202202), N1:83-83(20202222200202202202), N1:84-84(00000020000000000000), N1:85-85(00222022020000000002), N1:86-86(22020222020222222000), N1:87-87(00022222002020222022), N1:88-88(00002222000000000200), N1:89-89(00000000000000220022), N1:90-90(22020202200020222220), N1:91-91(00002000002220002222), N1:92-92(22200000000000000000), N1:93-93(00000000000000000000), N1:94-94(00202022200202222222), N1:95-95(22222202202020222222), N1:96-96(00222220200202222020), N1:97-97(22002202220222222022), N0:98-98(20222222222222020220), N0:99-99(20222222220222222002)]')
self.assertEqual(len(simplified_individuals),501)
self.assertEqual(simplified_individuals[60].nodes.count(), 1638)


def test_get_unique_signatures(self):
Expand Down Expand Up @@ -95,16 +99,16 @@ def test_no_duplicate_nodes(self):
assert duplicates_found == 0, f"Found {duplicates_found} duplicated nodes in the graph"


def _test_simple_merge(self, graph, zoom_level: int) -> ZoomLevel:
def _test_simple_merge(self, graph: GraphGenome, zoom_level: int) -> ZoomLevel:
# these tests could be made independent of test_workflow, but it would be slower
assert graph.highest_zoom_level() == zoom_level
starting_level = ZoomLevel.objects.get(graph=graph, zoom=zoom_level)
assert len(starting_level) == 7180
self.assertEqual(len(starting_level), 7180)
next_level = simple_merge(starting_level)
#Test every Path has a representative in this ZoomLevel
assert Path.objects.filter(graph=graph, zoom=zoom_level + 1).count() == \
Path.objects.filter(graph=graph, zoom=zoom_level + 0).count()
assert NodeTraversal.objects.filter(graph=graph, zoom=zoom_level+1) == 3690
self.assertEqual(Path.objects.filter(graph=graph, zoom=zoom_level + 1).count(),
Path.objects.filter(graph=graph, zoom=zoom_level + 0).count())
self.assertEqual(NodeTraversal.objects.filter(graph=graph, zoom=zoom_level+1).count(), 3690) #*501?
return next_level

@skip
Expand Down

0 comments on commit 59c18fd

Please sign in to comment.