diff --git a/Graph/gfa.py b/Graph/gfa.py
index 0520012..106367a 100644
--- a/Graph/gfa.py
+++ b/Graph/gfa.py
@@ -116,7 +116,7 @@ def from_graph(cls, graph: GraphGenome):  # TODO: should be given ZoomLevel inst
             node_series = ",".join(visits)
             connections = ",".join(['*'] * path.nodes.count())  # count -1?
             gfa.add_line('\t'.join(['P', path.accession, node_series, connections]))
-        for node in graph.nucleotide_level.nodes:  # in no particular order
+        for node in graph.nucleotide_level.nodes_xrange():  # in no particular order
             gfa.add_line('\t'.join(['S', str(node.name), node.seq]))
         return cls(gfa, "from Graph")
diff --git a/Graph/migrations/0008_remove_path_summarized_by.py b/Graph/migrations/0008_remove_path_summarized_by.py
new file mode 100644
index 0000000..c0eeb2f
--- /dev/null
+++ b/Graph/migrations/0008_remove_path_summarized_by.py
@@ -0,0 +1,17 @@
+# Generated by Django 2.2.1 on 2019-09-03 16:59
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('Graph', '0007_NodeTraversal_unique_together_order'),
+    ]
+
+    operations = [
+        migrations.RemoveField(
+            model_name='path',
+            name='summarized_by',
+        ),
+    ]
diff --git a/Graph/models.py b/Graph/models.py
index 7e99afb..cbc695c 100644
--- a/Graph/models.py
+++ b/Graph/models.py
@@ -25,21 +25,17 @@ def create(self, graph, zoom, blank_layer=False) -> 'ZoomLevel':
             return me  # We've done all the necessary work
         # Copy previous level in entirety
         previous_level = graph.zoomlevel_set.get(zoom=zoom - 1)
-        Node.objects.bulk_create([Node(name=n.name, zoom=me) for n in previous_level.nodes], 300)
+        Node.objects.bulk_create([Node(name=n.name, zoom=me) for n in previous_level.nodes_xrange()], 100)
         # TODO: This loop can be sped up by bulk_create and bulk_update
-        start = previous_level.paths.aggregate(Max('id'))['rating__max']  # TODO make lazy_paths() generator method
-        stop = previous_level.paths.aggregate(Min('id'))['rating__min']
-        for path_id in range(start, stop):
-            if Path.objects.exists(id=path_id):
-                path = Path.objects.get(id=path_id)
-                name = path.name
-                p = Path(accession=name, zoom=me)  # new Path for a new level
-                p.save()
-                # TODO: this is REALLY SLOW AND WASTEFUL!
-                # Makes a full copy of every traversal in the Path so new copies can be edited
-                copies = [NodeTraversal(path=p, node=traverse.node, strand=traverse.strand, order=traverse.order)
-                          for traverse in path.nodetraversal_set.all()]
-                NodeTraversal.objects.bulk_create(copies, 100)
+        for path in previous_level.path_xrange():
+            name = path.name
+            p = Path(accession=name, zoom=me)  # new Path for a new level
+            p.save()
+            # TODO: this is REALLY SLOW AND WASTEFUL!
+            # Makes a full copy of every traversal in the Path so new copies can be edited
+            copies = [NodeTraversal(path=p, node=traverse.node, strand=traverse.strand, order=traverse.order)
+                      for traverse in path.nodetraversal_set.all()]
+            NodeTraversal.objects.bulk_create(copies, 100)
         return me
@@ -88,6 +84,25 @@ def nodes(self) -> QuerySet:
     def node(self, node_name):
         return Node.objects.get(name=node_name, zoom=self)
 
+    def path_xrange(self):
+        """Lazy evaluation of Paths to ensure that we don't overrun SQL"""
+        start = self.paths.aggregate(Min('id'))['id__min']
+        stop = self.paths.aggregate(Max('id'))['id__max'] + 1
+        for path_id in range(start, stop):
+            try:
+                yield Path.objects.get(id=path_id)
+            except Path.DoesNotExist:
+                pass  # path ids sometimes will have gaps
+
+    def nodes_xrange(self):
+        """Lazy evaluation of Nodes to ensure that we don't overrun SQL"""
+        start = self.nodes.aggregate(Min('id'))['id__min']
+        stop = self.nodes.aggregate(Max('id'))['id__max'] + 1
+        for node_id in range(start, stop):
+            try:
+                yield Node.objects.get(id=node_id)
+            except Node.DoesNotExist:
+                pass  # node ids sometimes will have gaps
 
 
 class GraphManager(models.Manager):
@@ -301,7 +316,7 @@ def summary_child(self):
         if self.zoom.zoom == 0:
             return None
         previous_zoom = self.zoom.graph.zoomlevel_set.get(zoom=self.zoom.zoom - 1)
-        return previous_zoom.node_set.get(accession=self.accession)
+        return previous_zoom.path_set.get(accession=self.accession)
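The new `path_xrange()`/`nodes_xrange()` generators walk the id range with one `objects.get()` per id, which keeps memory flat but issues one SELECT per id, gaps included. A minimal alternative sketch using Django's built-in `QuerySet.iterator()`; the `lazy_rows` helper and its `chunk_size` default are illustrative assumptions, not part of this patch:

```python
from django.db.models import QuerySet

def lazy_rows(qs: QuerySet, chunk_size: int = 500):
    """Hypothetical helper: stream any queryset lazily in id order.

    iterator() pulls `chunk_size` rows at a time from the database cursor,
    so gaps in ids cost nothing and no one-SELECT-per-id round trips occur.
    """
    return qs.order_by('id').iterator(chunk_size=chunk_size)

# ZoomLevel.path_xrange() could then reduce to:
#     yield from lazy_rows(self.paths)
```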
diff --git a/Graph/test.py b/Graph/test.py
index 4801adf..8af2213 100644
--- a/Graph/test.py
+++ b/Graph/test.py
@@ -56,9 +56,10 @@ def test_graph_factory(self):
 
     def test_summary_storage(self):
         graph = self.test_example_graph()
-
+        zoom0 = graph.nucleotide_level
         zoom1 = ZoomLevel.objects.create(graph=graph, zoom=1, blank_layer=False)
-        path1 = zoom1.paths.filter(accession='a').first()
+        path1 = zoom1.paths.get(accession='a')
+        self.assertEqual(zoom1.paths.count(), 5)
         new_node = Node.objects.create(seq='ACGTCGGA', name='2*2', zoom=zoom1)
         base_nodes = graph.nucleotide_level.nodes
         base_nodes.filter(name__in=['1', '2', '4']).update(summarized_by=new_node)
@@ -72,11 +73,10 @@ def test_summary_storage(self):
         self.assertTrue(path1.summary_child),  # "Path should be linked to its child."
         self.assertEqual(zoom1.paths.count(), 5)
         # ZoomLevel
-        zoom0 = graph.nucleotide_level
-        self.assertEqual(len(zoom1), len(zoom0) - 2)
-        self.assertEqual(zoom1.node_ids(), set(range(23, 42)))  # {23,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41}
+        self.assertEqual(len(zoom1), len(zoom0) - 3 + 1)
+        self.assertEqual(zoom1.node_ids(), set(range(23, 42)) - {24})  # 24 deleted
         self.assertEqual(zoom0.node_ids(), set(range(1,21)))
-        names = [x.name for x in zoom1.nodes]
+        names = list(zoom1.nodes.values_list('name', flat=True))
         self.assertEqual(names, ['5', '6', '2*2'])
         sequences = [x.seq for x in zoom1.nodes]
         self.assertEqual(sequences, ['C', 'AGTACG', 'ACGTCGGA'])
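The rewritten `names` assertion works because `values_list('name', flat=True)` returns a lazy `QuerySet`, not a `list`; comparing it directly against `['5', '6', '2*2']` with `assertEqual` would fail, so it is materialized with `list()`. A short illustration, reusing `zoom1` from the test above (the explicit `order_by` is an added assumption for determinism, not in the patch):

```python
names_qs = zoom1.nodes.values_list('name', flat=True)  # lazy QuerySet, no SQL executed yet
names = list(names_qs)                                 # list() forces a single query
assert names == ['5', '6', '2*2']

# An explicit order makes the comparison deterministic across database backends:
deterministic = list(zoom1.nodes.order_by('id').values_list('name', flat=True))
```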
diff --git a/HaploBlocker/haplonetwork.py b/HaploBlocker/haplonetwork.py
index 92744cc..853785c 100644
--- a/HaploBlocker/haplonetwork.py
+++ b/HaploBlocker/haplonetwork.py
@@ -103,34 +103,6 @@ def populate_transitions(simplified_individuals):
             # node.upstream[Node.NOTHING] += 1
 
 
-def update_transition(node):
-    """Only transition values for nodes already listed in upstream and downstream will be calculated."""
-    if not node.is_nothing():
-        update_stream_transitions(node, 'upstream')
-        update_stream_transitions(node, 'downstream')
-
-    return node
-
-
-def update_stream_transitions(node, stream):
-    """This will updated either upstream or downstream transition counts based on the
-    the value of 'stream'. This is a meta-programming function that requires the exact
-    name of the class field 'upstream' or 'downstream' to work."""
-    g = getattr  #
-    running = g(node, stream).keys()
-    setattr(node, stream, defaultdict(lambda: 0))
-    for n in running:
-        if not n.is_nothing():
-            g(node, stream)[n] = len(node.specimens.intersection(n.specimens))
-    accounted_upstream = sum(g(node, stream).values()) - g(node, stream)[Node.NOTHING]
-    g(node, stream)[Node.NOTHING] = len(node.specimens) - accounted_upstream
-    assert all([count > -1 for count in g(node, stream).values()]), node.details()
-    # Cleans up old keys including Node.NOTHING
-    to_be_deleted = {key for key, count in g(node, stream).items() if count == 0}
-    for key in to_be_deleted:
-        g(node, stream).pop(key, None)
-
-
 def simple_merge(current_level: ZoomLevel) -> bool:
     """ Side effects full_graph by merging any consecutive nodes that have
     identical specimens and removing the redundant my_node from full_graph.
@@ -150,35 +122,31 @@ def simple_merge(current_level: ZoomLevel) -> bool:
     # Node.objects.filter(path_set == )
     modification_happened = False
     path_ids = current_level.paths.values_list('id', flat=True)
-    for node_id in range(1, current_level.node_set.order_by('-id').first().id):
-        try:
-            my_node = Node.objects.get(id=node_id, zoom=current_level)
-
-            # only one Node Downstream, no matter the number of specimens
-            if len(my_node.downstream_ids()) == 1:
-                d = my_node.nodetraversal_set.first().downstream()
-                if d:
-                    modification_happened = True
-                    next_node = d.node  # fetched from DB
-                    if my_node.nodetraversal_set.count() == next_node.nodetraversal_set.count():  # Not a complete guarantee...
-                        # Torsten deletes my_node and modifies next_node
-                        merged_node = Node.objects.create(name=f'{my_node.name}*{current_level.zoom}',
-                                                          zoom=current_level)
-                        for x in [my_node, next_node]:
-                            x.summarized_by = merged_node
-                            x.save()  # TODO: doesn't work because reading and writing same layer. next_node gets deleted soon
-
-                        # edit existing traversals
-                        next_node.nodetraversal_set.\
-                            filter(path_id__in=path_ids).\
-                            update(node_id=merged_node.id)
-
-                        # delete my_node and all associates
-                        query = my_node.nodetraversal_set.filter(path_id__in=path_ids)
-                        query._raw_delete(query.db)  # https://www.nickang.com/fastest-delete-django/
-                        # TODO: merged_node.start = my_node.start, length = my_node.length + next_node.length
-        except Node.DoesNotExist:
-            pass  # node ids are not entirely dense
+    for my_node in current_level.nodes_xrange():
+
+        # only one Node Downstream, no matter the number of specimens
+        if len(my_node.downstream_ids()) == 1:
+            d = my_node.nodetraversal_set.first().downstream()
+            if d:
+                modification_happened = True
+                next_node = d.node  # fetched from DB
+                if my_node.nodetraversal_set.count() == next_node.nodetraversal_set.count():  # Not a complete guarantee...
+                    # Torsten deletes my_node and modifies next_node
+                    merged_node = Node.objects.create(name=f'{my_node.name}*{current_level.zoom}',
+                                                      zoom=current_level)
+                    for x in [my_node, next_node]:
+                        x.summarized_by = merged_node
+                        x.save()  # TODO: doesn't work because reading and writing same layer. next_node gets deleted soon
+
+                    # edit existing traversals
+                    next_node.nodetraversal_set.\
+                        filter(path_id__in=path_ids).\
+                        update(node_id=merged_node.id)
+
+                    # delete my_node and all associates
+                    query = my_node.nodetraversal_set.filter(path_id__in=path_ids)
+                    query._raw_delete(query.db)  # https://www.nickang.com/fastest-delete-django/
+                    # TODO: merged_node.start = my_node.start, length = my_node.length + next_node.length
     return modification_happened
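The rule `simple_merge()` applies is: a node with exactly one distinct downstream neighbor, whose traversal count equals that neighbor's, gets fused into a new summary node (the inline comment itself flags the count check as "Not a complete guarantee"). A pure-Python sketch of that rule, using hypothetical dict stand-ins rather than the project's ORM models:

```python
def can_merge(node: dict, next_node: dict) -> bool:
    """Fusable when every path visiting `node` continues into `next_node`:
    one distinct downstream neighbor and identical traversal counts."""
    return (len(node['downstream_ids']) == 1
            and node['traversal_count'] == next_node['traversal_count'])

a = {'downstream_ids': {2}, 'traversal_count': 5}  # all 5 paths exit toward node 2
b = {'downstream_ids': {3}, 'traversal_count': 5}  # node 2: the same 5 paths pass through
assert can_merge(a, b)  # a and b collapse into one summary node, e.g. 'a*zoom'
```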
@@ -205,7 +173,7 @@ def neglect_nodes(zoom_level : ZoomLevel, deletion_cutoff=FILTER_THRESHOLD):
     # next_level, zoom = prep_next_summary_layer(current_level)
 
-    for node in zoom_level.nodes:  # TODO optimize distinct count
+    for node in zoom_level.nodes_xrange():  # TODO optimize distinct count
         if len(node.specimens()) <= deletion_cutoff:
             delete_node(node, deletion_cutoff, zoom_level)
@@ -254,7 +222,7 @@ def split_groups(zoom_level: ZoomLevel):
     TODO: Ideally, the database would retain some record of how many nucleotides
     are shared between the two new haplotype nodes."""
 
-    for node in zoom_level.nodes:
+    for node in zoom_level.nodes_xrange():
         # check if all transition upstream match with one of my downstream nodes
         if len(node.specimens()) > 0:
             # Matchup upstream and downstream with specimen identities
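With all three passes now driven by `nodes_xrange()`, a summarization sweep over one `ZoomLevel` would plausibly chain them as below. This driver is a hypothetical illustration, since the orchestration is not part of this patch:

```python
from HaploBlocker.haplonetwork import (FILTER_THRESHOLD, neglect_nodes,
                                       simple_merge, split_groups)

def summarize(zoom_level):
    """Hypothetical driver: run one full summarization pass over a ZoomLevel."""
    while simple_merge(zoom_level):  # fuse consecutive nodes until nothing changes
        pass
    neglect_nodes(zoom_level)        # drop nodes with <= FILTER_THRESHOLD specimens
    split_groups(zoom_level)         # re-split haplotype groups on the thinned graph
```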