
#26 Added xrange iterators
josiahseaman committed Sep 4, 2019
1 parent 90570c4 commit eba726d
Showing 5 changed files with 81 additions and 81 deletions.
2 changes: 1 addition & 1 deletion Graph/gfa.py
@@ -116,7 +116,7 @@ def from_graph(cls, graph: GraphGenome):  # TODO: should be given ZoomLevel inst
        node_series = ",".join(visits)
        connections = ",".join(['*'] * path.nodes.count())  # count -1?
        gfa.add_line('\t'.join(['P', path.accession, node_series, connections]))
-        for node in graph.nucleotide_level.nodes:  # in no particular order
+        for node in graph.nucleotide_level.nodes_xrange():  # in no particular order
            gfa.add_line('\t'.join(['S', str(node.name), node.seq]))
        return cls(gfa, "from Graph")
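
For reference, this loop serializes the graph as tab-separated GFA records: one P line per path, then one S line per node. A hypothetical two-node graph would come out roughly as follows (the accession, node names, orientations, and sequences here are invented for illustration):

P	a	1+,2+	*,*
S	1	ACGT
S	2	TTGA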

17 changes: 17 additions & 0 deletions Graph/migrations/0008_remove_path_summarized_by.py
@@ -0,0 +1,17 @@
# Generated by Django 2.2.1 on 2019-09-03 16:59

from django.db import migrations


class Migration(migrations.Migration):

    dependencies = [
        ('Graph', '0007_NodeTraversal_unique_together_order'),
    ]

    operations = [
        migrations.RemoveField(
            model_name='path',
            name='summarized_by',
        ),
    ]
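
Assuming the project's standard manage.py layout, this migration would be applied with the usual Django command:

python manage.py migrate Graph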
45 changes: 30 additions & 15 deletions Graph/models.py
@@ -25,21 +25,17 @@ def create(self, graph, zoom, blank_layer=False) -> 'ZoomLevel':
            return me  # We've done all the necessary work
        # Copy previous level in entirety
        previous_level = graph.zoomlevel_set.get(zoom=zoom - 1)
-        Node.objects.bulk_create([Node(name=n.name, zoom=me) for n in previous_level.nodes], 300)
+        Node.objects.bulk_create([Node(name=n.name, zoom=me) for n in previous_level.nodes_xrange()], 100)
        # TODO: This loop can be sped up by bulk_create and bulk_update
-        start = previous_level.paths.aggregate(Max('id'))['rating__max']  # TODO make lazy_paths() generator method
-        stop = previous_level.paths.aggregate(Min('id'))['rating__min']
-        for path_id in range(start, stop):
-            if Path.objects.exists(id=path_id):
-                path = Path.objects.get(id=path_id)
-                name = path.name
-                p = Path(accession=name, zoom=me)  # new Path for a new level
-                p.save()
-                # TODO: this is REALLY SLOW AND WASTEFUL!
-                # Makes a full copy of every traversal in the Path so new copies can be edited
-                copies = [NodeTraversal(path=p, node=traverse.node, strand=traverse.strand, order=traverse.order)
-                          for traverse in path.nodetraversal_set.all()]
-                NodeTraversal.objects.bulk_create(copies, 100)
+        for path in previous_level.path_xrange():
+            name = path.name
+            p = Path(accession=name, zoom=me)  # new Path for a new level
+            p.save()
+            # TODO: this is REALLY SLOW AND WASTEFUL!
+            # Makes a full copy of every traversal in the Path so new copies can be edited
+            copies = [NodeTraversal(path=p, node=traverse.node, strand=traverse.strand, order=traverse.order)
+                      for traverse in path.nodetraversal_set.all()]
+            NodeTraversal.objects.bulk_create(copies, 100)
        return me


@@ -88,6 +84,25 @@ def nodes(self) -> QuerySet:
    def node(self, node_name):
        return Node.objects.get(name=node_name, zoom=self)

+    def path_xrange(self):
+        """Lazy evaluation of Paths to ensure that we don't overrun SQL"""
+        start = self.paths.aggregate(Min('id'))['id__min']
+        stop = self.paths.aggregate(Max('id'))['id__max'] + 1
+        for path_id in range(start, stop):
+            try:
+                yield Path.objects.get(id=path_id)
+            except Path.DoesNotExist:
+                pass  # path ids sometimes will have gaps
+
+    def nodes_xrange(self):
+        """Lazy evaluation of Nodes to ensure that we don't overrun SQL"""
+        start = self.nodes.aggregate(Min('id'))['id__min']
+        stop = self.nodes.aggregate(Max('id'))['id__max'] + 1
+        for node_id in range(start, stop):
+            try:
+                yield Node.objects.get(id=node_id)
+            except Node.DoesNotExist:
+                pass  # node ids sometimes will have gaps
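
For illustration, a minimal sketch of how these generators might be called; the caller code and chunk size below are assumptions, not part of this commit:

# Hypothetical caller: stream every node of a level without materializing the table.
level = graph.nucleotide_level  # any ZoomLevel instance
for node in level.nodes_xrange():
    print(node.name, node.seq)

# Django's stock lazy alternative, QuerySet.iterator(), bounds memory with far
# fewer queries, but holds a database cursor open for the duration:
for node in level.nodes.iterator(chunk_size=100):
    print(node.name, node.seq)

The hand-rolled generators pay one query per id in exchange for never keeping a long-lived cursor, and they tolerate the id gaps noted in the comments explicitly.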


class GraphManager(models.Manager):
@@ -301,7 +316,7 @@ def summary_child(self):
        if self.zoom.zoom == 0:
            return None
        previous_zoom = self.zoom.graph.zoomlevel_set.get(zoom=self.zoom.zoom - 1)
-        return previous_zoom.node_set.get(accession=self.accession)
+        return previous_zoom.path_set.get(accession=self.accession)

    def append_gfa_nodes(self, nodes):
        assert hasattr(nodes[0], 'orient') and hasattr(nodes[0], 'name'), 'Expecting gfapy.Gfa.path'
12 changes: 6 additions & 6 deletions Graph/test.py
@@ -56,9 +56,10 @@ def test_graph_factory(self):

    def test_summary_storage(self):
        graph = self.test_example_graph()
+        zoom0 = graph.nucleotide_level
        zoom1 = ZoomLevel.objects.create(graph=graph, zoom=1, blank_layer=False)
-        path1 = zoom1.paths.filter(accession='a').first()
+        path1 = zoom1.paths.get(accession='a')
        self.assertEqual(zoom1.paths.count(), 5)
        new_node = Node.objects.create(seq='ACGTCGGA', name='2*2', zoom=zoom1)
        base_nodes = graph.nucleotide_level.nodes
        base_nodes.filter(name__in=['1', '2', '4']).update(summarized_by=new_node)
@@ -72,11 +73,10 @@ def test_summary_storage(self):
        self.assertTrue(path1.summary_child)  # "Path should be linked to its child."
        self.assertEqual(zoom1.paths.count(), 5)
        # ZoomLevel
-        zoom0 = graph.nucleotide_level
-        self.assertEqual(len(zoom1), len(zoom0) - 2)
-        self.assertEqual(zoom1.node_ids(), set(range(23, 42)))  # {23,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41}
+        self.assertEqual(len(zoom1), len(zoom0) - 3 + 1)  # 3 nodes summarized into 1
+        self.assertEqual(zoom1.node_ids(), set(range(23, 42)) - {24})  # 24 deleted
        self.assertEqual(zoom0.node_ids(), set(range(1, 21)))
-        names = [x.name for x in zoom1.nodes]
+        names = list(zoom1.nodes.values_list('name', flat=True))
        self.assertEqual(names, ['5', '6', '2*2'])
        sequences = [x.seq for x in zoom1.nodes]
        self.assertEqual(sequences, ['C', 'AGTACG', 'ACGTCGGA'])
86 changes: 27 additions & 59 deletions HaploBlocker/haplonetwork.py
@@ -103,34 +103,6 @@ def populate_transitions(simplified_individuals):
# node.upstream[Node.NOTHING] += 1


-def update_transition(node):
-    """Only transition values for nodes already listed in upstream and downstream will be calculated."""
-    if not node.is_nothing():
-        update_stream_transitions(node, 'upstream')
-        update_stream_transitions(node, 'downstream')
-
-    return node
-
-
-def update_stream_transitions(node, stream):
-    """This will update either upstream or downstream transition counts based on
-    the value of 'stream'. This is a meta-programming function that requires the exact
-    name of the class field 'upstream' or 'downstream' to work."""
-    g = getattr
-    running = g(node, stream).keys()
-    setattr(node, stream, defaultdict(lambda: 0))
-    for n in running:
-        if not n.is_nothing():
-            g(node, stream)[n] = len(node.specimens.intersection(n.specimens))
-    accounted_upstream = sum(g(node, stream).values()) - g(node, stream)[Node.NOTHING]
-    g(node, stream)[Node.NOTHING] = len(node.specimens) - accounted_upstream
-    assert all([count > -1 for count in g(node, stream).values()]), node.details()
-    # Cleans up old keys including Node.NOTHING
-    to_be_deleted = {key for key, count in g(node, stream).items() if count == 0}
-    for key in to_be_deleted:
-        g(node, stream).pop(key, None)


def simple_merge(current_level: ZoomLevel) -> bool:
    """ Side effects full_graph by merging any consecutive nodes that have
    identical specimens and removing the redundant my_node from full_graph.
@@ -150,35 +122,31 @@ def simple_merge(current_level: ZoomLevel) -> bool:
    # Node.objects.filter(path_set == )
    modification_happened = False
    path_ids = current_level.paths.values_list('id', flat=True)
-    for node_id in range(1, current_level.node_set.order_by('-id').first().id):
-        try:
-            my_node = Node.objects.get(id=node_id, zoom=current_level)
-
-            # only one Node downstream, no matter the number of specimens
-            if len(my_node.downstream_ids()) == 1:
-                d = my_node.nodetraversal_set.first().downstream()
-                if d:
-                    modification_happened = True
-                    next_node = d.node  # fetched from DB
-                    if my_node.nodetraversal_set.count() == next_node.nodetraversal_set.count():  # Not a complete guarantee...
-                        # Torsten deletes my_node and modifies next_node
-                        merged_node = Node.objects.create(name=f'{my_node.name}*{current_level.zoom}',
-                                                          zoom=current_level)
-                        for x in [my_node, next_node]:
-                            x.summarized_by = merged_node
-                            x.save()  # TODO: doesn't work because reading and writing same layer. next_node gets deleted soon
-
-                        # edit existing traversals
-                        next_node.nodetraversal_set.\
-                            filter(path_id__in=path_ids).\
-                            update(node_id=merged_node.id)
-
-                        # delete my_node and all associates
-                        query = my_node.nodetraversal_set.filter(path_id__in=path_ids)
-                        query._raw_delete(query.db)  # https://www.nickang.com/fastest-delete-django/
-                        # TODO: merged_node.start = my_node.start, length = my_node.length + next_node.length
-        except Node.DoesNotExist:
-            pass  # node ids are not entirely dense
+    for my_node in current_level.nodes_xrange():
+        # only one Node downstream, no matter the number of specimens
+        if len(my_node.downstream_ids()) == 1:
+            d = my_node.nodetraversal_set.first().downstream()
+            if d:
+                modification_happened = True
+                next_node = d.node  # fetched from DB
+                if my_node.nodetraversal_set.count() == next_node.nodetraversal_set.count():  # Not a complete guarantee...
+                    # Torsten deletes my_node and modifies next_node
+                    merged_node = Node.objects.create(name=f'{my_node.name}*{current_level.zoom}',
+                                                      zoom=current_level)
+                    for x in [my_node, next_node]:
+                        x.summarized_by = merged_node
+                        x.save()  # TODO: doesn't work because reading and writing same layer. next_node gets deleted soon
+
+                    # edit existing traversals
+                    next_node.nodetraversal_set.\
+                        filter(path_id__in=path_ids).\
+                        update(node_id=merged_node.id)
+
+                    # delete my_node and all associates
+                    query = my_node.nodetraversal_set.filter(path_id__in=path_ids)
+                    query._raw_delete(query.db)  # https://www.nickang.com/fastest-delete-django/
+                    # TODO: merged_node.start = my_node.start, length = my_node.length + next_node.length
    return modification_happened
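
Since simple_merge reports whether it changed anything, the natural driver (assumed here; it is not shown in this diff) runs it to a fixed point:

# Hypothetical driver: merge repeatedly until the level stabilizes.
while simple_merge(current_level):
    pass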


@@ -205,7 +173,7 @@ def neglect_nodes(zoom_level: ZoomLevel, deletion_cutoff=FILTER_THRESHOLD):

    # next_level, zoom = prep_next_summary_layer(current_level)

-    for node in zoom_level.nodes:  # TODO optimize distinct count
+    for node in zoom_level.nodes_xrange():  # TODO optimize distinct count
        if len(node.specimens()) <= deletion_cutoff:
            delete_node(node, deletion_cutoff, zoom_level)

@@ -254,7 +222,7 @@ def split_groups(zoom_level: ZoomLevel):
    TODO: Ideally, the database would retain some record of how many nucleotides are shared between
    the two new haplotype nodes."""

-    for node in zoom_level.nodes:
+    for node in zoom_level.nodes_xrange():
        # check if all upstream transitions match with one of my downstream nodes
        if len(node.specimens()) > 0:
            # Match up upstream and downstream with specimen identities
