diff --git a/Graph/gfa.py b/Graph/gfa.py index c8250c8..e3f2e26 100644 --- a/Graph/gfa.py +++ b/Graph/gfa.py @@ -60,8 +60,9 @@ def topologicalSort(self): class GFA: - def __init__(self, gfa: gfapy.Gfa): + def __init__(self, gfa: gfapy.Gfa, source_path: str): self.gfa = gfa + self.source_path = source_path # @classmethod # def load_from_pickle(cls, file: str): @@ -77,14 +78,14 @@ def load_from_xg(cls, file: str, xg_bin: str): process.wait() if process.returncode != 0: raise OSError() - graph = cls(gfa) + graph = cls(gfa, file) process.stdout.close() return graph @classmethod def load_from_gfa(cls, file: str): gfa = gfapy.Gfa.from_file(file) - graph = cls(gfa) + graph = cls(gfa, file) return graph # def save_as_pickle(self, outfile: str): @@ -111,43 +112,41 @@ def from_graph(cls, graph: Graph): gfa.add_line('\t'.join(['P', path.accession, node_series, ",".join(['*' for _ in path.nodes])])) for node in graph.nodes.values(): # in no particular order gfa.add_line('\t'.join(['S', str(node.id), node.seq])) - return cls(gfa) + return cls(gfa, "from Graph") - @property def to_paths(self) -> List[Path]: - node_hash = {} + # create parent object for this genome + gdb = GraphGenome.objects.get_or_create(name=self.source_path)[0] for segment in self.gfa.segments: - node_id = segment.name + "+" - node = Node(segment.sequence) - node_hash[node_id] = node - - node_id = segment.name + "-" - node = Node(segment.sequence) - node_hash[node_id] = node + node_id = segment.name + Node.objects.get_or_create(seq=segment.sequence, name=node_id, graph=gdb) paths = [] for path in self.gfa.paths: - nodes = [] - for node in path.segment_names: - node_index = NodeTraversal(Node(node_hash[node.name + node.orient].seq, node.name), node.orient) - nodes.append(node_index) - paths.append(Path(path.name, nodes)) - + p = Path(path.name, graph=gdb).save() + p.append_gfa_nodes(path.segment_names) + paths.append(p) + # path_names = [path.name for path in self.gfa.paths] + # list(Path.objects.get(name__in=path_names)) return paths - @property - def to_graph(self): + def to_graph(self) -> GraphGenome: + paths = self.to_paths() + if paths: + return paths[0].graph + else: + return None + # Extract all paths into graph - path_names = [p.name for p in self.gfa.paths] - graph = Graph(path_names) # Paths can be empty at start - for path in self.gfa.paths: - for node in path.segment_names: - graph.append_node_to_path(node.name, node.orient, path.name) - for segment in self.gfa.segments: - graph.nodes[segment.name].seq = segment.sequence - graph.paths = self.to_paths - return graph - # IMPORTANT: It's not clear to Josiah how much of the below is necessary, so it's being left unmodified. + # path_names = [p.name for p in self.gfa.paths] + # graph = Graph(path_names) # Paths can be empty at start + # for path in self.gfa.paths: + # for path_index, node in enumerate(path.segment_names): + # graph.append_node_to_path(node.name, node.orient, path.name, path_index) + # for segment in self.gfa.segments: + # graph.nodes[segment.name].seq = segment.sequence + # graph.paths = self.to_paths() + # return graph ''' diff --git a/Graph/migrations/0001_initial.py b/Graph/migrations/0001_initial.py new file mode 100644 index 0000000..0df4016 --- /dev/null +++ b/Graph/migrations/0001_initial.py @@ -0,0 +1,51 @@ +# Generated by Django 2.2.1 on 2019-08-14 14:28 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ] + + operations = [ + migrations.CreateModel( + name='GraphGenome', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('name', models.CharField(max_length=1000)), + ], + ), + migrations.CreateModel( + name='Node', + fields=[ + ('seq', models.CharField(blank=True, max_length=255)), + ('name', models.CharField(max_length=15, primary_key=True, serialize=False)), + ('graph', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='Graph.GraphGenome')), + ], + options={ + 'unique_together': {('graph', 'name')}, + }, + ), + migrations.CreateModel( + name='Path', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('accession', models.CharField(max_length=1000, unique=True)), + ('graph', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='Graph.GraphGenome')), + ], + ), + migrations.CreateModel( + name='NodeTraversal', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('order', models.IntegerField(help_text='Defines the order a path lists traversals')), + ('strand', models.CharField(choices=[('+', '+'), ('-', '-')], default='+', max_length=1)), + ('node', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='Graph.Node')), + ('path', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='Graph.Path')), + ], + ), + ] diff --git a/Graph/models.py b/Graph/models.py index 2a15990..2c338c9 100644 --- a/Graph/models.py +++ b/Graph/models.py @@ -5,10 +5,11 @@ from uuid import uuid1 from django.db import models + from Graph.utils import keydefaultdict from Utils.models import CustomSaveManager - +# GraphGenome specific error classes for more informative error catching class NoAnchorError(ValueError): pass class PathOverlapError(ValueError): @@ -23,35 +24,27 @@ class GraphGenome(models.Model): name = models.CharField(max_length=1000) -class DoubleNode(): - plus = Node - minus = Node - - def visitors(self): - plus_node_set.union(minus_node_set) - - class Node(models.Model): seq = models.CharField(max_length=255, blank=True) - name = models.CharField(primary_key=True) + name = models.CharField(primary_key=True, max_length=15) graph = models.ForeignKey(GraphGenome, on_delete=models.CASCADE) class Meta: unique_together = ['graph', 'name'] - def __len__(self): - return len(self.paths) - - def __repr__(self): - """Paths representation is sorted because set ordering is not guaranteed.""" - return repr(self.seq) + \ - ', {' + ', '.join(str(i) for i in list(self.paths)) + '}' - - def __eq__(self, other): - if not isinstance(other, Node): - print("Warn: comparing Node and ", type(other), other) - return False - return self.seq == other.seq and self.paths == other.paths # and self.id == other.id + # def __len__(self): + # return nodetraversal_set.count() + # + # def __repr__(self): + # """Paths representation is sorted because set ordering is not guaranteed.""" + # return repr(self.seq) + \ + # ', {' + ', '.join(str(i) for i in list(self.paths)) + '}' + # + # def __eq__(self, other): + # if not isinstance(other, Node): + # print("Warn: comparing Node and ", type(other), other) + # return False + # return self.seq == other.seq and self.paths == other.paths # and self.id == other.id def __hash__(self): return hash(self.seq) @@ -64,20 +57,20 @@ def __hash__(self): def to_gfa(self, segment_id: int): return '\t'.join(['S', str(segment_id), self.seq]) - # Typing is picky about order of declaration, but strings bypass this PEP484 - def merge_minor(self, minor_allele: 'Node') -> 'Node': - m = Node(self.seq, self.paths.union(minor_allele.paths)) - # TODO: penalize paths with nucleotide mismatch - return m - - def intersection(self, downstream: 'Node') -> 'Node': - m = Node(self.seq + downstream.seq, - self.paths.intersection(downstream.paths)) - return m - - def union(self, downstream: 'Node') -> 'Node': - return Node(self.seq + downstream.seq, - self.paths.union(downstream.paths)) + # # Typing is picky about order of declaration, but strings bypass this PEP484 + # def merge_minor(self, minor_allele: 'Node') -> 'Node': + # m = Node(self.seq, self.paths.union(minor_allele.paths)) + # # TODO: penalize paths with nucleotide mismatch + # return m + # + # def intersection(self, downstream: 'Node') -> 'Node': + # m = Node(self.seq + downstream.seq, + # self.paths.intersection(downstream.paths)) + # return m + # + # def union(self, downstream: 'Node') -> 'Node': + # return Node(self.seq + downstream.seq, + # self.paths.union(downstream.paths)) class Slice: def __init__(self, nodes: Iterable[Node]): @@ -138,10 +131,7 @@ class Path(models.Model): sequences is the accession's genome. Create Paths first from accession names, then append them to Nodes to link together.""" accession = models.CharField(unique=True, max_length=1000) # one path per accession - - # def __init__(self, accession: str, nodes = []): - # # self.nodes = nodes # List[NodeTraversal] - # self.position_checkpoints = {} # TODO: currently not used + graph = models.ForeignKey(GraphGenome, on_delete=models.CASCADE) def __getitem__(self, path_index): return self.nodes[path_index] @@ -158,19 +148,24 @@ def __hash__(self): @property def nodes(self): - return NodeTraversal.objects.get(path=self)#.order_by('order') + return NodeTraversal.objects.get(path=self).order_by('order') + + def append_gfa_nodes(self, nodes): + assert hasattr(nodes[0], 'orient') and hasattr(nodes[0], 'name'), 'Expecting gfapy.Gfa.path' + for node in nodes: + NodeTraversal(node=Node.objects.get(name=node.name), + path=self, strand=node.orient).save() - def append_node(self, node: Node, strand: str): + def append_node(self, node: Node, path_index, strand: str): """This is the preferred way to build a graph in a truly non-linear way. NodeTraversal is appended to Path (order dependent) and PathIndex is added to Node (order independent).""" - NodeTraversal(node, self, strand).save() - return node + NodeTraversal(node, self, strand, path_index).save() - @classmethod - def build(cls, name: str, seq_of_nodes: List[str]): - node = Node.objects.create(seq) - for p in paths: - NodeTraversal.objects.create(node, path) + # @classmethod + # def build(cls, name: str, seq_of_nodes: List[str]): + # node = Node.objects.create(seq) + # for p in paths: + # NodeTraversal.objects.create(node, path) def name(self): return self.accession @@ -183,10 +178,11 @@ def to_gfa(self): class NodeTraversal(models.Model): """Link from a Path to a Node it is currently traversing. Includes strand""" - node = models.ForeignKey(Node, index=True, on_delete=models.CASCADE) - path = models.ForeignKey(Path, index=True, on_delete=models.CASCADE, help_text='') - order = models.IntegerField(help_text='Defines the order a path lists traversals') + node = models.ForeignKey(Node, db_index=True, on_delete=models.CASCADE) + path = models.ForeignKey(Path, db_index=True, on_delete=models.CASCADE, help_text='') strand = models.CharField(choices=[('+', '+'),('-', '-')], default='+', max_length=1) + order = models.IntegerField(help_text='Defines the order a path lists traversals') # set automatically + objects = CustomSaveManager() def __repr__(self): @@ -200,13 +196,13 @@ def __eq__(self, other): return self.node.id == other.node.id and self.strand == other.strand def save(self, **kwargs): - """IMPORTANT NOTE: save() does not get called if you do NodeTraverseal.objects.create + """Checks the largest 'order' value in the current path and increments by 1. + IMPORTANT NOTE: save() does not get called if you do NodeTraverseal.objects.create or get_or_create""" - self.order = self.path.nodetraversal_set.all().order_by('-order').first().order + 1 + # self.order = self.path.nodetraversal_set.all().order_by('-order').first().order + 1 super(NodeTraversal, self).save(**kwargs) - class Graph: def __init__(self, paths: Iterable = None): # This can create orphan Nodes with no traversals @@ -243,7 +239,7 @@ def save_as_xg(self, file: str, xg_bin: str): gfa = GFA.from_graph(self) gfa.save_as_xg(file, xg_bin) - def append_node_to_path(self, node_id, strand, path_name): + def append_node_to_path(self, node_id, strand, path_name, path_index): """This is the preferred way to build a graph in a truly non-linear way. Nodes will be created if necessary. NodeTraversal is appended to Path (order dependent) and PathIndex is added to Node @@ -253,7 +249,7 @@ def append_node_to_path(self, node_id, strand, path_name): self.nodes[node_id] = Node('', [], node_id) else: raise ValueError("Provide the id of the node, not", node_id) - self.paths[path_name].append_node(self.nodes[node_id], strand) + self.paths[path_name].append_node(self.nodes[node_id], path_index, strand) # # def compute_slices(self): # """Alias: Upgrades a Graph to a SlicedGraph""" diff --git a/Graph/test.py b/Graph/test.py index f39adb2..1c8a9f6 100644 --- a/Graph/test.py +++ b/Graph/test.py @@ -92,7 +92,7 @@ class DAGifyTest(unittest.TestCase): def test_dagify(self): gfa = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "test.gfa")) - paths = gfa.to_paths + paths = gfa.to_paths() dagify = DAGify(paths) profile = dagify.generate_profiles(0) slices = dagify.to_slices(profile) @@ -103,7 +103,7 @@ def test_dagify(self): def test_dagify2(self): gfa = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "test2.gfa")) - paths = gfa.to_paths + paths = gfa.to_paths() dagify = DAGify(paths) profile = dagify.generate_profiles(0) # graph = SlicedGraph.load_from_slices(dagify.to_slices(profile), paths) @@ -112,7 +112,7 @@ def test_dagify2(self): def test_dagify3(self): gfa = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "test3.gfa")) - paths = gfa.to_paths + paths = gfa.to_paths() dagify = DAGify(paths) profile, rep_count = dagify.generate_profiles_with_minimizing_replications() self.assertEqual(rep_count, 1) @@ -121,7 +121,7 @@ def test_dagify3(self): def test_dagify_altpath(self): gfa = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "alternate_paths.gfa")) - paths = gfa.to_paths + paths = gfa.to_paths() dagify = DAGify(paths) profile, rep_count = dagify.generate_profiles_with_minimizing_replications() self.assertEqual(rep_count, 1) @@ -130,7 +130,7 @@ def test_dagify_altpath(self): def test_dagify_dup(self): gfa = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "duplicate.gfa")) - paths = gfa.to_paths + paths = gfa.to_paths() dagify = DAGify(paths) profile, rep_count = dagify.generate_profiles_with_minimizing_replications() self.assertEqual(rep_count, 2) @@ -140,7 +140,7 @@ def test_dagify_dup(self): def test_unresolved_repreat(self): gfa = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "unresolved_repeat.gfa")) - paths = gfa.to_paths + paths = gfa.to_paths() dagify = DAGify(paths) profile, rep_count = dagify.generate_profiles_with_minimizing_replications() # graph = SlicedGraph.load_from_slices(dagify.to_slices(profile), paths) @@ -149,7 +149,7 @@ def test_unresolved_repreat(self): @unittest.skip("Inversion is unsupported") def test_inversion(self): gfa = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "inversion.gfa")) - paths = gfa.to_paths + paths = gfa.to_paths() dagify = DAGify(paths) profile, rep_count = dagify.generate_profiles_with_minimizing_replications() # graph = SlicedGraph.load_from_slices(dagify.to_slices(profile), paths) @@ -158,7 +158,7 @@ def test_inversion(self): @unittest.skip("Inversion is unsupported") def test_nested_inversion(self): gfa = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "nested_inv.gfa")) - paths = gfa.to_paths + paths = gfa.to_paths() dagify = DAGify(paths) profile, rep_count = dagify.generate_profiles_with_minimizing_replications() # graph = SlicedGraph.load_from_slices(dagify.to_slices(profile), paths) @@ -167,7 +167,7 @@ def test_nested_inversion(self): @unittest.skip("Inversion is unsupported") def test_simple_inversion(self): gfa = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "simple_inv.gfa")) - paths = gfa.to_paths + paths = gfa.to_paths() dagify = DAGify(paths) profile, rep_count = dagify.generate_profiles_with_minimizing_replications() # graph = SlicedGraph.load_from_slices(dagify.to_slices(profile), paths) @@ -216,7 +216,7 @@ def test_load_gfa_to_graph(self): def make_graph_from_gfa(self): gfa = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "test.gfa")) - graph = gfa.to_graph + graph = gfa.to_graph() return graph, gfa def test_export_as_gfa(self): @@ -226,7 +226,7 @@ def test_export_as_gfa(self): def test_load_gfa_to_graph_2(self): gfa = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "test2.gfa")) - graph = gfa.to_graph + graph = gfa.to_graph() self.assertIsNotNone(graph) @unittest.skipIf(not os.path.isfile(location_of_xg), "XG binary is not found.") @@ -234,7 +234,7 @@ def test_load_gfa_via_xg(self): graph = GFA.load_from_gfa(join(PATH_TO_TEST_DATA, "test.gfa")) graph.save_as_xg(join(PATH_TO_TEST_DATA, "test.xg"), location_of_xg) graph2 = GFA.load_from_xg(join(PATH_TO_TEST_DATA, "test.xg"), location_of_xg) - graph = graph2.to_graph + graph = graph2.to_graph() # graph = SlicedGraph.from_graph(graph) # x = 'x' # y = 'y' diff --git a/requirements_dev.txt b/requirements_dev.txt index e69222f..c9d99b7 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -4,6 +4,7 @@ Django==2.2.1 sqlparse==0.3.0 numpy==1.16.2 networkx==2.2 +dataclasses==0.6 # To setup Django database file run python manage.py migrate. db.sqlite3 is not included in the repo. \ No newline at end of file diff --git a/test_data/test.gfa b/test_data/test.gfa index 2b1f4a2..2f9efd6 100644 --- a/test_data/test.gfa +++ b/test_data/test.gfa @@ -1,5 +1,5 @@ H VN:Z:1.0 -P x 1+,1+,1+,1-,3+,5+,6+,8+,9+,11+,12+,14+,15+ *,*,*,*,*,*,*,*,* +P x 1+,3+,5+,6+,8+,9+,11+,12+,14+,15+ *,*,*,*,*,*,*,*,* P y 1+,2+,5+,6+,8+,9+,11+,12+,14+,15+ *,*,*,*,*,*,*,*,* P z 1+,2+,5+,6+,7+,9+,11+,12+,14+,15+ *,*,*,*,*,*,*,*,* S 1 CAAATAAG