From e408e0fbbaef959faaa75a8839182c198848c2d1 Mon Sep 17 00:00:00 2001 From: Toshiyuki Yokoyama Date: Tue, 9 Jul 2019 16:13:57 +0900 Subject: [PATCH 01/14] #4: Add DAGify method for linearizing the order of nodes --- src/gfa.py | 22 ++++++++ src/graph.py | 9 ++-- src/sort.py | 140 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/test.py | 19 +++++++ 4 files changed, 185 insertions(+), 5 deletions(-) create mode 100644 src/sort.py diff --git a/src/gfa.py b/src/gfa.py index f4e64ba..8d84bc9 100644 --- a/src/gfa.py +++ b/src/gfa.py @@ -120,6 +120,28 @@ def from_graph(cls, graph: Graph): gfa.add_line('\t'.join(['P', path_key, "+,".join(path_values)+"+", ",".join(['*' for _ in path_values])])) return cls(gfa) + @property + def to_paths(self) -> List[Path]: + node_hash = {} + for segment in self.gfa.segments: + node_id = segment.name + "+" + node = Node(segment.sequence, []) + node_hash[node_id] = node + + node_id = segment.name + "-" + node = Node(segment.sequence, []) + node_hash[node_id] = node + + paths = [] + for path in self.gfa.paths: + nodes = [] + for node in path.segment_names: + node_index = NodeIndex(Node(node_hash[node.name + node.orient].seq, [], node.name), node.orient) + nodes.append(node_index) + paths.append(Path(path.name, nodes)) + + return paths + @property def to_graph(self): topological_sort_helper = TopologicalSort() diff --git a/src/graph.py b/src/graph.py index 68d5001..e2b0af7 100644 --- a/src/graph.py +++ b/src/graph.py @@ -13,11 +13,12 @@ class NodeMissingError(ValueError): pass class Node: - def __init__(self, seq: str, paths: Iterable[int]): + def __init__(self, seq: str, paths: Iterable[int], index: int = 0): assert isinstance(seq, str), seq assert not isinstance(paths, str) and isinstance(paths, Iterable), paths self.seq = seq self.paths = set(paths) + self.index = index def __len__(self): return len(self.paths) @@ -118,12 +119,10 @@ def __init__(self, name: str, nodes: List[NodeIndex]): self.nodes = nodes self.position_checkpoints = {} - def __getitem__(self, i): - return self.nodes[i] - def __repr__(self): """Warning: the representation strings are very sensitive to whitespace""" - return self.nodes.__repr__() + #return self.nodes.__repr__() + return self.name def to_gfa(self): return '\t'.join(['P', self.name, "+,".join([x.node.name + x.strand for x in self.nodes])+"+", ",".join(['*' for x in self.nodes])]) diff --git a/src/sort.py b/src/sort.py new file mode 100644 index 0000000..791f4f1 --- /dev/null +++ b/src/sort.py @@ -0,0 +1,140 @@ +from src.graph import * + +import dataclasses + +@dataclasses.dataclass +class Profile: + node: NodeIndex + paths: List[Path] + duplicate: int = 0 + + +class DAGify: + def __init__(self, paths: List[Path], nodes = {}): + """ + + :type paths: List[Path] + """ + self.paths = paths + self.nodes = nodes + self.profile = [] + + # def random_search_to_minimize_node_replication(self): + + + def recursive_merge(self, primary_path_index: int = 0): + profile = [] + for node_index in self.paths[primary_path_index].nodes: + profile.append(Profile(node_index, [self.paths[primary_path_index]], 0)) + for i, path in enumerate(self.paths): + if i == primary_path_index: + continue + profile = self.lcs(profile, path) + return profile + + def lcs(self, s1: List[Profile], s2: Path) -> List[Profile]: + n, m = len(s1), len(s2.nodes) + dp = [[0] * (m+1) for _ in range(n+1)] + + for i in range(1, n + 1): + for j in range(1, m + 1): + if s1[i-1].node == s2.nodes[j-1]: + dp[i][j] = dp[i - 1][j - 1] + 1 + else: + dp[i][j] = max(dp[i - 1][j], dp[i][j - 1]) + i, j = n, m + index = [] + prev = set() + + while i > 0 or j > 0: + if s1[i-1].node == s2.nodes[j-1]: + prev_paths = s1[i-1].paths + prev_paths.append(s2) + index.append(Profile(s1[i-1].node, prev_paths, s1[i-1].node in prev)) + prev.add(s1[i-1].node) + i -= 1 + j -= 1 + elif dp[i-1][j] > dp[i][j-1]: + prev_paths = s1[i-1].paths + index.append(Profile(s1[i-1].node, prev_paths, s1[i-1].node in prev)) + prev.add(s1[i-1].node) + i -= 1 + else: + index.append(Profile(s2.nodes[j-1], [s2], False)) + prev.add(s2.nodes[j-1]) + j -= 1 + + while i > 0: + prev_paths = s1[i - 1].paths + index.append(Profile(s1[i - 1].node, prev_paths, s1[i - 1].node in prev)) + prev.add(s1[i - 1].node) + i -= 1 + + while j > 0: + prev.add(s2.nodes[j - 1]) + index.append(Profile(s2.nodes[j - 1], [s2], False)) + j -= 1 + + index.reverse() + self.profile = index + + return index + + def to_graph(self): + factory_input = [] + current_slice = Slice([]) + for prof in self.profile: + paths = [x.name for x in prof.paths] + if len(prof.paths) == len(self.paths): + if len(current_slice.nodes) > 0: + factory_input.append(current_slice) + factory_input.append(Slice([Node(prof.node.node.seq, paths, prof.node.node.index)])) + current_slice = Slice([]) + else: + all_set = set() + for items in [x.paths for x in current_slice.nodes]: + all_set = all_set | items + if set(prof.paths) & all_set != set(): + if len(current_slice.nodes) > 0: + current_slice.add_node(Node("", set([x.name for x in self.paths]) - all_set)) + factory_input.append(current_slice) + current_slice = Slice([Node(prof.node.node.seq, paths, prof.node.node.index)]) + else: + current_slice.add_node(Node(prof.node.node.seq, paths, prof.node.node.index)) + + base_graph = Graph.load_from_slices(factory_input) + print(factory_input) + return base_graph + + def merge(A: List[NodeIndex], B: List[NodeIndex]): + pos, merged = [], [] + pi, pj, prev = 0, 0, set() + for i in range(len(A)): + for j in range(len(B)): + if pi <= i and pj <= j and A[i] == B[j]: + curr = set() + while pi < i: + curr.add(A[pi]) + pos.append( (pi, -1, A[pi] in prev) ) + merged.append(A[pi]) + pi += 1 + while pj < j: + curr.add(B[pj]) + pos.append( (-1, pj, B[pj] in prev) ) + merged.append(B[pj]) + pj += 1 + if i == pi and j == pj: + pos.append((i, j, False)) + merged.append(A[i]) + pi += 1 + pj += 1 + prev |= curr + while pi < len(A): + pos.append( (pi, -1, A[pi] in prev) ) + merged.append(A[pi]) + pi += 1 + while pj < len(B): + pos.append( (-1, pj, B[pj] in prev) ) + merged.append(B[pj]) + pj += 1 + return pos, merged \ No newline at end of file diff --git a/src/test.py b/src/test.py index 7e62f8f..b80b3f6 100644 --- a/src/test.py +++ b/src/test.py @@ -1,6 +1,7 @@ import unittest from src.gfa import GFA from src.graph import Graph, Slice, Node, NoAnchorError, PathOverlapError, NoOverlapError, NodeMissingError +from src.sort import DAGify def G(rep): """Short hand for Graph construction that returns a slice""" @@ -56,6 +57,24 @@ def test_G(self): G([['C', {1, 2, 3, 4}], ['T', {12, 16}]]) +class DAGifyTest(unittest.TestCase): + """ test class of gfa.py + """ + + def test_dagify(self): + gfa = GFA.load_from_gfa("../test/test.gfa") + paths = gfa.to_paths + dagify = DAGify(paths) + dagify.recursive_merge(0) + graph = dagify.to_graph() + + graph_by_toplogical_sort = gfa.to_graph + x = 'x' + y = 'y' + z = 'z' + self.assertEqual(graph, graph_by_toplogical_sort) + + class GFATest(unittest.TestCase): """ test class of gfa.py """ From 8e0faab24ae5097ae4e8f9ad9e3b4b2aeef63570 Mon Sep 17 00:00:00 2001 From: Toshiyuki Yokoyama Date: Tue, 9 Jul 2019 16:36:53 +0900 Subject: [PATCH 02/14] #4: Debug of dp conditions --- src/sort.py | 17 +++++++++-------- src/test.py | 12 ++++++++++++ test/test2.gfa | 2 +- test/test3.gfa | 38 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 60 insertions(+), 9 deletions(-) create mode 100644 test/test3.gfa diff --git a/src/sort.py b/src/sort.py index 791f4f1..9b05112 100644 --- a/src/sort.py +++ b/src/sort.py @@ -46,28 +46,28 @@ def lcs(self, s1: List[Profile], s2: Path) -> List[Profile]: index = [] prev = set() - while i > 0 or j > 0: + while i > 0 and j > 0: if s1[i-1].node == s2.nodes[j-1]: prev_paths = s1[i-1].paths prev_paths.append(s2) - index.append(Profile(s1[i-1].node, prev_paths, s1[i-1].node in prev)) - prev.add(s1[i-1].node) + index.append(Profile(s1[i-1].node, prev_paths, s1[i-1].node.node.index in prev)) + prev.add(s1[i-1].node.node.index) i -= 1 j -= 1 elif dp[i-1][j] > dp[i][j-1]: prev_paths = s1[i-1].paths - index.append(Profile(s1[i-1].node, prev_paths, s1[i-1].node in prev)) - prev.add(s1[i-1].node) + index.append(Profile(s1[i-1].node, prev_paths, s1[i-1].node.node.index in prev)) + prev.add(s1[i-1].node.node.index) i -= 1 else: index.append(Profile(s2.nodes[j-1], [s2], False)) - prev.add(s2.nodes[j-1]) + prev.add(s2.nodes[j-1].node.index) j -= 1 while i > 0: prev_paths = s1[i - 1].paths - index.append(Profile(s1[i - 1].node, prev_paths, s1[i - 1].node in prev)) - prev.add(s1[i - 1].node) + index.append(Profile(s1[i - 1].node, prev_paths, s1[i - 1].node.node.index in prev)) + prev.add(s1[i - 1].node.node.index) i -= 1 while j > 0: @@ -83,6 +83,7 @@ def lcs(self, s1: List[Profile], s2: Path) -> List[Profile]: def to_graph(self): factory_input = [] current_slice = Slice([]) + print(self.profile) for prof in self.profile: paths = [x.name for x in prof.paths] if len(prof.paths) == len(self.paths): diff --git a/src/test.py b/src/test.py index b80b3f6..36c525a 100644 --- a/src/test.py +++ b/src/test.py @@ -74,6 +74,18 @@ def test_dagify(self): z = 'z' self.assertEqual(graph, graph_by_toplogical_sort) + def test_dagify2(self): + gfa = GFA.load_from_gfa("../test/test2.gfa") + paths = gfa.to_paths + dagify = DAGify(paths) + dagify.recursive_merge(0) + graph = dagify.to_graph() + + graph_by_toplogical_sort = gfa.to_graph + x = 'x' + y = 'y' + z = 'z' + self.assertEqual(graph, graph_by_toplogical_sort) class GFATest(unittest.TestCase): """ test class of gfa.py diff --git a/test/test2.gfa b/test/test2.gfa index 52164c1..3d206ef 100644 --- a/test/test2.gfa +++ b/test/test2.gfa @@ -2,7 +2,7 @@ H VN:Z:1.0 P x 1+,3+,5+,6+,8+,9+,11+,12+,14+,15+ *,*,*,*,*,*,*,*,* P y 1+,2+,5+,6+,8+,9+,11+,12+,14+,15+ *,*,*,*,*,*,*,*,* P z 1+,2+,4+,6+,7+,9+,10+,12+,14+,15+ *,*,*,*,*,*,*,*,* -P a 12+,13+,15+ *,* +P a 7+,9+,10+ *,* S 1 CAAATAAG L 1 + 2 + 0M L 1 + 3 + 0M diff --git a/test/test3.gfa b/test/test3.gfa new file mode 100644 index 0000000..705218b --- /dev/null +++ b/test/test3.gfa @@ -0,0 +1,38 @@ +H VN:Z:1.0 +P x 1+,3+,5+,6+,8+,9+,11+,12+,14+,15+ *,*,*,*,*,*,*,*,* +P y 1+,15+,5+,6+,8+,9+,11+,12+,14+,15+ *,*,*,*,*,*,*,*,* +S 1 CAAATAAG +L 1 + 15 + 0M +L 1 + 3 + 0M +S 2 A +L 2 + 4 + 0M +L 15 + 5 + 0M +S 3 G +L 3 + 4 + 0M +L 3 + 5 + 0M +S 4 T +L 4 + 6 + 0M +S 5 C +L 5 + 6 + 0M +S 6 TTG +L 6 + 7 + 0M +L 6 + 8 + 0M +S 7 A +L 7 + 9 + 0M +S 8 G +L 8 + 9 + 0M +S 9 AAATTTTCTGGAGTTCTAT +L 9 + 10 + 0M +L 9 + 11 + 0M +S 10 A +L 10 + 12 + 0M +S 11 T +L 11 + 12 + 0M +S 12 ATAT +L 12 + 13 + 0M +L 12 + 14 + 0M +S 13 A +L 13 + 15 + 0M +S 14 T +L 14 + 15 + 0M +S 15 CCAACTCTCTG From b88f9a12edc2735c587a807efa60801e67babf99 Mon Sep 17 00:00:00 2001 From: Toshiyuki Yokoyama Date: Tue, 9 Jul 2019 23:01:32 +0900 Subject: [PATCH 03/14] #4: Add candidate_paths for store the tips --- src/sort.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/sort.py b/src/sort.py index 9b05112..68447d9 100644 --- a/src/sort.py +++ b/src/sort.py @@ -6,6 +6,7 @@ class Profile: node: NodeIndex paths: List[Path] + candidate_paths: List[Path] duplicate: int = 0 @@ -25,7 +26,7 @@ def __init__(self, paths: List[Path], nodes = {}): def recursive_merge(self, primary_path_index: int = 0): profile = [] for node_index in self.paths[primary_path_index].nodes: - profile.append(Profile(node_index, [self.paths[primary_path_index]], 0)) + profile.append(Profile(node_index, [self.paths[primary_path_index]], [self.paths[primary_path_index]], 0)) for i, path in enumerate(self.paths): if i == primary_path_index: continue @@ -50,6 +51,8 @@ def lcs(self, s1: List[Profile], s2: Path) -> List[Profile]: if s1[i-1].node == s2.nodes[j-1]: prev_paths = s1[i-1].paths prev_paths.append(s2) + candidate_paths = s1[i-1].candidate_paths + index.append(Profile(s1[i-1].node, prev_paths, s1[i-1].node.node.index in prev)) prev.add(s1[i-1].node.node.index) i -= 1 From 88d1a7b8e9d1ee11f4d2fa8b5f3697d39a60a31e Mon Sep 17 00:00:00 2001 From: Toshiyuki Yokoyama Date: Wed, 10 Jul 2019 12:58:21 +0900 Subject: [PATCH 04/14] #4: Fix dagify method for graphs that have a duplication (WIP) --- src/sort.py | 90 +++++++++++++++++++++++------------------------------ src/test.py | 15 +++++++-- 2 files changed, 51 insertions(+), 54 deletions(-) diff --git a/src/sort.py b/src/sort.py index 68447d9..5de1472 100644 --- a/src/sort.py +++ b/src/sort.py @@ -6,8 +6,8 @@ class Profile: node: NodeIndex paths: List[Path] - candidate_paths: List[Path] - duplicate: int = 0 + candidate_paths: set() + duplicate: bool = False class DAGify: @@ -20,13 +20,20 @@ def __init__(self, paths: List[Path], nodes = {}): self.nodes = nodes self.profile = [] - # def random_search_to_minimize_node_replication(self): - - - def recursive_merge(self, primary_path_index: int = 0): + def search_for_minimizing_replications(self) -> (List[Profile], int): + min_rep = len(self.nodes) + profile = [] + for i, _ in enumerate(self.paths): + profile_candidate = self.recursive_merge(i) + if min_rep > sum([x for x in profile if x]): + min_rep = sum([x for x in profile if x]) + profile = profile_candidate + return profile, min_rep + + def recursive_merge(self, primary_path_index: int = 0) -> List[Profile]: profile = [] for node_index in self.paths[primary_path_index].nodes: - profile.append(Profile(node_index, [self.paths[primary_path_index]], [self.paths[primary_path_index]], 0)) + profile.append(Profile(node_index, [self.paths[primary_path_index]], {self.paths[primary_path_index].name}, 0)) for i, path in enumerate(self.paths): if i == primary_path_index: continue @@ -46,36 +53,48 @@ def lcs(self, s1: List[Profile], s2: Path) -> List[Profile]: i, j = n, m index = [] prev = set() + candidate_path_flag = False while i > 0 and j > 0: if s1[i-1].node == s2.nodes[j-1]: prev_paths = s1[i-1].paths prev_paths.append(s2) candidate_paths = s1[i-1].candidate_paths + candidate_paths.add(s2.name) + candidate_path_flag = True - index.append(Profile(s1[i-1].node, prev_paths, s1[i-1].node.node.index in prev)) + index.append(Profile(s1[i-1].node, prev_paths, candidate_paths, s1[i-1].node.node.index in prev)) prev.add(s1[i-1].node.node.index) i -= 1 j -= 1 elif dp[i-1][j] > dp[i][j-1]: prev_paths = s1[i-1].paths - index.append(Profile(s1[i-1].node, prev_paths, s1[i-1].node.node.index in prev)) + candidate_paths = s1[i-1].candidate_paths + if candidate_path_flag: + candidate_paths.add(s2.name) + index.append(Profile(s1[i-1].node, prev_paths, candidate_paths, s1[i-1].node.node.index in prev)) prev.add(s1[i-1].node.node.index) i -= 1 else: - index.append(Profile(s2.nodes[j-1], [s2], False)) + candidate_paths = {s2.name} + if s1[i]: + candidate_paths |= s1[i].candidate_paths + if s1[i-1]: + candidate_paths |= s1[i-1].candidate_paths + index.append(Profile(s2.nodes[j-1], [s2], candidate_paths, False)) prev.add(s2.nodes[j-1].node.index) j -= 1 while i > 0: prev_paths = s1[i - 1].paths - index.append(Profile(s1[i - 1].node, prev_paths, s1[i - 1].node.node.index in prev)) + prev_candidates = s1[i-1].candidate_paths + index.append(Profile(s1[i - 1].node, prev_paths, prev_candidates, s1[i - 1].node.node.index in prev)) prev.add(s1[i - 1].node.node.index) i -= 1 while j > 0: prev.add(s2.nodes[j - 1]) - index.append(Profile(s2.nodes[j - 1], [s2], False)) + index.append(Profile(s2.nodes[j - 1], [s2], {s2.name}, False)) j -= 1 index.reverse() @@ -86,10 +105,10 @@ def lcs(self, s1: List[Profile], s2: Path) -> List[Profile]: def to_graph(self): factory_input = [] current_slice = Slice([]) - print(self.profile) + # print(self.profile) for prof in self.profile: paths = [x.name for x in prof.paths] - if len(prof.paths) == len(self.paths): + if len(prof.paths) == len(prof.candidate_paths): if len(current_slice.nodes) > 0: factory_input.append(current_slice) factory_input.append(Slice([Node(prof.node.node.seq, paths, prof.node.node.index)])) @@ -97,48 +116,17 @@ def to_graph(self): else: all_set = set() for items in [x.paths for x in current_slice.nodes]: - all_set = all_set | items - if set(prof.paths) & all_set != set(): + all_set |= items + # print(all_set, prof.candidate_paths, prof.paths, set([x.name for x in prof.paths]) & all_set) + if set([x.name for x in prof.paths]) & all_set != set(): if len(current_slice.nodes) > 0: - current_slice.add_node(Node("", set([x.name for x in self.paths]) - all_set)) + if prof.candidate_paths - all_set != set(): + current_slice.add_node(Node("", prof.candidate_paths - all_set)) factory_input.append(current_slice) current_slice = Slice([Node(prof.node.node.seq, paths, prof.node.node.index)]) else: current_slice.add_node(Node(prof.node.node.seq, paths, prof.node.node.index)) base_graph = Graph.load_from_slices(factory_input) - print(factory_input) + # print(factory_input) return base_graph - - def merge(A: List[NodeIndex], B: List[NodeIndex]): - pos, merged = [], [] - pi, pj, prev = 0, 0, set() - for i in range(len(A)): - for j in range(len(B)): - if pi <= i and pj <= j and A[i] == B[j]: - curr = set() - while pi < i: - curr.add(A[pi]) - pos.append( (pi, -1, A[pi] in prev) ) - merged.append(A[pi]) - pi += 1 - while pj < j: - curr.add(B[pj]) - pos.append( (-1, pj, B[pj] in prev) ) - merged.append(B[pj]) - pj += 1 - if i == pi and j == pj: - pos.append((i, j, False)) - merged.append(A[i]) - pi += 1 - pj += 1 - prev |= curr - while pi < len(A): - pos.append( (pi, -1, A[pi] in prev) ) - merged.append(A[pi]) - pi += 1 - while pj < len(B): - pos.append( (-1, pj, B[pj] in prev) ) - merged.append(B[pj]) - pj += 1 - return pos, merged \ No newline at end of file diff --git a/src/test.py b/src/test.py index 36c525a..d366df6 100644 --- a/src/test.py +++ b/src/test.py @@ -58,7 +58,7 @@ def test_G(self): class DAGifyTest(unittest.TestCase): - """ test class of gfa.py + """ test class of sort.py """ def test_dagify(self): @@ -81,11 +81,20 @@ def test_dagify2(self): dagify.recursive_merge(0) graph = dagify.to_graph() - graph_by_toplogical_sort = gfa.to_graph + a = 'a' x = 'x' y = 'y' z = 'z' - self.assertEqual(graph, graph_by_toplogical_sort) + self.assertEqual(graph, [['CAAATAAG', {x, y, z}], ['G', {x}, 'A', {y, z}], ['C', {x, y}, 'T', {z}], ['TTG', {x, y, z}], ['G', {x, y}, 'A', {a, z}], ['AAATTTTCTGGAGTTCTAT', {a, x, y, z}], ['A', {a, z}, 'T', {x, y}], ['ATAT', {x, y, z}], ['T', {x, y, z}], ['CCAACTCTCTG', {x, y, z}]]) + + def test_dagify3(self): + gfa = GFA.load_from_gfa("../test/test3.gfa") + paths = gfa.to_paths + dagify = DAGify(paths) + dagify.recursive_merge(0) + graph = dagify.to_graph() + print(graph) + class GFATest(unittest.TestCase): """ test class of gfa.py From 00148e91cc3d28b7db3d9414cd9cde0097bed417 Mon Sep 17 00:00:00 2001 From: Toshiyuki Yokoyama Date: Wed, 10 Jul 2019 17:34:48 +0900 Subject: [PATCH 05/14] #4: Fix dagify method for graphs that have an alternative paths --- src/sort.py | 20 ++++----- src/test.py | 44 +++++++++++++------ ...ernately_paths.gfa => alternate_paths.gfa} | 0 3 files changed, 40 insertions(+), 24 deletions(-) rename test/{alternately_paths.gfa => alternate_paths.gfa} (100%) diff --git a/src/sort.py b/src/sort.py index 5de1472..003b181 100644 --- a/src/sort.py +++ b/src/sort.py @@ -9,31 +9,32 @@ class Profile: candidate_paths: set() duplicate: bool = False + def __repr__(self): + return "["+str(self.node.node) + str(self.paths)+"]" class DAGify: - def __init__(self, paths: List[Path], nodes = {}): + def __init__(self, paths: List[Path], nodes={}): """ :type paths: List[Path] """ self.paths = paths self.nodes = nodes - self.profile = [] def search_for_minimizing_replications(self) -> (List[Profile], int): - min_rep = len(self.nodes) + min_rep = sys.maxsize profile = [] for i, _ in enumerate(self.paths): profile_candidate = self.recursive_merge(i) - if min_rep > sum([x for x in profile if x]): - min_rep = sum([x for x in profile if x]) + if min_rep > len([x.duplicate for x in profile_candidate if x.duplicate]): + min_rep = len([x.duplicate for x in profile_candidate if x.duplicate]) profile = profile_candidate return profile, min_rep def recursive_merge(self, primary_path_index: int = 0) -> List[Profile]: profile = [] for node_index in self.paths[primary_path_index].nodes: - profile.append(Profile(node_index, [self.paths[primary_path_index]], {self.paths[primary_path_index].name}, 0)) + profile.append(Profile(node_index, [self.paths[primary_path_index]], {self.paths[primary_path_index].name}, False)) for i, path in enumerate(self.paths): if i == primary_path_index: continue @@ -81,7 +82,7 @@ def lcs(self, s1: List[Profile], s2: Path) -> List[Profile]: candidate_paths |= s1[i].candidate_paths if s1[i-1]: candidate_paths |= s1[i-1].candidate_paths - index.append(Profile(s2.nodes[j-1], [s2], candidate_paths, False)) + index.append(Profile(s2.nodes[j-1], [s2], candidate_paths, s2.nodes[j-1].node.index in prev)) prev.add(s2.nodes[j-1].node.index) j -= 1 @@ -98,15 +99,14 @@ def lcs(self, s1: List[Profile], s2: Path) -> List[Profile]: j -= 1 index.reverse() - self.profile = index return index - def to_graph(self): + def to_graph(self, profile: List[Profile]): factory_input = [] current_slice = Slice([]) # print(self.profile) - for prof in self.profile: + for prof in profile: paths = [x.name for x in prof.paths] if len(prof.paths) == len(prof.candidate_paths): if len(current_slice.nodes) > 0: diff --git a/src/test.py b/src/test.py index d366df6..e8a5061 100644 --- a/src/test.py +++ b/src/test.py @@ -56,11 +56,15 @@ def test_G(self): with self.assertRaises(ValueError): G([['C', {1, 2, 3, 4}], ['T', {12, 16}]]) - +a = 'a' +x = 'x' +y = 'y' +z = 'z' class DAGifyTest(unittest.TestCase): """ test class of sort.py """ + def test_dagify(self): gfa = GFA.load_from_gfa("../test/test.gfa") paths = gfa.to_paths @@ -69,31 +73,43 @@ def test_dagify(self): graph = dagify.to_graph() graph_by_toplogical_sort = gfa.to_graph - x = 'x' - y = 'y' - z = 'z' self.assertEqual(graph, graph_by_toplogical_sort) def test_dagify2(self): gfa = GFA.load_from_gfa("../test/test2.gfa") paths = gfa.to_paths dagify = DAGify(paths) - dagify.recursive_merge(0) - graph = dagify.to_graph() - - a = 'a' - x = 'x' - y = 'y' - z = 'z' + profile = dagify.recursive_merge(0) + graph = dagify.to_graph(profile) self.assertEqual(graph, [['CAAATAAG', {x, y, z}], ['G', {x}, 'A', {y, z}], ['C', {x, y}, 'T', {z}], ['TTG', {x, y, z}], ['G', {x, y}, 'A', {a, z}], ['AAATTTTCTGGAGTTCTAT', {a, x, y, z}], ['A', {a, z}, 'T', {x, y}], ['ATAT', {x, y, z}], ['T', {x, y, z}], ['CCAACTCTCTG', {x, y, z}]]) def test_dagify3(self): gfa = GFA.load_from_gfa("../test/test3.gfa") paths = gfa.to_paths dagify = DAGify(paths) - dagify.recursive_merge(0) - graph = dagify.to_graph() - print(graph) + profile, rep_count = dagify.search_for_minimizing_replications() + graph = dagify.to_graph(profile) + self.assertEqual(rep_count, 1) + self.assertEqual(graph, [['CAAATAAG', {x, y}], ['CCAACTCTCTG', {y}, 'G', {x}], ['C', {x, y}], ['TTG', {x, y}], ['G', {x, y}], ['AAATTTTCTGGAGTTCTAT', {x, y}], ['T', {x, y}], ['ATAT', {x, y}], ['T', {x, y}], ['CCAACTCTCTG', {x, y}]]) + + def test_dagify_altpath(self): + gfa = GFA.load_from_gfa("../test/alternate_paths.gfa") + paths = gfa.to_paths + dagify = DAGify(paths) + profile, rep_count = dagify.search_for_minimizing_replications() + graph = dagify.to_graph(profile) + self.assertEqual(rep_count, 1) + self.assertEqual(graph, [['CAAATAAG', {x, y}], ['A', {x}], ['G', {x, y}], ['A', {y}], ['T', {x, y}]]) + + def test_dagify_dup(self): + gfa = GFA.load_from_gfa("../test/duplicate.gfa") + paths = gfa.to_paths + dagify = DAGify(paths) + profile, rep_count = dagify.search_for_minimizing_replications() + graph = dagify.to_graph(profile) + self.assertEqual(rep_count, 2) + self.assertEqual(graph, [['CAAATAAG', {x, y}], ['', {x}, 'A', {y}], ['G', {y}], ['A', {x, y}], ['G', {x, y}], ['T', {x, y}]]) + class GFATest(unittest.TestCase): diff --git a/test/alternately_paths.gfa b/test/alternate_paths.gfa similarity index 100% rename from test/alternately_paths.gfa rename to test/alternate_paths.gfa From c9da4b99ea01ac82291c9d20db4697d87c11e505 Mon Sep 17 00:00:00 2001 From: Toshiyuki Yokoyama Date: Wed, 17 Jul 2019 22:44:34 +0900 Subject: [PATCH 06/14] #13: Fix class definition (WIP) --- src/gfa.py | 4 ++-- src/graph.py | 15 +++++++++---- src/sort.py | 59 +++++++++++++++++++++++++++++----------------------- src/test.py | 16 +++++++------- 4 files changed, 53 insertions(+), 41 deletions(-) diff --git a/src/gfa.py b/src/gfa.py index 0f35741..3195e73 100644 --- a/src/gfa.py +++ b/src/gfa.py @@ -131,7 +131,7 @@ def to_paths(self) -> List[Path]: for path in self.gfa.paths: nodes = [] for node in path.segment_names: - node_index = NodeIndex(Node(node_hash[node.name + node.orient].seq, [], node.name), node.orient) + node_index = NodeTraversal(Node(node_hash[node.name + node.orient].seq, [], node.name), node.orient) nodes.append(node_index) paths.append(Path(path.name, nodes)) @@ -196,7 +196,7 @@ def to_graph(self): else: current_slice.add_node(node_hash[node]) - base_graph = Graph.load_from_slices(factory_input) + base_graph = SlicedGraph.load_from_slices(factory_input, self.gfa.paths) return base_graph diff --git a/src/graph.py b/src/graph.py index 1a9f21d..1212995 100644 --- a/src/graph.py +++ b/src/graph.py @@ -125,9 +125,9 @@ class Path: was sequenced. A path visits a series of nodes and the ordered concatenation of the node sequences is the accession's genome. Create Paths first from accession names, then append them to Nodes to link together.""" - def __init__(self, accession: str): + def __init__(self, accession: str, nodes = []): self.accession = accession # one path per accessions - self.nodes = [] # List[NodeTraversal] + self.nodes = nodes # List[NodeTraversal] self.position_checkpoints = {} # TODO: currently not used def __getitem__(self, path_index): @@ -150,6 +150,9 @@ def append_node(self, node: Node, strand: str): node.paths.add(PathIndex(self, len(self.nodes)-1)) # already appended node return node + def name(self): + return self.accession + def to_gfa(self): return '\t'.join(['P', self.accession, "+,".join([x.node.name + x.strand for x in self.nodes]) + "+", ",".join(['*' for x in self.nodes])]) @@ -165,7 +168,7 @@ def __repr__(self): return repr(self.path.accession) def __eq__(self, other): - if self.path.accession == other.path.accession and self.index == other.index: + if self.path.accession == other.path.accession: # and self.index == other.index: return True else: return False @@ -174,7 +177,7 @@ def __lt__(self, other): return self.path.accession < other.path.accession def __hash__(self): - return hash(self.path.accession) * (self.index if self.index else 1) + return hash(self.path.accession) # * (self.index if self.index else 1) class NodeTraversal: @@ -186,6 +189,9 @@ def __init__(self, node: Node, strand: str = '+'): def __repr__(self): return self.node.seq + def __eq__(self, other): + return self.node.id == other.node.id + class Graph: def __init__(self, paths: Iterable = None): @@ -250,6 +256,7 @@ def __init__(self, paths): self.compute_slices() def __eq__(self, representation): + print(self,representation) if isinstance(representation, SlicedGraph): return all(slice_a == slice_b for slice_a, slice_b in zip_longest(self.slices, representation.slices)) return self == SlicedGraph.build(representation) # build a graph then compare it diff --git a/src/sort.py b/src/sort.py index 003b181..ad41c60 100644 --- a/src/sort.py +++ b/src/sort.py @@ -4,7 +4,7 @@ @dataclasses.dataclass class Profile: - node: NodeIndex + node: NodeTraversal paths: List[Path] candidate_paths: set() duplicate: bool = False @@ -34,7 +34,7 @@ def search_for_minimizing_replications(self) -> (List[Profile], int): def recursive_merge(self, primary_path_index: int = 0) -> List[Profile]: profile = [] for node_index in self.paths[primary_path_index].nodes: - profile.append(Profile(node_index, [self.paths[primary_path_index]], {self.paths[primary_path_index].name}, False)) + profile.append(Profile(node_index, [self.paths[primary_path_index]], {self.paths[primary_path_index]}, False)) for i, path in enumerate(self.paths): if i == primary_path_index: continue @@ -55,47 +55,49 @@ def lcs(self, s1: List[Profile], s2: Path) -> List[Profile]: index = [] prev = set() candidate_path_flag = False +# print(s1., s2.nodes) while i > 0 and j > 0: if s1[i-1].node == s2.nodes[j-1]: prev_paths = s1[i-1].paths prev_paths.append(s2) candidate_paths = s1[i-1].candidate_paths - candidate_paths.add(s2.name) + candidate_paths.add(s2) candidate_path_flag = True - index.append(Profile(s1[i-1].node, prev_paths, candidate_paths, s1[i-1].node.node.index in prev)) - prev.add(s1[i-1].node.node.index) + index.append(Profile(s1[i-1].node, prev_paths, candidate_paths, s1[i-1].node.node.id in prev)) + prev.add(s1[i-1].node.node.id) i -= 1 j -= 1 elif dp[i-1][j] > dp[i][j-1]: prev_paths = s1[i-1].paths candidate_paths = s1[i-1].candidate_paths if candidate_path_flag: - candidate_paths.add(s2.name) - index.append(Profile(s1[i-1].node, prev_paths, candidate_paths, s1[i-1].node.node.index in prev)) - prev.add(s1[i-1].node.node.index) + candidate_paths.add(s2) + index.append(Profile(s1[i-1].node, prev_paths, candidate_paths, s1[i-1].node.node.id in prev)) + prev.add(s1[i-1].node.node.id) i -= 1 else: - candidate_paths = {s2.name} + candidate_paths = {s2} if s1[i]: candidate_paths |= s1[i].candidate_paths if s1[i-1]: candidate_paths |= s1[i-1].candidate_paths - index.append(Profile(s2.nodes[j-1], [s2], candidate_paths, s2.nodes[j-1].node.index in prev)) - prev.add(s2.nodes[j-1].node.index) + index.append(Profile(s2.nodes[j-1], [s2], candidate_paths, s2.nodes[j-1].node.id in prev)) + prev.add(s2.nodes[j-1].node.id) j -= 1 while i > 0: prev_paths = s1[i - 1].paths prev_candidates = s1[i-1].candidate_paths - index.append(Profile(s1[i - 1].node, prev_paths, prev_candidates, s1[i - 1].node.node.index in prev)) - prev.add(s1[i - 1].node.node.index) + index.append(Profile(s1[i - 1].node, prev_paths, prev_candidates, s1[i - 1].node.node.id in prev)) + prev.add(s1[i - 1].node.node.id) i -= 1 while j > 0: - prev.add(s2.nodes[j - 1]) - index.append(Profile(s2.nodes[j - 1], [s2], {s2.name}, False)) + print(s2.nodes[j - 1], type(s2.nodes[j - 1])) + prev.add(s2.nodes[j - 1].node.id) + index.append(Profile(s2.nodes[j - 1], [s2], {s2}, False)) j -= 1 index.reverse() @@ -105,28 +107,33 @@ def lcs(self, s1: List[Profile], s2: Path) -> List[Profile]: def to_graph(self, profile: List[Profile]): factory_input = [] current_slice = Slice([]) - # print(self.profile) + current_paths = [] for prof in profile: - paths = [x.name for x in prof.paths] + paths = [x for x in prof.paths] if len(prof.paths) == len(prof.candidate_paths): if len(current_slice.nodes) > 0: factory_input.append(current_slice) - factory_input.append(Slice([Node(prof.node.node.seq, paths, prof.node.node.index)])) + factory_input.append(Slice([Node(prof.node.node.seq, paths, prof.node.node.id)])) current_slice = Slice([]) + current_paths = [] else: + all_path_set = set([x for x in current_paths]) all_set = set() - for items in [x.paths for x in current_slice.nodes]: - all_set |= items + for items in [x.paths for x in current_slice]: + items = set(items) #print(type(list(items)[0])) + all_set |= items # print(all_set, prof.candidate_paths, prof.paths, set([x.name for x in prof.paths]) & all_set) - if set([x.name for x in prof.paths]) & all_set != set(): + if set([x for x in prof.paths]) & all_path_set != set(): if len(current_slice.nodes) > 0: - if prof.candidate_paths - all_set != set(): - current_slice.add_node(Node("", prof.candidate_paths - all_set)) + if prof.candidate_paths - all_path_set != set(): + current_slice.add_node(Node("", prof.candidate_paths - all_path_set)) factory_input.append(current_slice) - current_slice = Slice([Node(prof.node.node.seq, paths, prof.node.node.index)]) + current_slice = Slice([Node(prof.node.node.seq, paths, prof.node.node.id)]) + current_paths = paths else: - current_slice.add_node(Node(prof.node.node.seq, paths, prof.node.node.index)) + current_slice.add_node(Node(prof.node.node.seq, paths, prof.node.node.id)) + current_paths.extend(paths) - base_graph = Graph.load_from_slices(factory_input) + base_graph = SlicedGraph.load_from_slices(factory_input, self.paths) # print(factory_input) return base_graph diff --git a/src/test.py b/src/test.py index c51e431..ae564de 100644 --- a/src/test.py +++ b/src/test.py @@ -107,11 +107,8 @@ def pf(wd, path): # Define several test example directories PATH_TO_TEST_DATA = pf(WD, "test/") +x,y,z,a = 'x', 'y', 'z', 'a' -a = 'a' -x = 'x' -y = 'y' -z = 'z' class DAGifyTest(unittest.TestCase): """ test class of sort.py """ @@ -121,11 +118,11 @@ def test_dagify(self): gfa = GFA.load_from_gfa("../test/test.gfa") paths = gfa.to_paths dagify = DAGify(paths) - dagify.recursive_merge(0) - graph = dagify.to_graph() + profile = dagify.recursive_merge(0) + graph = dagify.to_graph(profile) +# x, y, z = graph.paths['x'], graph.paths['y'], graph.paths['z'] - graph_by_toplogical_sort = gfa.to_graph - self.assertEqual(graph, graph_by_toplogical_sort) + self.assertEqual([['CAAATAAG', {x,y,z}], ['A', {y,z}, 'G', {x}], ['C', {x,y,z}], ['TTG', {x,y,z}], ['A', {z}, 'G', {x,y}], ['AAATTTTCTGGAGTTCTAT', {x,y,z}], ['T', {x,y,z}], ['ATAT', {x,y,z}], ['T', {x,y,z}], ['CCAACTCTCTG', {x,y,z}]], graph) def test_dagify2(self): gfa = GFA.load_from_gfa("../test/test2.gfa") @@ -133,7 +130,8 @@ def test_dagify2(self): dagify = DAGify(paths) profile = dagify.recursive_merge(0) graph = dagify.to_graph(profile) - self.assertEqual(graph, [['CAAATAAG', {x, y, z}], ['G', {x}, 'A', {y, z}], ['C', {x, y}, 'T', {z}], ['TTG', {x, y, z}], ['G', {x, y}, 'A', {a, z}], ['AAATTTTCTGGAGTTCTAT', {a, x, y, z}], ['A', {a, z}, 'T', {x, y}], ['ATAT', {x, y, z}], ['T', {x, y, z}], ['CCAACTCTCTG', {x, y, z}]]) + x,y,z,a = 'x', 'y', 'z', 'a' + self.assertEqual([['CAAATAAG', {x, y, z}], ['G', {x}, 'A', {y, z}], ['C', {x, y}, 'T', {z}], ['TTG', {x, y, z}], ['G', {x, y}, 'A', {a, z}], ['AAATTTTCTGGAGTTCTAT', {a, x, y, z}], ['A', {a, z}, 'T', {x, y}], ['ATAT', {x, y, z}], ['T', {x, y, z}], ['CCAACTCTCTG', {x, y, z}]], graph) def test_dagify3(self): gfa = GFA.load_from_gfa("../test/test3.gfa") From 6db2e7ce595fe3e6aac525da5d17f11e5a6e5e2d Mon Sep 17 00:00:00 2001 From: Toshiyuki Yokoyama Date: Thu, 18 Jul 2019 12:02:35 +0900 Subject: [PATCH 07/14] #13: Update compute_slices to use DAGify (WIP) --- src/graph.py | 15 ++++++++++++++- src/test.py | 11 +++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/src/graph.py b/src/graph.py index 1212995..fa526b7 100644 --- a/src/graph.py +++ b/src/graph.py @@ -246,6 +246,9 @@ def compute_slices(self): return SlicedGraph.from_graph(self) +from sort import DAGify + + class SlicedGraph(Graph): def __init__(self, paths): super(SlicedGraph, self).__init__(paths) @@ -256,7 +259,6 @@ def __init__(self, paths): self.compute_slices() def __eq__(self, representation): - print(self,representation) if isinstance(representation, SlicedGraph): return all(slice_a == slice_b for slice_a, slice_b in zip_longest(self.slices, representation.slices)) return self == SlicedGraph.build(representation) # build a graph then compare it @@ -286,6 +288,16 @@ def compute_slices(self): self.slices.append(Slice([node])) return self + def compute_slices_by_dagify(self): + """This method uses DAGify algorithm to compute slices.""" + if not self.paths: + return self + dagify = DAGify(self.paths) + profile = dagify.recursive_merge(0) + graph = dagify.to_graph(profile) + self.slices = graph.slices + return self + @staticmethod def build(cmd): """This factory uses existing slice declarations to build a graph with Paths populated in the order @@ -296,6 +308,7 @@ def build(cmd): # preemptively grab all the path names from every odd list entry paths = {key for sl in cmd for i in range(0, len(sl), 2) for key in sl[i + 1]} graph = SlicedGraph(paths) + graph.slices = [] for sl in cmd: current_slice = [] if isinstance(sl, Slice): diff --git a/src/test.py b/src/test.py index ae564de..9dee219 100644 --- a/src/test.py +++ b/src/test.py @@ -192,6 +192,17 @@ def test_gfa_to_sliced_graph(self): print(slices) self.assertEqual(slices, [['CAAATAAG', {x, y, z}], ['A', {y, z}, 'G', {x}], ['C', {x, y, z}], ['TTG', {x, y, z}], ['A', {z}, 'G', {x, y}], ['AAATTTTCTGGAGTTCTAT', {x, y, z}], ['T', {x, y, z}], ['ATAT', {x, y, z}], ['T', {x, y, z}], ['CCAACTCTCTG', {x, y, z}]]) + def test_gfa_to_sliced_graph_via_dagify(self): + #TODO: this is currently close but not quite there. + # Slices must be fully defined in SlicedGraph.compute_slices() + graph, gfa = self.make_graph_from_gfa() + slices = SlicedGraph.from_graph(graph) + x = 'x' + y = 'y' + z = 'z' + print(slices) + self.assertEqual(slices, [['CAAATAAG', {x, y, z}], ['A', {y, z}, 'G', {x}], ['C', {x, y, z}], ['TTG', {x, y, z}], ['A', {z}, 'G', {x, y}], ['AAATTTTCTGGAGTTCTAT', {x, y, z}], ['T', {x, y, z}], ['ATAT', {x, y, z}], ['T', {x, y, z}], ['CCAACTCTCTG', {x, y, z}]]) + def make_graph_from_gfa(self): gfa = GFA.load_from_gfa(PATH_TO_TEST_DATA + "test.gfa") graph = gfa.to_graph From 47e760404bc9723bb3d12c250bd9039e2fc9496b Mon Sep 17 00:00:00 2001 From: Toshiyuki Yokoyama Date: Thu, 18 Jul 2019 15:58:31 +0900 Subject: [PATCH 08/14] #13: Refactoring --- src/gfa.py | 50 +------------------------------------------------- src/graph.py | 6 +++--- src/sort.py | 11 +++++------ 3 files changed, 9 insertions(+), 58 deletions(-) diff --git a/src/gfa.py b/src/gfa.py index 3195e73..7bd7fea 100644 --- a/src/gfa.py +++ b/src/gfa.py @@ -147,58 +147,10 @@ def to_graph(self): graph.append_node_to_path(node.name, node.orient, path.name) for segment in self.gfa.segments: graph.nodes[segment.name].seq = segment.sequence + graph.paths = self.to_paths return graph # IMPORTANT: It's not clear to Josiah how much of the below is necessary, so it's being left unmodified. - topological_sort_helper = TopologicalSort() - path_dict = defaultdict(list) - node_hash = {} - - # Extract all paths into graph - for path in self.gfa.paths: - for node in path.segment_names: - path_dict[node.name + node.orient].append(path.name) - for node_pair in pairwise(path.segment_names): - topological_sort_helper.add_edge( - node_pair[0].name + node_pair[0].orient, - node_pair[1].name + node_pair[1].orient) - - # Extract all nodes in the graph. - for segment in self.gfa.segments: - node_id = segment.name + "+" - node = Node(segment.sequence, path_dict[node_id]) - node_hash[node_id] = node - - node_id = segment.name + "-" - node = Node(segment.sequence, path_dict[node_id]) - node_hash[node_id] = node - - node_stack = topological_sort_helper.topologicalSort() - - # Cluster nodes as multiple slices according to the result of the topological sort. - factory_input = [] - current_slice = Slice([]) - for node in node_stack: - if len(path_dict[node]) == len(self.gfa.paths): - if len(current_slice.nodes) > 0: - factory_input.append(current_slice) - factory_input.append(Slice([node_hash[node]])) - current_slice = Slice([]) - else: - all_set = set() - for items in [x.paths for x in current_slice.nodes]: - all_set = all_set | items - if set(path_dict[node]) & all_set != set(): - if len(current_slice.nodes) > 0: - current_slice.add_node(Node("", set([x.name for x in self.gfa.paths]) - all_set)) - factory_input.append(current_slice) - current_slice = Slice([node_hash[node]]) - else: - current_slice.add_node(node_hash[node]) - - base_graph = SlicedGraph.load_from_slices(factory_input, self.gfa.paths) - return base_graph - ''' class XGWrapper: diff --git a/src/graph.py b/src/graph.py index fa526b7..7657c29 100644 --- a/src/graph.py +++ b/src/graph.py @@ -275,7 +275,7 @@ def from_graph(graph): g = SlicedGraph([]) g.paths = graph.paths # shallow copy all relevant fields g.nodes = graph.nodes - g.compute_slices() + g.compute_slices_by_dagify() return g def compute_slices(self): @@ -294,8 +294,8 @@ def compute_slices_by_dagify(self): return self dagify = DAGify(self.paths) profile = dagify.recursive_merge(0) - graph = dagify.to_graph(profile) - self.slices = graph.slices + slices = dagify.to_slices(profile) + self.slices = slices return self @staticmethod diff --git a/src/sort.py b/src/sort.py index ad41c60..fdd3a09 100644 --- a/src/sort.py +++ b/src/sort.py @@ -104,7 +104,7 @@ def lcs(self, s1: List[Profile], s2: Path) -> List[Profile]: return index - def to_graph(self, profile: List[Profile]): + def to_slices(self, profile: List[Profile]): factory_input = [] current_slice = Slice([]) current_paths = [] @@ -118,11 +118,6 @@ def to_graph(self, profile: List[Profile]): current_paths = [] else: all_path_set = set([x for x in current_paths]) - all_set = set() - for items in [x.paths for x in current_slice]: - items = set(items) #print(type(list(items)[0])) - all_set |= items - # print(all_set, prof.candidate_paths, prof.paths, set([x.name for x in prof.paths]) & all_set) if set([x for x in prof.paths]) & all_path_set != set(): if len(current_slice.nodes) > 0: if prof.candidate_paths - all_path_set != set(): @@ -133,7 +128,11 @@ def to_graph(self, profile: List[Profile]): else: current_slice.add_node(Node(prof.node.node.seq, paths, prof.node.node.id)) current_paths.extend(paths) + return factory_input + + def to_graph(self, profile: List[Profile]): + factory_input = self.to_slices(profile) base_graph = SlicedGraph.load_from_slices(factory_input, self.paths) # print(factory_input) return base_graph From 33370e94cddfffb573502b048184b1bb582a113e Mon Sep 17 00:00:00 2001 From: Toshiyuki Yokoyama Date: Thu, 18 Jul 2019 16:16:52 +0900 Subject: [PATCH 09/14] #13: Hotfix --- src/gfa.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gfa.py b/src/gfa.py index 7bd7fea..738d7fb 100644 --- a/src/gfa.py +++ b/src/gfa.py @@ -108,7 +108,7 @@ def save_as_gfa(self, file: str): def from_graph(cls, graph: Graph): """Constructs the lines of a GFA file listing paths, then sequence nodes in arbitrary order.""" gfa = gfapy.Gfa() - for path in graph.paths.values(): + for path in graph.paths: node_series = ",".join([traverse.node.id + traverse.strand for traverse in path.nodes]) gfa.add_line('\t'.join(['P', path.accession, node_series, ",".join(['*' for _ in path.nodes])])) for node in graph.nodes.values(): # in no particular order From c14eb81aacafd445e30edce2e378c954a34f1be7 Mon Sep 17 00:00:00 2001 From: Toshiyuki Yokoyama Date: Thu, 18 Jul 2019 17:28:10 +0900 Subject: [PATCH 10/14] #13: Update tests --- src/sort.py | 17 +++++++++-------- src/test.py | 38 ++++++++++++++++++++++++++++++++++++-- test/inversion.gfa | 3 ++- test/unresolved_repeat.gfa | 2 +- 4 files changed, 48 insertions(+), 12 deletions(-) diff --git a/src/sort.py b/src/sort.py index fdd3a09..d480447 100644 --- a/src/sort.py +++ b/src/sort.py @@ -10,7 +10,7 @@ class Profile: duplicate: bool = False def __repr__(self): - return "["+str(self.node.node) + str(self.paths)+"]" + return "["+str(self.node.node) + str(self.paths)+":"+str(self.candidate_paths) +"]" class DAGify: def __init__(self, paths: List[Path], nodes={}): @@ -55,7 +55,6 @@ def lcs(self, s1: List[Profile], s2: Path) -> List[Profile]: index = [] prev = set() candidate_path_flag = False -# print(s1., s2.nodes) while i > 0 and j > 0: if s1[i-1].node == s2.nodes[j-1]: @@ -79,7 +78,7 @@ def lcs(self, s1: List[Profile], s2: Path) -> List[Profile]: i -= 1 else: candidate_paths = {s2} - if s1[i]: + if i > n and s1[i]: candidate_paths |= s1[i].candidate_paths if s1[i-1]: candidate_paths |= s1[i-1].candidate_paths @@ -95,29 +94,33 @@ def lcs(self, s1: List[Profile], s2: Path) -> List[Profile]: i -= 1 while j > 0: - print(s2.nodes[j - 1], type(s2.nodes[j - 1])) +# print(s2.nodes[j - 1], type(s2.nodes[j - 1])) prev.add(s2.nodes[j - 1].node.id) index.append(Profile(s2.nodes[j - 1], [s2], {s2}, False)) j -= 1 index.reverse() + # print(index) return index - def to_slices(self, profile: List[Profile]): + def to_slices(self, profile: List[Profile]) -> List[Path]: factory_input = [] current_slice = Slice([]) current_paths = [] for prof in profile: paths = [x for x in prof.paths] + all_path_set = set([x for x in current_paths]) + # print(prof, current_slice, current_paths) if len(prof.paths) == len(prof.candidate_paths): if len(current_slice.nodes) > 0: + if prof.candidate_paths - all_path_set != set(): + current_slice.add_node(Node("", prof.candidate_paths - all_path_set)) factory_input.append(current_slice) factory_input.append(Slice([Node(prof.node.node.seq, paths, prof.node.node.id)])) current_slice = Slice([]) current_paths = [] else: - all_path_set = set([x for x in current_paths]) if set([x for x in prof.paths]) & all_path_set != set(): if len(current_slice.nodes) > 0: if prof.candidate_paths - all_path_set != set(): @@ -130,9 +133,7 @@ def to_slices(self, profile: List[Profile]): current_paths.extend(paths) return factory_input - def to_graph(self, profile: List[Profile]): factory_input = self.to_slices(profile) base_graph = SlicedGraph.load_from_slices(factory_input, self.paths) - # print(factory_input) return base_graph diff --git a/src/test.py b/src/test.py index 9dee219..db60bb3 100644 --- a/src/test.py +++ b/src/test.py @@ -149,7 +149,7 @@ def test_dagify_altpath(self): profile, rep_count = dagify.search_for_minimizing_replications() graph = dagify.to_graph(profile) self.assertEqual(rep_count, 1) - self.assertEqual(graph, [['CAAATAAG', {x, y}], ['A', {x}], ['G', {x, y}], ['A', {y}], ['T', {x, y}]]) + self.assertEqual(graph, [['CAAATAAG', {x, y}], ['A', {x}, '', {y}], ['G', {x, y}], ['A', {y}, '', {x}], ['T', {x, y}]]) def test_dagify_dup(self): gfa = GFA.load_from_gfa("../test/duplicate.gfa") @@ -158,9 +158,43 @@ def test_dagify_dup(self): profile, rep_count = dagify.search_for_minimizing_replications() graph = dagify.to_graph(profile) self.assertEqual(rep_count, 2) - self.assertEqual(graph, [['CAAATAAG', {x, y}], ['', {x}, 'A', {y}], ['G', {y}], ['A', {x, y}], ['G', {x, y}], ['T', {x, y}]]) + self.assertEqual(graph, [['CAAATAAG', {x, y}], ['', {x}, 'A', {y}], ['', {x}, 'G', {y}], ['A', {x, y}], ['G', {x, y}], ['T', {x, y}]]) + def test_unresolved_repreat(self): + gfa = GFA.load_from_gfa("../test/unresolved_repeat.gfa") + paths = gfa.to_paths + dagify = DAGify(paths) + profile, rep_count = dagify.search_for_minimizing_replications() + graph = dagify.to_graph(profile) + self.assertEqual([['CAAATAAG', {'x'}, 'T', {'y'}], ['A', {'y', 'x'}], ['G', {'x'}, 'C', {'y'}]], graph) + + @unittest.skip("Inversion is unsupported") + def test_inversion(self): + gfa = GFA.load_from_gfa("../test/inversion.gfa") + paths = gfa.to_paths + dagify = DAGify(paths) + profile, rep_count = dagify.search_for_minimizing_replications() + graph = dagify.to_graph(profile) + self.assertEqual(graph, []) + + @unittest.skip("Inversion is unsupported") + def test_nested_inversion(self): + gfa = GFA.load_from_gfa("../test/nested_inv.gfa") + paths = gfa.to_paths + dagify = DAGify(paths) + profile, rep_count = dagify.search_for_minimizing_replications() + graph = dagify.to_graph(profile) + self.assertEqual(graph, []) + + def test_simple_inversion(self): + gfa = GFA.load_from_gfa("../test/simple_inv.gfa") + paths = gfa.to_paths + dagify = DAGify(paths) + profile, rep_count = dagify.search_for_minimizing_replications() + graph = dagify.to_graph(profile) +# self.assertEqual(graph, [['CAAATAAG', {x,y}], ['A', {x,y}], ['G', {x, y}]]) + self.assertEqual(graph, [['CAAATAAG', {x,y}], ['A', {x}, 'A', {y}], ['G', {x, y}]]) class GFATest(unittest.TestCase): """ test class of gfa.py diff --git a/test/inversion.gfa b/test/inversion.gfa index d5a9e92..4572a75 100644 --- a/test/inversion.gfa +++ b/test/inversion.gfa @@ -7,4 +7,5 @@ L 1 + 3 - 0M S 2 A L 2 + 3 + 0M L 3 + 4 + 0M -S 3 G \ No newline at end of file +S 3 G +S 4 T \ No newline at end of file diff --git a/test/unresolved_repeat.gfa b/test/unresolved_repeat.gfa index 1e18616..6cd4e7c 100644 --- a/test/unresolved_repeat.gfa +++ b/test/unresolved_repeat.gfa @@ -1,6 +1,6 @@ H VN:Z:1.0 P x 1+,2+,3+ *,* -P y 4+,2+,5+ *,*,*,*,*,*,*,*,* +P y 4+,2+,5+ *,* S 1 CAAATAAG L 1 + 2 + 0M L 4 + 2 + 0M From f3e8cada93e9ebed94bc6b2a658ff7767152f34f Mon Sep 17 00:00:00 2001 From: Toshiyuki Yokoyama Date: Thu, 18 Jul 2019 17:56:41 +0900 Subject: [PATCH 11/14] #13: Update tests --- src/sort.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/sort.py b/src/sort.py index d480447..6ac7480 100644 --- a/src/sort.py +++ b/src/sort.py @@ -100,7 +100,7 @@ def lcs(self, s1: List[Profile], s2: Path) -> List[Profile]: j -= 1 index.reverse() - # print(index) + print(index) return index @@ -108,11 +108,16 @@ def to_slices(self, profile: List[Profile]) -> List[Path]: factory_input = [] current_slice = Slice([]) current_paths = [] - for prof in profile: + # print(profile) + for index, prof in enumerate(profile): paths = [x for x in prof.paths] all_path_set = set([x for x in current_paths]) # print(prof, current_slice, current_paths) - if len(prof.paths) == len(prof.candidate_paths): + candidate_paths_set = prof.candidate_paths + if index + 1 != len(profile): + candidate_paths_set |= profile[index+1].candidate_paths + + if len(prof.paths) == len(candidate_paths_set): if len(current_slice.nodes) > 0: if prof.candidate_paths - all_path_set != set(): current_slice.add_node(Node("", prof.candidate_paths - all_path_set)) @@ -131,6 +136,12 @@ def to_slices(self, profile: List[Profile]) -> List[Path]: else: current_slice.add_node(Node(prof.node.node.seq, paths, prof.node.node.id)) current_paths.extend(paths) + + if len(current_slice.nodes) > 0: + all_path_set = set([x for x in current_paths]) + if profile[-1].candidate_paths - all_path_set != set(): + current_slice.add_node(Node("", prof.candidate_paths - all_path_set)) + factory_input.append(current_slice) return factory_input def to_graph(self, profile: List[Profile]): From 8a549285106aa95a7524d56451400e85a173beaa Mon Sep 17 00:00:00 2001 From: Toshiyuki Yokoyama Date: Thu, 18 Jul 2019 20:36:06 +0900 Subject: [PATCH 12/14] #13: Fix tests --- src/sort.py | 4 +--- src/test.py | 10 +++++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/sort.py b/src/sort.py index 6ac7480..2636e2a 100644 --- a/src/sort.py +++ b/src/sort.py @@ -94,13 +94,11 @@ def lcs(self, s1: List[Profile], s2: Path) -> List[Profile]: i -= 1 while j > 0: -# print(s2.nodes[j - 1], type(s2.nodes[j - 1])) prev.add(s2.nodes[j - 1].node.id) index.append(Profile(s2.nodes[j - 1], [s2], {s2}, False)) j -= 1 index.reverse() - print(index) return index @@ -108,7 +106,7 @@ def to_slices(self, profile: List[Profile]) -> List[Path]: factory_input = [] current_slice = Slice([]) current_paths = [] - # print(profile) + for index, prof in enumerate(profile): paths = [x for x in prof.paths] all_path_set = set([x for x in current_paths]) diff --git a/src/test.py b/src/test.py index db60bb3..6476193 100644 --- a/src/test.py +++ b/src/test.py @@ -187,6 +187,7 @@ def test_nested_inversion(self): graph = dagify.to_graph(profile) self.assertEqual(graph, []) + @unittest.skip("Inversion is unsupported") def test_simple_inversion(self): gfa = GFA.load_from_gfa("../test/simple_inv.gfa") paths = gfa.to_paths @@ -196,14 +197,17 @@ def test_simple_inversion(self): # self.assertEqual(graph, [['CAAATAAG', {x,y}], ['A', {x,y}], ['G', {x, y}]]) self.assertEqual(graph, [['CAAATAAG', {x,y}], ['A', {x}, 'A', {y}], ['G', {x, y}]]) + +location_of_xg = "../test/xg" + + class GFATest(unittest.TestCase): """ test class of gfa.py """ - @unittest.expectedFailure + @unittest.skipIf(not os.path.isfile(location_of_xg), "XG binary is not found.") def test_gfa(self): self.maxDiff = None - location_of_xg = "../test/xg" graph = GFA.load_from_gfa("../test/test.gfa") graph.save_as_xg("../test/test.xg", location_of_xg) graph2 = GFA.load_from_xg("../test/test.xg", location_of_xg) @@ -254,11 +258,11 @@ def test_load_gfa_to_graph_2(self): @unittest.expectedFailure def test_load_gfa_via_xg(self): - location_of_xg = "../test/xg" graph = GFA.load_from_gfa("../test/test.gfa") graph.save_as_xg("../test/test.xg", location_of_xg) graph2 = GFA.load_from_xg("../test/test.xg", location_of_xg) graph = graph2.to_graph + graph = SlicedGraph.from_graph(graph) x = 'x' y = 'y' z = 'z' From aee4823434275af8b17970419f336604efb308e2 Mon Sep 17 00:00:00 2001 From: Toshiyuki Yokoyama Date: Thu, 18 Jul 2019 20:53:22 +0900 Subject: [PATCH 13/14] #13: Fix tests --- src/graph.py | 9 +++++++-- src/test.py | 3 +-- test/simple_inv.gfa | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/graph.py b/src/graph.py index 7657c29..31e22e9 100644 --- a/src/graph.py +++ b/src/graph.py @@ -187,10 +187,15 @@ def __init__(self, node: Node, strand: str = '+'): self.strand = strand # TODO: make this required def __repr__(self): - return self.node.seq + if self.strand == '+': + return self.node.seq + else: + complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} + return "".join(complement.get(base, base) for base in reversed(self.node.seq)) + def __eq__(self, other): - return self.node.id == other.node.id + return self.node.id == other.node.id and self.strand == other.strand class Graph: diff --git a/src/test.py b/src/test.py index 6476193..77e958a 100644 --- a/src/test.py +++ b/src/test.py @@ -194,8 +194,7 @@ def test_simple_inversion(self): dagify = DAGify(paths) profile, rep_count = dagify.search_for_minimizing_replications() graph = dagify.to_graph(profile) -# self.assertEqual(graph, [['CAAATAAG', {x,y}], ['A', {x,y}], ['G', {x, y}]]) - self.assertEqual(graph, [['CAAATAAG', {x,y}], ['A', {x}, 'A', {y}], ['G', {x, y}]]) + self.assertEqual(graph, [['CAAATAAG', {x,y}], ['AC', {x}, 'AC', {y}], ['G', {x, y}]]) location_of_xg = "../test/xg" diff --git a/test/simple_inv.gfa b/test/simple_inv.gfa index de62c6b..9d8d152 100644 --- a/test/simple_inv.gfa +++ b/test/simple_inv.gfa @@ -4,7 +4,7 @@ P y 1+,2-,3+ *,* S 1 CAAATAAG L 1 + 2 + 0M L 1 + 2 - 0M -S 2 A +S 2 AC L 2 - 3 + 0M L 2 + 3 + 0M S 3 G \ No newline at end of file From 0dc052855d6d1d34262bab3b2721f54e196fdff5 Mon Sep 17 00:00:00 2001 From: Josiah Seaman Date: Fri, 19 Jul 2019 10:35:59 +0100 Subject: [PATCH 14/14] Cleaned up sort imports for code review --- src/graph.py | 5 ++--- src/sort.py | 19 ++++++++++++------- src/test.py | 2 -- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/graph.py b/src/graph.py index 31e22e9..ffda621 100644 --- a/src/graph.py +++ b/src/graph.py @@ -251,9 +251,6 @@ def compute_slices(self): return SlicedGraph.from_graph(self) -from sort import DAGify - - class SlicedGraph(Graph): def __init__(self, paths): super(SlicedGraph, self).__init__(paths) @@ -295,6 +292,8 @@ def compute_slices(self): def compute_slices_by_dagify(self): """This method uses DAGify algorithm to compute slices.""" + from src.sort import DAGify # help avoid circular import + if not self.paths: return self dagify = DAGify(self.paths) diff --git a/src/sort.py b/src/sort.py index 2636e2a..946c42e 100644 --- a/src/sort.py +++ b/src/sort.py @@ -1,6 +1,9 @@ -from src.graph import * - +import sys import dataclasses +from typing import List + +from src.graph import NodeTraversal, Path, Slice, Node, SlicedGraph + @dataclasses.dataclass class Profile: @@ -13,11 +16,12 @@ def __repr__(self): return "["+str(self.node.node) + str(self.paths)+":"+str(self.candidate_paths) +"]" class DAGify: - def __init__(self, paths: List[Path], nodes={}): + def __init__(self, paths: List[Path], nodes=None): """ - :type paths: List[Path] """ + if nodes is None: + nodes = {} self.paths = paths self.nodes = nodes @@ -102,7 +106,7 @@ def lcs(self, s1: List[Profile], s2: Path) -> List[Profile]: return index - def to_slices(self, profile: List[Profile]) -> List[Path]: + def to_slices(self, profile: List[Profile]) -> List[Slice]: factory_input = [] current_slice = Slice([]) current_paths = [] @@ -138,11 +142,12 @@ def to_slices(self, profile: List[Profile]) -> List[Path]: if len(current_slice.nodes) > 0: all_path_set = set([x for x in current_paths]) if profile[-1].candidate_paths - all_path_set != set(): + print(prof) current_slice.add_node(Node("", prof.candidate_paths - all_path_set)) factory_input.append(current_slice) return factory_input - def to_graph(self, profile: List[Profile]): - factory_input = self.to_slices(profile) + def to_graph(self, profiles: List[Profile]): + factory_input = self.to_slices(profiles) base_graph = SlicedGraph.load_from_slices(factory_input, self.paths) return base_graph diff --git a/src/test.py b/src/test.py index 77e958a..ec30b6c 100644 --- a/src/test.py +++ b/src/test.py @@ -219,8 +219,6 @@ def test_load_gfa_to_graph(self): self.assertEqual(len(graph.nodes), 15) def test_gfa_to_sliced_graph(self): - #TODO: this is currently close but not quite there. - # Slices must be fully defined in SlicedGraph.compute_slices() graph, gfa = self.make_graph_from_gfa() slices = SlicedGraph.from_graph(graph) x = 'x'