diff --git a/src/gfa.py b/src/gfa.py index 661e626..738d7fb 100644 --- a/src/gfa.py +++ b/src/gfa.py @@ -108,13 +108,35 @@ def save_as_gfa(self, file: str): def from_graph(cls, graph: Graph): """Constructs the lines of a GFA file listing paths, then sequence nodes in arbitrary order.""" gfa = gfapy.Gfa() - for path in graph.paths.values(): + for path in graph.paths: node_series = ",".join([traverse.node.id + traverse.strand for traverse in path.nodes]) gfa.add_line('\t'.join(['P', path.accession, node_series, ",".join(['*' for _ in path.nodes])])) for node in graph.nodes.values(): # in no particular order gfa.add_line('\t'.join(['S', str(node.id), node.seq])) return cls(gfa) + @property + def to_paths(self) -> List[Path]: + node_hash = {} + for segment in self.gfa.segments: + node_id = segment.name + "+" + node = Node(segment.sequence, []) + node_hash[node_id] = node + + node_id = segment.name + "-" + node = Node(segment.sequence, []) + node_hash[node_id] = node + + paths = [] + for path in self.gfa.paths: + nodes = [] + for node in path.segment_names: + node_index = NodeTraversal(Node(node_hash[node.name + node.orient].seq, [], node.name), node.orient) + nodes.append(node_index) + paths.append(Path(path.name, nodes)) + + return paths + @property def to_graph(self): # Extract all paths into graph @@ -125,58 +147,10 @@ def to_graph(self): graph.append_node_to_path(node.name, node.orient, path.name) for segment in self.gfa.segments: graph.nodes[segment.name].seq = segment.sequence + graph.paths = self.to_paths return graph # IMPORTANT: It's not clear to Josiah how much of the below is necessary, so it's being left unmodified. - topological_sort_helper = TopologicalSort() - path_dict = defaultdict(list) - node_hash = {} - - # Extract all paths into graph - for path in self.gfa.paths: - for node in path.segment_names: - path_dict[node.name + node.orient].append(path.name) - for node_pair in pairwise(path.segment_names): - topological_sort_helper.add_edge( - node_pair[0].name + node_pair[0].orient, - node_pair[1].name + node_pair[1].orient) - - # Extract all nodes in the graph. - for segment in self.gfa.segments: - node_id = segment.name + "+" - node = Node(segment.sequence, path_dict[node_id]) - node_hash[node_id] = node - - node_id = segment.name + "-" - node = Node(segment.sequence, path_dict[node_id]) - node_hash[node_id] = node - - node_stack = topological_sort_helper.topologicalSort() - - # Cluster nodes as multiple slices according to the result of the topological sort. - factory_input = [] - current_slice = Slice([]) - for node in node_stack: - if len(path_dict[node]) == len(self.gfa.paths): - if len(current_slice.nodes) > 0: - factory_input.append(current_slice) - factory_input.append(Slice([node_hash[node]])) - current_slice = Slice([]) - else: - all_set = set() - for items in [x.paths for x in current_slice.nodes]: - all_set = all_set | items - if set(path_dict[node]) & all_set != set(): - if len(current_slice.nodes) > 0: - current_slice.add_node(Node("", set([x.name for x in self.gfa.paths]) - all_set)) - factory_input.append(current_slice) - current_slice = Slice([node_hash[node]]) - else: - current_slice.add_node(node_hash[node]) - - base_graph = Graph.load_from_slices(factory_input) - return base_graph - ''' class XGWrapper: diff --git a/src/graph.py b/src/graph.py index 1a9f21d..ffda621 100644 --- a/src/graph.py +++ b/src/graph.py @@ -125,9 +125,9 @@ class Path: was sequenced. A path visits a series of nodes and the ordered concatenation of the node sequences is the accession's genome. Create Paths first from accession names, then append them to Nodes to link together.""" - def __init__(self, accession: str): + def __init__(self, accession: str, nodes = []): self.accession = accession # one path per accessions - self.nodes = [] # List[NodeTraversal] + self.nodes = nodes # List[NodeTraversal] self.position_checkpoints = {} # TODO: currently not used def __getitem__(self, path_index): @@ -150,6 +150,9 @@ def append_node(self, node: Node, strand: str): node.paths.add(PathIndex(self, len(self.nodes)-1)) # already appended node return node + def name(self): + return self.accession + def to_gfa(self): return '\t'.join(['P', self.accession, "+,".join([x.node.name + x.strand for x in self.nodes]) + "+", ",".join(['*' for x in self.nodes])]) @@ -165,7 +168,7 @@ def __repr__(self): return repr(self.path.accession) def __eq__(self, other): - if self.path.accession == other.path.accession and self.index == other.index: + if self.path.accession == other.path.accession: # and self.index == other.index: return True else: return False @@ -174,7 +177,7 @@ def __lt__(self, other): return self.path.accession < other.path.accession def __hash__(self): - return hash(self.path.accession) * (self.index if self.index else 1) + return hash(self.path.accession) # * (self.index if self.index else 1) class NodeTraversal: @@ -184,7 +187,15 @@ def __init__(self, node: Node, strand: str = '+'): self.strand = strand # TODO: make this required def __repr__(self): - return self.node.seq + if self.strand == '+': + return self.node.seq + else: + complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} + return "".join(complement.get(base, base) for base in reversed(self.node.seq)) + + + def __eq__(self, other): + return self.node.id == other.node.id and self.strand == other.strand class Graph: @@ -266,7 +277,7 @@ def from_graph(graph): g = SlicedGraph([]) g.paths = graph.paths # shallow copy all relevant fields g.nodes = graph.nodes - g.compute_slices() + g.compute_slices_by_dagify() return g def compute_slices(self): @@ -279,6 +290,18 @@ def compute_slices(self): self.slices.append(Slice([node])) return self + def compute_slices_by_dagify(self): + """This method uses DAGify algorithm to compute slices.""" + from src.sort import DAGify # help avoid circular import + + if not self.paths: + return self + dagify = DAGify(self.paths) + profile = dagify.recursive_merge(0) + slices = dagify.to_slices(profile) + self.slices = slices + return self + @staticmethod def build(cmd): """This factory uses existing slice declarations to build a graph with Paths populated in the order @@ -289,6 +312,7 @@ def build(cmd): # preemptively grab all the path names from every odd list entry paths = {key for sl in cmd for i in range(0, len(sl), 2) for key in sl[i + 1]} graph = SlicedGraph(paths) + graph.slices = [] for sl in cmd: current_slice = [] if isinstance(sl, Slice): diff --git a/src/sort.py b/src/sort.py new file mode 100644 index 0000000..946c42e --- /dev/null +++ b/src/sort.py @@ -0,0 +1,153 @@ +import sys +import dataclasses +from typing import List + +from src.graph import NodeTraversal, Path, Slice, Node, SlicedGraph + + +@dataclasses.dataclass +class Profile: + node: NodeTraversal + paths: List[Path] + candidate_paths: set() + duplicate: bool = False + + def __repr__(self): + return "["+str(self.node.node) + str(self.paths)+":"+str(self.candidate_paths) +"]" + +class DAGify: + def __init__(self, paths: List[Path], nodes=None): + """ + :type paths: List[Path] + """ + if nodes is None: + nodes = {} + self.paths = paths + self.nodes = nodes + + def search_for_minimizing_replications(self) -> (List[Profile], int): + min_rep = sys.maxsize + profile = [] + for i, _ in enumerate(self.paths): + profile_candidate = self.recursive_merge(i) + if min_rep > len([x.duplicate for x in profile_candidate if x.duplicate]): + min_rep = len([x.duplicate for x in profile_candidate if x.duplicate]) + profile = profile_candidate + return profile, min_rep + + def recursive_merge(self, primary_path_index: int = 0) -> List[Profile]: + profile = [] + for node_index in self.paths[primary_path_index].nodes: + profile.append(Profile(node_index, [self.paths[primary_path_index]], {self.paths[primary_path_index]}, False)) + for i, path in enumerate(self.paths): + if i == primary_path_index: + continue + profile = self.lcs(profile, path) + return profile + + def lcs(self, s1: List[Profile], s2: Path) -> List[Profile]: + n, m = len(s1), len(s2.nodes) + dp = [[0] * (m+1) for _ in range(n+1)] + + for i in range(1, n + 1): + for j in range(1, m + 1): + if s1[i-1].node == s2.nodes[j-1]: + dp[i][j] = dp[i - 1][j - 1] + 1 + else: + dp[i][j] = max(dp[i - 1][j], dp[i][j - 1]) + i, j = n, m + index = [] + prev = set() + candidate_path_flag = False + + while i > 0 and j > 0: + if s1[i-1].node == s2.nodes[j-1]: + prev_paths = s1[i-1].paths + prev_paths.append(s2) + candidate_paths = s1[i-1].candidate_paths + candidate_paths.add(s2) + candidate_path_flag = True + + index.append(Profile(s1[i-1].node, prev_paths, candidate_paths, s1[i-1].node.node.id in prev)) + prev.add(s1[i-1].node.node.id) + i -= 1 + j -= 1 + elif dp[i-1][j] > dp[i][j-1]: + prev_paths = s1[i-1].paths + candidate_paths = s1[i-1].candidate_paths + if candidate_path_flag: + candidate_paths.add(s2) + index.append(Profile(s1[i-1].node, prev_paths, candidate_paths, s1[i-1].node.node.id in prev)) + prev.add(s1[i-1].node.node.id) + i -= 1 + else: + candidate_paths = {s2} + if i > n and s1[i]: + candidate_paths |= s1[i].candidate_paths + if s1[i-1]: + candidate_paths |= s1[i-1].candidate_paths + index.append(Profile(s2.nodes[j-1], [s2], candidate_paths, s2.nodes[j-1].node.id in prev)) + prev.add(s2.nodes[j-1].node.id) + j -= 1 + + while i > 0: + prev_paths = s1[i - 1].paths + prev_candidates = s1[i-1].candidate_paths + index.append(Profile(s1[i - 1].node, prev_paths, prev_candidates, s1[i - 1].node.node.id in prev)) + prev.add(s1[i - 1].node.node.id) + i -= 1 + + while j > 0: + prev.add(s2.nodes[j - 1].node.id) + index.append(Profile(s2.nodes[j - 1], [s2], {s2}, False)) + j -= 1 + + index.reverse() + + return index + + def to_slices(self, profile: List[Profile]) -> List[Slice]: + factory_input = [] + current_slice = Slice([]) + current_paths = [] + + for index, prof in enumerate(profile): + paths = [x for x in prof.paths] + all_path_set = set([x for x in current_paths]) + # print(prof, current_slice, current_paths) + candidate_paths_set = prof.candidate_paths + if index + 1 != len(profile): + candidate_paths_set |= profile[index+1].candidate_paths + + if len(prof.paths) == len(candidate_paths_set): + if len(current_slice.nodes) > 0: + if prof.candidate_paths - all_path_set != set(): + current_slice.add_node(Node("", prof.candidate_paths - all_path_set)) + factory_input.append(current_slice) + factory_input.append(Slice([Node(prof.node.node.seq, paths, prof.node.node.id)])) + current_slice = Slice([]) + current_paths = [] + else: + if set([x for x in prof.paths]) & all_path_set != set(): + if len(current_slice.nodes) > 0: + if prof.candidate_paths - all_path_set != set(): + current_slice.add_node(Node("", prof.candidate_paths - all_path_set)) + factory_input.append(current_slice) + current_slice = Slice([Node(prof.node.node.seq, paths, prof.node.node.id)]) + current_paths = paths + else: + current_slice.add_node(Node(prof.node.node.seq, paths, prof.node.node.id)) + current_paths.extend(paths) + + if len(current_slice.nodes) > 0: + all_path_set = set([x for x in current_paths]) + if profile[-1].candidate_paths - all_path_set != set(): + print(prof) + current_slice.add_node(Node("", prof.candidate_paths - all_path_set)) + factory_input.append(current_slice) + return factory_input + + def to_graph(self, profiles: List[Profile]): + factory_input = self.to_slices(profiles) + base_graph = SlicedGraph.load_from_slices(factory_input, self.paths) + return base_graph diff --git a/src/test.py b/src/test.py index c8bae51..ec30b6c 100644 --- a/src/test.py +++ b/src/test.py @@ -3,7 +3,7 @@ from src.gfa import GFA from src.graph import Graph, Slice, Node, NoAnchorError, PathOverlapError, NoOverlapError, NodeMissingError, \ Path, SlicedGraph - +from src.sort import DAGify def G(rep): """Short hand for Graph construction that returns a slice""" @@ -107,16 +107,106 @@ def pf(wd, path): # Define several test example directories PATH_TO_TEST_DATA = pf(WD, "test/") +x,y,z,a = 'x', 'y', 'z', 'a' + +class DAGifyTest(unittest.TestCase): + """ test class of sort.py + """ + + + def test_dagify(self): + gfa = GFA.load_from_gfa("../test/test.gfa") + paths = gfa.to_paths + dagify = DAGify(paths) + profile = dagify.recursive_merge(0) + graph = dagify.to_graph(profile) +# x, y, z = graph.paths['x'], graph.paths['y'], graph.paths['z'] + + self.assertEqual([['CAAATAAG', {x,y,z}], ['A', {y,z}, 'G', {x}], ['C', {x,y,z}], ['TTG', {x,y,z}], ['A', {z}, 'G', {x,y}], ['AAATTTTCTGGAGTTCTAT', {x,y,z}], ['T', {x,y,z}], ['ATAT', {x,y,z}], ['T', {x,y,z}], ['CCAACTCTCTG', {x,y,z}]], graph) + + def test_dagify2(self): + gfa = GFA.load_from_gfa("../test/test2.gfa") + paths = gfa.to_paths + dagify = DAGify(paths) + profile = dagify.recursive_merge(0) + graph = dagify.to_graph(profile) + x,y,z,a = 'x', 'y', 'z', 'a' + self.assertEqual([['CAAATAAG', {x, y, z}], ['G', {x}, 'A', {y, z}], ['C', {x, y}, 'T', {z}], ['TTG', {x, y, z}], ['G', {x, y}, 'A', {a, z}], ['AAATTTTCTGGAGTTCTAT', {a, x, y, z}], ['A', {a, z}, 'T', {x, y}], ['ATAT', {x, y, z}], ['T', {x, y, z}], ['CCAACTCTCTG', {x, y, z}]], graph) + + def test_dagify3(self): + gfa = GFA.load_from_gfa("../test/test3.gfa") + paths = gfa.to_paths + dagify = DAGify(paths) + profile, rep_count = dagify.search_for_minimizing_replications() + graph = dagify.to_graph(profile) + self.assertEqual(rep_count, 1) + self.assertEqual(graph, [['CAAATAAG', {x, y}], ['CCAACTCTCTG', {y}, 'G', {x}], ['C', {x, y}], ['TTG', {x, y}], ['G', {x, y}], ['AAATTTTCTGGAGTTCTAT', {x, y}], ['T', {x, y}], ['ATAT', {x, y}], ['T', {x, y}], ['CCAACTCTCTG', {x, y}]]) + + def test_dagify_altpath(self): + gfa = GFA.load_from_gfa("../test/alternate_paths.gfa") + paths = gfa.to_paths + dagify = DAGify(paths) + profile, rep_count = dagify.search_for_minimizing_replications() + graph = dagify.to_graph(profile) + self.assertEqual(rep_count, 1) + self.assertEqual(graph, [['CAAATAAG', {x, y}], ['A', {x}, '', {y}], ['G', {x, y}], ['A', {y}, '', {x}], ['T', {x, y}]]) + + def test_dagify_dup(self): + gfa = GFA.load_from_gfa("../test/duplicate.gfa") + paths = gfa.to_paths + dagify = DAGify(paths) + profile, rep_count = dagify.search_for_minimizing_replications() + graph = dagify.to_graph(profile) + self.assertEqual(rep_count, 2) + self.assertEqual(graph, [['CAAATAAG', {x, y}], ['', {x}, 'A', {y}], ['', {x}, 'G', {y}], ['A', {x, y}], ['G', {x, y}], ['T', {x, y}]]) + + + def test_unresolved_repreat(self): + gfa = GFA.load_from_gfa("../test/unresolved_repeat.gfa") + paths = gfa.to_paths + dagify = DAGify(paths) + profile, rep_count = dagify.search_for_minimizing_replications() + graph = dagify.to_graph(profile) + self.assertEqual([['CAAATAAG', {'x'}, 'T', {'y'}], ['A', {'y', 'x'}], ['G', {'x'}, 'C', {'y'}]], graph) + + @unittest.skip("Inversion is unsupported") + def test_inversion(self): + gfa = GFA.load_from_gfa("../test/inversion.gfa") + paths = gfa.to_paths + dagify = DAGify(paths) + profile, rep_count = dagify.search_for_minimizing_replications() + graph = dagify.to_graph(profile) + self.assertEqual(graph, []) + + @unittest.skip("Inversion is unsupported") + def test_nested_inversion(self): + gfa = GFA.load_from_gfa("../test/nested_inv.gfa") + paths = gfa.to_paths + dagify = DAGify(paths) + profile, rep_count = dagify.search_for_minimizing_replications() + graph = dagify.to_graph(profile) + self.assertEqual(graph, []) + + @unittest.skip("Inversion is unsupported") + def test_simple_inversion(self): + gfa = GFA.load_from_gfa("../test/simple_inv.gfa") + paths = gfa.to_paths + dagify = DAGify(paths) + profile, rep_count = dagify.search_for_minimizing_replications() + graph = dagify.to_graph(profile) + self.assertEqual(graph, [['CAAATAAG', {x,y}], ['AC', {x}, 'AC', {y}], ['G', {x, y}]]) + + +location_of_xg = "../test/xg" class GFATest(unittest.TestCase): """ test class of gfa.py """ - @unittest.expectedFailure + @unittest.skipIf(not os.path.isfile(location_of_xg), "XG binary is not found.") def test_gfa(self): self.maxDiff = None - location_of_xg = "../test/xg" graph = GFA.load_from_gfa("../test/test.gfa") graph.save_as_xg("../test/test.xg", location_of_xg) graph2 = GFA.load_from_xg("../test/test.xg", location_of_xg) @@ -129,6 +219,15 @@ def test_load_gfa_to_graph(self): self.assertEqual(len(graph.nodes), 15) def test_gfa_to_sliced_graph(self): + graph, gfa = self.make_graph_from_gfa() + slices = SlicedGraph.from_graph(graph) + x = 'x' + y = 'y' + z = 'z' + print(slices) + self.assertEqual(slices, [['CAAATAAG', {x, y, z}], ['A', {y, z}, 'G', {x}], ['C', {x, y, z}], ['TTG', {x, y, z}], ['A', {z}, 'G', {x, y}], ['AAATTTTCTGGAGTTCTAT', {x, y, z}], ['T', {x, y, z}], ['ATAT', {x, y, z}], ['T', {x, y, z}], ['CCAACTCTCTG', {x, y, z}]]) + + def test_gfa_to_sliced_graph_via_dagify(self): #TODO: this is currently close but not quite there. # Slices must be fully defined in SlicedGraph.compute_slices() graph, gfa = self.make_graph_from_gfa() @@ -156,11 +255,11 @@ def test_load_gfa_to_graph_2(self): @unittest.expectedFailure def test_load_gfa_via_xg(self): - location_of_xg = "../test/xg" graph = GFA.load_from_gfa("../test/test.gfa") graph.save_as_xg("../test/test.xg", location_of_xg) graph2 = GFA.load_from_xg("../test/test.xg", location_of_xg) graph = graph2.to_graph + graph = SlicedGraph.from_graph(graph) x = 'x' y = 'y' z = 'z' diff --git a/test/alternately_paths.gfa b/test/alternate_paths.gfa similarity index 100% rename from test/alternately_paths.gfa rename to test/alternate_paths.gfa diff --git a/test/inversion.gfa b/test/inversion.gfa index d5a9e92..4572a75 100644 --- a/test/inversion.gfa +++ b/test/inversion.gfa @@ -7,4 +7,5 @@ L 1 + 3 - 0M S 2 A L 2 + 3 + 0M L 3 + 4 + 0M -S 3 G \ No newline at end of file +S 3 G +S 4 T \ No newline at end of file diff --git a/test/simple_inv.gfa b/test/simple_inv.gfa index de62c6b..9d8d152 100644 --- a/test/simple_inv.gfa +++ b/test/simple_inv.gfa @@ -4,7 +4,7 @@ P y 1+,2-,3+ *,* S 1 CAAATAAG L 1 + 2 + 0M L 1 + 2 - 0M -S 2 A +S 2 AC L 2 - 3 + 0M L 2 + 3 + 0M S 3 G \ No newline at end of file diff --git a/test/test2.gfa b/test/test2.gfa index 52164c1..3d206ef 100644 --- a/test/test2.gfa +++ b/test/test2.gfa @@ -2,7 +2,7 @@ H VN:Z:1.0 P x 1+,3+,5+,6+,8+,9+,11+,12+,14+,15+ *,*,*,*,*,*,*,*,* P y 1+,2+,5+,6+,8+,9+,11+,12+,14+,15+ *,*,*,*,*,*,*,*,* P z 1+,2+,4+,6+,7+,9+,10+,12+,14+,15+ *,*,*,*,*,*,*,*,* -P a 12+,13+,15+ *,* +P a 7+,9+,10+ *,* S 1 CAAATAAG L 1 + 2 + 0M L 1 + 3 + 0M diff --git a/test/test3.gfa b/test/test3.gfa new file mode 100644 index 0000000..705218b --- /dev/null +++ b/test/test3.gfa @@ -0,0 +1,38 @@ +H VN:Z:1.0 +P x 1+,3+,5+,6+,8+,9+,11+,12+,14+,15+ *,*,*,*,*,*,*,*,* +P y 1+,15+,5+,6+,8+,9+,11+,12+,14+,15+ *,*,*,*,*,*,*,*,* +S 1 CAAATAAG +L 1 + 15 + 0M +L 1 + 3 + 0M +S 2 A +L 2 + 4 + 0M +L 15 + 5 + 0M +S 3 G +L 3 + 4 + 0M +L 3 + 5 + 0M +S 4 T +L 4 + 6 + 0M +S 5 C +L 5 + 6 + 0M +S 6 TTG +L 6 + 7 + 0M +L 6 + 8 + 0M +S 7 A +L 7 + 9 + 0M +S 8 G +L 8 + 9 + 0M +S 9 AAATTTTCTGGAGTTCTAT +L 9 + 10 + 0M +L 9 + 11 + 0M +S 10 A +L 10 + 12 + 0M +S 11 T +L 11 + 12 + 0M +S 12 ATAT +L 12 + 13 + 0M +L 12 + 14 + 0M +S 13 A +L 13 + 15 + 0M +S 14 T +L 14 + 15 + 0M +S 15 CCAACTCTCTG diff --git a/test/unresolved_repeat.gfa b/test/unresolved_repeat.gfa index 1e18616..6cd4e7c 100644 --- a/test/unresolved_repeat.gfa +++ b/test/unresolved_repeat.gfa @@ -1,6 +1,6 @@ H VN:Z:1.0 P x 1+,2+,3+ *,* -P y 4+,2+,5+ *,*,*,*,*,*,*,*,* +P y 4+,2+,5+ *,* S 1 CAAATAAG L 1 + 2 + 0M L 4 + 2 + 0M