diff --git a/Graph_Summarization_Prototype.ipynb b/Graph_Summarization_Prototype.ipynb index 738361b..78a775a 100644 --- a/Graph_Summarization_Prototype.ipynb +++ b/Graph_Summarization_Prototype.ipynb @@ -45,38 +45,45 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 9, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "c [['ACGT', {1, 2, 3, 4}], ['C', {1, 2, 4}, 'T', {3}], ['GGA', {1, 2, 3, 4}], ['C', {1, 2, 4}, '', {3}], ['AGTACG', {1, 2, 3}, 'CGTACT', {4}], ['TTG', {1, 2, 3, 4}]]\n", - "c [['ACGT', [1, 2, 3, 4]], ['C', [1, 2, 4], 'T', [3]], ['GGA', [1, 2, 3, 4]], ['C', [1, 2, 4], '', [3]], ['AGTACG', [1, 2, 3], 'CGTACT', [4]], ['TTG', [1, 2, 3, 4]]]\n", - "c [['ACGT', [1, 2, 3, 4]], ['C', [1, 2, 4], 'T', [3]], ['GGA', [1, 2, 3, 4]], ['C', [1, 2, 4], '', [3]], ['AGTACG', [1, 2, 3], 'CGTACT', [4]], ['TTG', [1, 2, 3, 4]]]\n", - "c [['ACGT', [1, 2, 3, 4]], ['C', [1, 2, 4], 'T', [3]], ['GGA', [1, 2, 3, 4]], ['C', [1, 2, 4], '', [3]], ['AGTACG', [1, 2, 3], 'CGTACT', [4]], ['TTG', [1, 2, 3, 4]]]\n" - ] - } - ], + "outputs": [], "source": [ "import unittest \n", "test = unittest.TestCase() # just using it for assertRaises\n", "from typing import Callable, Iterator, Union, Optional, List, Iterable\n", "from collections import namedtuple\n", - "# %debug\n", - "\n", + "from itertools import zip_longest\n", + "# %debug" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ "#Node = namedtuple('Node', ['seq', 'paths'])\n", "class Node:\n", - " def __init__(self, seq: str, paths: List[int]):\n", + " def __init__(self, seq: str, paths: Iterable[int]):\n", " assert isinstance(seq, str), seq\n", " assert not isinstance(paths, str) and isinstance(paths, Iterable), paths\n", " self.seq = seq\n", - " self.paths = paths\n", + " self.paths = set(paths)\n", " def __len__(self):\n", " return len(self.paths)\n", " def __repr__(self):\n", - " return 
repr(self.seq) + ', ' + repr(sorted(list(self.paths)))\n", + " \"\"\"Paths representation is sorted because set ordering is not guaranteed.\"\"\"\n", + " return repr(self.seq) + \\\n", + " ', {' + ', '.join(str(i) for i in sorted(list(self.paths))) + '}' \n", + " def __eq__(self, other):\n", + " if not isinstance(other, Node):\n", + " print(\"Warn: comparing Node and \", type(other), other)\n", + " return False\n", + " return self.seq == other.seq and self.paths == other.paths\n", + " def __hash__(self):\n", + " return hash(self.seq)\n", + " \n", "\n", "def merge(self, smaller: Node) -> Node:\n", " m = Node(self.seq, self.paths.union(smaller.paths))\n", @@ -86,8 +93,8 @@ "\n", " \n", "class Slice:\n", - " def __init__(self, nodes: List[Node]):\n", - " self.nodes = nodes #[nodes] if isinstance(nodes, Node) else nodes\n", + " def __init__(self, nodes: Iterable[Node]):\n", + " self.nodes = set(nodes) \n", " def alternatives(self, main):\n", " return self.nodes.difference({main})\n", " def bystanders(self, first,second):\n", @@ -95,7 +102,17 @@ " def __len__(self):\n", " return len(self.nodes)\n", " def __repr__(self):\n", - " return self.nodes.__repr__()# '['+ ','.join(self.paths)+']'\n", + " #return '{' + ', '.join(str(i) for i in sorted(list(self.nodes))) + '}' \n", + " return list(self.nodes).__repr__()# '['+ ','.join(self.paths)+']'\n", + " def __eq__(self, other):\n", + " if isinstance(other, Slice):\n", + " #all(a==b for a,b in zip_longest(self.nodes,other.nodes)) # order dependent\n", + " if not self.nodes == other.nodes:\n", + " print(self.nodes, other.nodes, sep='\\n')\n", + " return self.nodes == other.nodes\n", + " else:\n", + " print(\"Warn: comparing Slice and \", type(other), other)\n", + " return False\n", " \n", " def primary(self):\n", " return max(self.nodes, key=len) # When they're the same size, take the other\n", @@ -109,12 +126,12 @@ " def __init__(self, cmd: List):\n", " \"\"\"Factory for generating graphs from a representation\"\"\"\n", " 
self.slices = []\n", - " if cmd[0] and isinstance(cmd[0][0], Node):\n", + " if cmd[0] and isinstance(next(iter(cmd[0])), Node):\n", " self.slices = cmd # doesn't need to be parsed\n", " else:\n", " if isinstance(cmd, str):\n", " cmd = eval(cmd)\n", - " print('c', cmd)\n", + " #print('c', cmd)\n", " for sl in cmd:\n", " current_slice = []\n", " try:\n", @@ -122,7 +139,7 @@ " current_slice.append(Node(sl[i], sl[i+1]))\n", " except IndexError:\n", " print(\"Expecting two terms: \", sl[i:i+2])\n", - " self.slices.append(current_slice)\n", + " self.slices.append(Slice(current_slice))\n", " \n", " def __repr__(self):\n", " \"\"\"Warning: the representation strings are very sensitive to whitespace\"\"\"\n", @@ -131,28 +148,36 @@ " return self.slices[i]\n", " def __eq__(self, representation):\n", " if isinstance(representation, Graph):\n", - " return str(self) == str(representation)\n", - " return str(self.slices) == str(Graph(representation).slices)\n", + " return all(slice_a == slice_b for slice_a, slice_b in zip_longest(self.slices, representation.slices))\n", + " return self == Graph(representation) # build a graph then compare it\n", " \n", " \n", " \n", "#base_graph = [ [{1,2,3,4}], [{1,2,4},{3}], [{1,2,3,4}], [{1,2,4},{3}], [{1,2,3},{4}], [{1,2,3,4}] ]\n", - "factory_input = [ ['ACGT',{1,2,3,4}], ['C',{1,2,4},'T',{3}], ['GGA',{1,2,3,4}], \n", + "factory_input = [['ACGT',{1,2,3,4}], ['C',{1,2,4},'T',{3}], ['GGA',{1,2,3,4}], \n", " ['C',{1,2,4},'',{3}], ['AGTACG',{1,2,3},'CGTACT',{4}], ['TTG',{1,2,3,4}] ]\n", - "base_graph = [Slice(Node('ACGT', {1,2,3,4})), \n", + "base_graph = [Slice([Node('ACGT', {1,2,3,4})]), \n", " Slice([Node('C',{1,2,4}),Node('T', {3})]), \n", " Slice([Node('GGA',{1,2,3,4})]), \n", " Slice([Node('C',{1,2,4}),Node('', {3})]),\n", " Slice([Node('AGTACG',{1,2,3}), Node('CGTACT',{4})]),\n", - " Slice(Node('TTG',{1,2,3,4})) ]\n", + " Slice([Node('TTG',{1,2,3,4})]) ]\n", "\n", "g = Graph(factory_input)\n", - "repr(g) == 
str([['ACGT',[1,2,3,4]],['C',[1,2,4],'T',[3]],['GGA',[1,2,3,4]],['C',[1,2,4],'',[3]],['AGTACG',[1,2,3],'CGTACT',[4]],['TTG',[1,2,3,4]]])\n", + "g\n", + "assert g == str(factory_input), ('\\n' + repr(g) + '\\n' + str(factory_input))\n", "g_double = Graph(eval(str(g)))\n", - "str(g_double) == str(g)\n", + "str(g_double) == str(g) # WARN: could be order sensitive, don't worry if it fails\n", "assert g_double == g\n", - "assert g_double == [['ACGT',[1,2,3,4]],['C',[1,2,4],'T',[3]],['GGA',[1,2,3,4]],['C',[1,2,4],'',[3]],['AGTACG',[1,2,3],'CGTACT',[4]],['TTG',[1,2,3,4]]]\n", - "assert g_double == \"[['ACGT',[1,2,3,4]],['C',[1,2,4],'T',[3]],['GGA',[1,2,3,4]],['C',[1,2,4],'',[3]],['AGTACG',[1,2,3],'CGTACT',[4]],['TTG',[1,2,3,4]]]\"" + "assert g_double == factory_input\n", + "assert g_double == str(factory_input)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Developer's Note**: If you need a set-like collection that preserves insertion order, use dictionary keys instead of a set (dict ordering is a CPython implementation detail in Python 3.6 and guaranteed by the language from Python 3.7): `dict.fromkeys([67, 46, 55, 39, 94, 63, 34, 32, 57, 54, 67, 36, 63]).keys()`" ] }, { @@ -174,20 +199,9 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[['C', [1, 2, 3, 4]]]" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "def merge_vertical(one_slice: Slice) -> List[Slice]:\n", " \"\"\"Merges the smallest node into the biggest node.
Preserves bystander nodes as well.\"\"\"\n", @@ -199,34 +213,24 @@ " return Graph([[merger]])\n", " else: #Allows for third possibilities\n", " return Graph([[merger, *one_slice.bystanders(smallest_node, biggest_node)]])\n", - "(merge_vertical(base_graph[1]))# == str([{1, 2, 3, 4}])\n", + "assert merge_vertical(base_graph[1]) == [['C', {1, 2, 3, 4}]]\n", + "merge_vertical(Graph([['A',{1,2,4}, 'C',{3}, 'T',{12,16}]])[0])\n", "# assert merge_vertical([{1,2,4}, {3}, {12,16}]) == [[{1, 2, 3, 4}, {12, 16}]]\n", "# assert merge_vertical([{1, 2}, {3, 4}]) == [[{1, 2, 3, 4}]]" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(['C', [1, 2, 4], 'T', [3]], __main__.Slice)" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "base_graph[1], type(base_graph[1])" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -273,7 +277,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -333,7 +337,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -373,20 +377,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[[{1, 2}, {3}]]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "def propagate_split(first_slice, second_slice):\n", " \"\"\"When one slice has all paths in one node, and the next slice has the paths split, there are two options.\n", @@ -426,20 +419,9 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[[{1, 2, 4}, {3}]]" - ] 
- }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "def use_merging(graph, merge_fn, start_index):\n", " \"\"\"Function that executes a given merge in the context of the whole graph then \n",