diff --git a/Graph_Summarization_Prototype.ipynb b/Graph_Summarization_Prototype.ipynb index 78a775a..55cf41c 100644 --- a/Graph_Summarization_Prototype.ipynb +++ b/Graph_Summarization_Prototype.ipynb @@ -59,7 +59,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 115, "metadata": {}, "outputs": [], "source": [ @@ -113,33 +113,40 @@ " else:\n", " print(\"Warn: comparing Slice and \", type(other), other)\n", " return False\n", - " \n", + " def __iter__(self):\n", + " return iter(self.nodes)\n", " def primary(self):\n", " return max(self.nodes, key=len) # When they're the same size, take the other\n", + " biggest = primary # alias\n", " def secondary(self):\n", " biggest = self.primary()\n", " return max((x for x in self.nodes if x != biggest), key=len) # When they're the same size, take the next one\n", " def smallest(self):\n", - " return min(reversed(self.nodes), key=len) # when they're the same size it will take the last listed\n", + " biggest = self.primary()\n", + " return min((x for x in self.nodes if x != biggest), key=len) # when they're the same size it will take the last listed\n", "\n", "class Graph:\n", " def __init__(self, cmd: List):\n", " \"\"\"Factory for generating graphs from a representation\"\"\"\n", " self.slices = []\n", - " if cmd[0] and isinstance(next(iter(cmd[0])), Node):\n", - " self.slices = cmd # doesn't need to be parsed\n", - " else:\n", - " if isinstance(cmd, str):\n", - " cmd = eval(cmd)\n", - " #print('c', cmd)\n", - " for sl in cmd:\n", - " current_slice = []\n", + "# if cmd[0] and isinstance(next(iter(cmd[0])), Node):\n", + "# self.slices = cmd # doesn't need to be parsed\n", + "# else:\n", + " if isinstance(cmd, str):\n", + " cmd = eval(cmd)\n", + " #print('c', cmd)\n", + " for sl in cmd:\n", + " current_slice = []\n", + " if isinstance(sl[0], Node): # already Nodes, don't need to build\n", + " current_slice = sl\n", + " else:\n", " try:\n", " for i in range(0, len(sl), 2):\n", " current_slice.append(Node(sl[i], sl[i+1]))\n", " except IndexError:\n", - " print(\"Expecting two terms: \", sl[i:i+2])\n", - " self.slices.append(Slice(current_slice))\n", + " raise IndexError(\"Expecting two terms: \", sl[0])#sl[i:i+2])\n", + "\n", + " self.slices.append(Slice(current_slice))\n", " \n", " def __repr__(self):\n", " \"\"\"Warning: the representation strings are very sensitive to whitespace\"\"\"\n", @@ -154,21 +161,28 @@ " \n", " \n", "#base_graph = [ [{1,2,3,4}], [{1,2,4},{3}], [{1,2,3,4}], [{1,2,4},{3}], [{1,2,3},{4}], [{1,2,3,4}] ]\n", - "factory_input = [['ACGT',{1,2,3,4}], ['C',{1,2,4},'T',{3}], ['GGA',{1,2,3,4}], \n", - " ['C',{1,2,4},'',{3}], ['AGTACG',{1,2,3},'CGTACT',{4}], ['TTG',{1,2,3,4}] ]\n", - "base_graph = [Slice([Node('ACGT', {1,2,3,4})]), \n", - " Slice([Node('C',{1,2,4}),Node('T', {3})]), \n", - " Slice([Node('GGA',{1,2,3,4})]), \n", - " Slice([Node('C',{1,2,4}),Node('', {3})]),\n", - " Slice([Node('AGTACG',{1,2,3}), Node('CGTACT',{4})]),\n", - " Slice([Node('TTG',{1,2,3,4})]) ]\n", + "factory_input = [['ACGT',{1,2,3,4}], \n", + " ['C',{1,2,4},'T',{3}], # SNP\n", + " ['GGA',{1,2,3,4}], # anchor\n", + " ['C',{1,2,4},'',{3}], # repeated from [1] SNP\n", + " ['AGTACG',{1,2,3},'CGTACT',{4}], # different membership from [3]\n", + " ['TTG',{1,2,3,4}], # anchor\n", + " ['A',{1,2}, 'C',{4,5},'T',{3}], # third allele\n", + " ['GG',{1,2}, 'TT',{3,4}], #equal size nodes\n", + " ['TATA',{1,2,3,4}] ] # anchor\n", + "# [Slice([Node('ACGT', {1,2,3,4})]), \n", + "# Slice([Node('C',{1,2,4}),Node('T', {3})]), \n", + "# Slice([Node('GGA',{1,2,3,4})]), \n", + "# Slice([Node('C',{1,2,4}),Node('', {3})]),\n", + "# Slice([Node('AGTACG',{1,2,3}), Node('CGTACT',{4})]),\n", + "# Slice([Node('TTG',{1,2,3,4})]) ]\n", "\n", - "g = Graph(factory_input)\n", - "g\n", - "assert g == str(factory_input), ('\\n' + repr(g) + '\\n' + str(factory_input))\n", - "g_double = Graph(eval(str(g)))\n", - "str(g_double) == str(g) # WARN: could be order sensitive, don't worry if it fails\n", - "assert g_double == g\n", + "base_graph = Graph(factory_input)\n", + "base_graph\n", + "assert base_graph == str(factory_input), ('\\n' + repr(base_graph) + '\\n' + str(factory_input))\n", + "g_double = Graph(eval(str(base_graph)))\n", + "str(g_double) == str(base_graph) # WARN: could be order sensitive, don't worry if it fails\n", + "assert g_double == base_graph\n", "assert g_double == factory_input\n", "assert g_double == str(factory_input)" ] @@ -180,6 +194,21 @@ "**Developers Note**: If you need sets with preserved ordering use Python 3.6 dictionaries instead of sets: `dict.fromkeys([67, 46, 55, 39, 94, 63, 34, 32, 57, 54, 67, 36, 63]).keys()`" ] }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": {}, + "outputs": [], + "source": [ + "def G(rep):\n", + " \"\"\"Short hand for Graph construction that returns a slice\"\"\"\n", + " if len(rep) > 1:\n", + " raise ValueError(\"Warning: only the first slice will be returned.\", rep)\n", + " return Graph(rep)[0]\n", + "with test.assertRaises(ValueError):\n", + " G([['C', {1, 2, 3, 4}],['T',{12,16}]])" + ] + }, { "attachments": { "merge_vertical.jpg": { @@ -199,13 +228,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 136, "metadata": {}, "outputs": [], "source": [ "def merge_vertical(one_slice: Slice) -> List[Slice]:\n", " \"\"\"Merges the smallest node into the biggest node. Preserves bystander nodes as well.\"\"\"\n", - " assert len(one_slice) > 1\n", + " assert len(one_slice) > 1 and isinstance(one_slice, Slice), (type(one_slice), len(one_slice))\n", " smallest_node = one_slice.smallest()\n", " biggest_node = one_slice.primary()\n", " merger = biggest_node.merge(smallest_node)\n", @@ -214,23 +243,14 @@ " else: #Allows for third possibilities\n", " return Graph([[merger, *one_slice.bystanders(smallest_node, biggest_node)]])\n", "assert merge_vertical(base_graph[1]) == [['C', {1, 2, 3, 4}]]\n", - "merge_vertical(Graph([['A',{1,2,4}, 'C',{3}, 'T',{12,16}]])[0])\n", - "# assert merge_vertical([{1,2,4}, {3}, {12,16}]) == [[{1, 2, 3, 4}, {12, 16}]]\n", - "# assert merge_vertical([{1, 2}, {3, 4}]) == [[{1, 2, 3, 4}]]" + "assert merge_vertical(base_graph[6])== [['A', {1, 2, 3}, 'C', {4, 5}]]\n", + "# # when nodes are the same size which sequence is preserved is arbitrary \n", + "assert merge_vertical(base_graph[7]) == [['GG', {1, 2, 3, 4}]]" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "base_graph[1], type(base_graph[1])" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 141, "metadata": {}, "outputs": [], "source": [ @@ -239,17 +259,22 @@ " \"\"\"Merges three slices together by dropping out the one variant node that is holding them apart. \n", " Requires the surrounding anchor nodes to 100% match. Returns new subgraph.\"\"\"\n", " merged = merge_vertical(target_slice)[0] # [0] selects the only slice present\n", + " left, middle, right = anchor_left.primary(), merged.primary(), anchor_right.primary()\n", " if len(merged) > 1: # check to ensure there was not a third option\n", " return [anchor_left, merged, anchor_right] # we were not able to collapse the threesome\n", - " elif anchor_left == anchor_right: # TODO: set equality, not same objects\n", - " if anchor_left == merged:\n", - " return [anchor_left] # all three are equivalent so you can just return one.\n", - " #TODO: include sequence penalties\n", + " elif left.paths == middle.paths == right.paths:\n", + "# step1 = left.horizontal_merge(middle)\n", + "# step2 = step1.horizontal_merge(right)\n", + " #print(left.seq, middle.seq, right.seq)\n", + " n = Node(''.join([left.seq, middle.seq, right.seq]), anchor_left.primary().paths)\n", + " return Graph([[n]]) # all three are equivalent so you can just return one.\n", + " #TODO: include sequence penalties\n", " raise ValueError(\"All three sets must be equal in order to collapse the slices\")\n", - "assert merge_vertical_threesome(*base_graph[0:3]) == [[{1, 2, 3, 4}]]\n", - "with test.assertRaises(ValueError):\n", + "\n", + "assert merge_vertical_threesome(*base_graph[0:3]) == [['ACGTCGGA', {1, 2, 3, 4}]]\n", + "with test.assertRaises((ValueError, AssertionError)):\n", " merge_vertical_threesome(*base_graph[1:4])\n", - "with test.assertRaises(ValueError):\n", + "with test.assertRaises((ValueError, AssertionError)):\n", " merge_vertical_threesome([{1,2,3,4}], [{1,2,4},{3}], [{1}])" ] },