diff --git a/Graph_Summarization_Prototype.ipynb b/Graph_Summarization_Prototype.ipynb index 879e25d..4125e47 100644 --- a/Graph_Summarization_Prototype.ipynb +++ b/Graph_Summarization_Prototype.ipynb @@ -45,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 78, "metadata": {}, "outputs": [ { @@ -54,7 +54,7 @@ "6" ] }, - "execution_count": 1, + "execution_count": 78, "metadata": {}, "output_type": "execute_result" } @@ -88,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 79, "metadata": { "collapsed": true }, @@ -100,7 +100,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 80, "metadata": { "collapsed": true }, @@ -162,32 +162,88 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 81, "metadata": {}, "outputs": [], "source": [ - "def merge_horizontal(first_slice, second_slice, preserve_remainder=False):\n", + "def merge_horizontal(first_slice, second_slice):\n", " \"\"\"Merges two slices into one by lossy rounding of node membership.\"\"\"\n", " #TODO: Identify best matching nodes, currently assumes they are in order (this is a bad prototype assumption) \n", " #TODO: pick union to be the larger of the two atlernatives, assume majority when summarizing variation.\n", - " commonality = first_slice[0].intersection(second_slice[0])\n", - " if not len(commonality):\n", + " if not len(first_slice[0].intersection(second_slice[0])): # union is only valid if they share something\n", " raise ValueError((\"Intersection of\", first_slice[0], \"and\", second_slice[0], \"is empty!\"))\n", - " if not preserve_remainder:\n", - " alternative_agreement = first_slice[1].union(second_slice[1])\n", - " return [[commonality, alternative_agreement]]\n", - " else: # reducing 4 nodes (2 slices, 2 nodes) to three (1 slice)\n", - " alternative_agreement = first_slice[1].intersection(second_slice[1]) # don't union\n", - " # generate a third node that is the difference between the 
two, \n", - " # then it could be eaten later by merge_vertical\n", - " losses = first_slice[1].union(second_slice[1]).difference(alternative_agreement)\n", - " return [[commonality, alternative_agreement, losses]]\n", + " \n", + " commonality = first_slice[0].union(second_slice[0])\n", + " if len(first_slice) > 1 and len(second_slice) > 1:\n", + " alternative_agreement = first_slice[1].union(second_slice[1]).difference(commonality) # don't include majority\n", + " if alternative_agreement:\n", + " return [[commonality, alternative_agreement]]\n", + " return [[commonality]]\n", " #TODO: several other possible configurations of third alternatives that need to be considered\n", + " #History: There was a preserve_remainder option that went from 4 nodes to 3 but it was overly complicated, \n", + " # this became propagate_split()\n", "path_slip = [ [{1,2,3},{4}], [{1,2},{3,4}], [{1,2,3},{4}] ]\n", - "assert merge_horizontal(path_slip[0], path_slip[1]) == [[{1, 2}, {3, 4}]]\n", - "assert merge_horizontal(path_slip[1], path_slip[2]) == [[{1, 2}, {3, 4}]]\n", - "with test.assertRaises(IndexError):\n", - " merge_horizontal(base_graph[0], base_graph[1])" + "assert merge_horizontal(path_slip[0], path_slip[1]) == [[{1, 2, 3}, {4}]]\n", + "assert merge_horizontal(path_slip[1], path_slip[2]) == [[{1, 2, 3}, {4}]]\n", + "assert merge_horizontal(base_graph[0], base_graph[1]) == [[{1,2,3,4}]]\n", + "with test.assertRaises(ValueError):\n", + " merge_horizontal([{1,2,3}], [{4,5,6}])\n", + "#TODO: # single common path gets aggressively merged (not great)\n", + "assert merge_horizontal([{1,2,3}], [{3,4,5,6}]) == [[{1, 2, 3, 4, 5, 6}]] " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Propagate Split\n", + "Propagate Split was a move discovered while implementing merge_horizontal() in the case where one slice has all paths in one node. The difference between the two is whether node membership is taken as the minimum intersection (propagate_split) or the maximum union (merge_horizontal). 
Propagate Split is a useful complement to split_vertical() when using a haplotype to cut through neighboring sequence." + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[[{1, 2}, {3}]]" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def propagate_split(first_slice, second_slice):\n", + " \"\"\"When one slice has all paths in one node, and the next slice has the paths split, there are two options.\n", + " You can take the majority and combine all paths into one node, that's what merge_horizontal will do.\n", + " The other path is to use the split slice to split the majority and then merge horizontally, creating\n", + " 2 nodes where there was three. That's what this does.\"\"\"\n", + " #TODO: Identify best matching nodes, currently assumes they are in order (this is a bad prototype assumption) \n", + " #TODO: pick union to be the larger of the two atlernatives, assume majority when summarizing variation.\n", + " \n", + " if min(len(first_slice), len(second_slice)) > 1: # TODO: could guide by slice[0] instead and leave alternatives\n", + " raise ValueError((\"At least one slice needs to have no alternatives\", first_slice[0], \"and\", second_slice[0]))\n", + " if max(len(first_slice), len(second_slice)) <= 1: # Need to have a split to work with\n", + " raise ValueError((\"No alternative nodes detected\", first_slice[0], \"and\", second_slice[0]))\n", + " split_slice = max((first_slice, second_slice), key=len)\n", + " commonality = first_slice[0].intersection(second_slice[0])\n", + " #preserve ordering\n", + " remainder = split_slice[1].difference(commonality) #TODO penalize alternative node if not in both slices\n", + " #TODO: guide_slice[1:] third alternatives etc. 
iteratively remove commonality\n", + " #TODO: len 1 not all the same: propagate_split(*[[{1, 2, 3}], [{1, 2, 4}]])\n", + " return [[commonality, remainder]]\n", + "fracture = [[{1, 2, 3, 4}], [{1, 2, 4}, {3}], [{1, 2, 3, 4}]]\n", + "assert propagate_split(*fracture[:2]) == [[{1, 2, 4}, {3}]]\n", + "assert propagate_split(*fracture[1:]) == [[{1, 2, 4}, {3}]]\n", + "with test.assertRaises(ValueError):\n", + " propagate_split(*[[{1, 2, 4}, {3}], [{1, 2, 4}, {3}]])\n", + "with test.assertRaises(ValueError):\n", + " propagate_split(*[[{1, 2, 3}], [{1, 2, 4}]])\n", + "propagate_split(*[[{1, 2}, {3},{4}], [{1, 2, 3,4}]]) #Failure case" ] }, { @@ -208,7 +264,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ @@ -228,7 +284,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 83, "metadata": {}, "outputs": [], "source": [ @@ -260,9 +316,23 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 84, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "AssertionError", + "evalue": "split_vertical assumes the intervening slice has no alternative nodes", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAssertionError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 20\u001b[0m \u001b[1;32massert\u001b[0m \u001b[0muse_merging\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtwo\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmerge_vertical_threesome\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m{\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m2\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m3\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[1;36m4\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 21\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 22\u001b[1;33m \u001b[0mhap3\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0muse_merging\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtwo\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msplit_vertical\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# Test case where merging into 1 node with no alternatives\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 23\u001b[0m \u001b[0mhap3\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m\u001b[0m in \u001b[0;36muse_merging\u001b[1;34m(graph, merge_fn, start_index)\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[0mmerge_horizontal\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;36m2\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m split_vertical:3} [merge_fn]\n\u001b[1;32m----> 9\u001b[1;33m \u001b[0mnew_graph\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmerge_fn\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0mgraph\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mstart_index\u001b[0m\u001b[1;33m:\u001b[0m\u001b[0mstart_index\u001b[0m\u001b[1;33m+\u001b[0m\u001b[0msize\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 10\u001b[0m \u001b[0mnew_graph\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mgraph\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mstart_index\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0msize\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 11\u001b[0m \u001b[1;31m#TODO do more than one merge before building a new summary graph for performance\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m\u001b[0m in 
\u001b[0;36msplit_vertical\u001b[1;34m(N0, anchor, N2)\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mN0\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"and\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mN2\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"don't match!\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;31m# Match\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 9\u001b[1;33m \u001b[1;32massert\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0manchor\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"split_vertical assumes the intervening slice has no alternative nodes\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 10\u001b[0m \u001b[1;31m# Future TODO allow merging across alternative nodes by finding anchor node, subtracting commonality,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 11\u001b[0m \u001b[1;31m# then preserve SNP node -> complex logic for merging sequence in N0 and N2 for third haplotype\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mAssertionError\u001b[0m: split_vertical assumes the intervening slice has no alternative nodes" + ] + } + ], "source": [ "def use_merging(graph, merge_fn, start_index):\n", " \"\"\"Function that executes a given merge in the context of the whole graph then \n", @@ -271,6 +341,7 @@ " size = {merge_vertical_threesome: 3, # dictionary of functions = number of slices processed\n", " merge_vertical: 1,\n", " merge_horizontal: 2,\n", + " propagate_split: 2,\n", " split_vertical:3} [merge_fn]\n", " 
new_graph.extend(merge_fn(*graph[start_index:start_index+size]))\n", " new_graph.extend(graph[start_index + size:])\n", @@ -279,14 +350,14 @@ "base_graph = [ [{1,2,3,4}], [{1,2,4},{3}], [{1,2,3,4}], [{1,2,4},{3}], [{1,2,3},{4}], [{1,2,3,4}] ]\n", "one = use_merging(base_graph, split_vertical, 1)\n", "assert one == [[{1, 2, 3, 4}], [{1, 2, 4}, {3}], [{1, 2, 3}, {4}], [{1, 2, 3, 4}]]\n", - "two = use_merging(one, merge_horizontal, 1)\n", - "assert two == [[{1, 2, 3, 4}], [{1, 2}, {3, 4}], [{1, 2, 3, 4}]]\n", + "two = use_merging(one, merge_horizontal, 2)\n", + "assert two == [[{1, 2, 3, 4}], [{1, 2, 4}, {3}], [{1, 2, 3, 4}]]\n", "three = use_merging(two, merge_vertical, 1)\n", "assert three == [[{1, 2, 3, 4}], [{1, 2, 3, 4}], [{1, 2, 3, 4}]] #verifiying intermediate\n", "assert use_merging(two, merge_vertical_threesome, 0) == [[{1, 2, 3, 4}]]\n", "\n", - "# hap2 = use_merging(one, merge_horizontal, 2)\n", - "# hap2" + "hap3 = use_merging(two, split_vertical, 0) # Test case where merging into 1 node with no alternatives\n", + "hap3" ] }, {