Skip to content
This repository has been archived by the owner on Mar 20, 2020. It is now read-only.

Commit

Permalink
#3 propagate_split() WIP thinking about third alternatives.
Browse files Browse the repository at this point in the history
  • Loading branch information
josiahseaman committed Jun 11, 2019
1 parent d5efbbf commit 081629f
Showing 1 changed file with 100 additions and 29 deletions.
129 changes: 100 additions & 29 deletions Graph_Summarization_Prototype.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 78,
"metadata": {},
"outputs": [
{
Expand All @@ -54,7 +54,7 @@
"6"
]
},
"execution_count": 1,
"execution_count": 78,
"metadata": {},
"output_type": "execute_result"
}
Expand Down Expand Up @@ -88,7 +88,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 79,
"metadata": {
"collapsed": true
},
Expand All @@ -100,7 +100,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 80,
"metadata": {
"collapsed": true
},
Expand Down Expand Up @@ -162,32 +162,88 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
"def merge_horizontal(first_slice, second_slice, preserve_remainder=False):\n",
"def merge_horizontal(first_slice, second_slice):\n",
" \"\"\"Merges two slices into one by lossy rounding of node membership.\"\"\"\n",
" #TODO: Identify best matching nodes, currently assumes they are in order (this is a bad prototype assumption) \n",
" #TODO: pick union to be the larger of the two alternatives, assume majority when summarizing variation.\n",
" commonality = first_slice[0].intersection(second_slice[0])\n",
" if not len(commonality):\n",
" if not len(first_slice[0].intersection(second_slice[0])): # union is only valid if they share something\n",
" raise ValueError((\"Intersection of\", first_slice[0], \"and\", second_slice[0], \"is empty!\"))\n",
" if not preserve_remainder:\n",
" alternative_agreement = first_slice[1].union(second_slice[1])\n",
" return [[commonality, alternative_agreement]]\n",
" else: # reducing 4 nodes (2 slices, 2 nodes) to three (1 slice)\n",
" alternative_agreement = first_slice[1].intersection(second_slice[1]) # don't union\n",
" # generate a third node that is the difference between the two, \n",
" # then it could be eaten later by merge_vertical\n",
" losses = first_slice[1].union(second_slice[1]).difference(alternative_agreement)\n",
" return [[commonality, alternative_agreement, losses]]\n",
" \n",
" commonality = first_slice[0].union(second_slice[0])\n",
" if len(first_slice) > 1 and len(second_slice) > 1:\n",
" alternative_agreement = first_slice[1].union(second_slice[1]).difference(commonality) # don't include majority\n",
" if alternative_agreement:\n",
" return [[commonality, alternative_agreement]]\n",
" return [[commonality]]\n",
" #TODO: several other possible configurations of third alternatives that need to be considered\n",
" #History: There was a preserve_remainder option that went from 4 nodes to 3 but it was overly complicated, \n",
" # this became propagate_split()\n",
"path_slip = [ [{1,2,3},{4}], [{1,2},{3,4}], [{1,2,3},{4}] ]\n",
"assert merge_horizontal(path_slip[0], path_slip[1]) == [[{1, 2}, {3, 4}]]\n",
"assert merge_horizontal(path_slip[1], path_slip[2]) == [[{1, 2}, {3, 4}]]\n",
"with test.assertRaises(IndexError):\n",
" merge_horizontal(base_graph[0], base_graph[1])"
"assert merge_horizontal(path_slip[0], path_slip[1]) == [[{1, 2, 3}, {4}]]\n",
"assert merge_horizontal(path_slip[1], path_slip[2]) == [[{1, 2, 3}, {4}]]\n",
"assert merge_horizontal(base_graph[0], base_graph[1]) == [[{1,2,3,4}]]\n",
"with test.assertRaises(ValueError):\n",
" merge_horizontal([{1,2,3}], [{4,5,6}])\n",
"#TODO: # single common path gets aggressively merged (not great)\n",
"assert merge_horizontal([{1,2,3}], [{3,4,5,6}]) == [[{1, 2, 3, 4, 5, 6}]] "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Propagate Split\n",
"Propagate Split was a move discovered while implementing merge_horizontal() in the case where one slice has all paths in one node. The difference between the two is whether node membership is resolved by minimum intersection (propagate_split) or maximum union (merge_horizontal). Propagate Split is a useful complement to split_vertical() when using a haplotype to cut through neighboring sequence."
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[[{1, 2}, {3}]]"
]
},
"execution_count": 96,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def propagate_split(first_slice, second_slice):\n",
" \"\"\"When one slice has all paths in one node, and the next slice has the paths split, there are two options.\n",
" You can take the majority and combine all paths into one node, that's what merge_horizontal will do.\n",
" The other path is to use the split slice to split the majority and then merge horizontally, creating\n",
" 2 nodes where there were three. That's what this does.\"\"\"\n",
" #TODO: Identify best matching nodes, currently assumes they are in order (this is a bad prototype assumption) \n",
" #TODO: pick union to be the larger of the two alternatives, assume majority when summarizing variation.\n",
" \n",
" if min(len(first_slice), len(second_slice)) > 1: # TODO: could guide by slice[0] instead and leave alternatives\n",
" raise ValueError((\"At least one slice needs to have no alternatives\", first_slice[0], \"and\", second_slice[0]))\n",
" if max(len(first_slice), len(second_slice)) <= 1: # Need to have a split to work with\n",
" raise ValueError((\"No alternative nodes detected\", first_slice[0], \"and\", second_slice[0]))\n",
" split_slice = max((first_slice, second_slice), key=len)\n",
" commonality = first_slice[0].intersection(second_slice[0])\n",
" #preserve ordering\n",
" remainder = split_slice[1].difference(commonality) #TODO penalize alternative node if not in both slices\n",
" #TODO: guide_slice[1:] third alternatives etc. iteratively remove commonality\n",
" #TODO: len 1 not all the same: propagate_split(*[[{1, 2, 3}], [{1, 2, 4}]])\n",
" return [[commonality, remainder]]\n",
"fracture = [[{1, 2, 3, 4}], [{1, 2, 4}, {3}], [{1, 2, 3, 4}]]\n",
"assert propagate_split(*fracture[:2]) == [[{1, 2, 4}, {3}]]\n",
"assert propagate_split(*fracture[1:]) == [[{1, 2, 4}, {3}]]\n",
"with test.assertRaises(ValueError):\n",
" propagate_split(*[[{1, 2, 4}, {3}], [{1, 2, 4}, {3}]])\n",
"with test.assertRaises(ValueError):\n",
" propagate_split(*[[{1, 2, 3}], [{1, 2, 4}]])\n",
"propagate_split(*[[{1, 2}, {3},{4}], [{1, 2, 3,4}]]) #Failure case"
]
},
{
Expand All @@ -208,7 +264,7 @@
},
{
"cell_type": "code",
"execution_count": 58,
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -228,7 +284,7 @@
},
{
"cell_type": "code",
"execution_count": 39,
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -260,9 +316,23 @@
},
{
"cell_type": "code",
"execution_count": 63,
"execution_count": 84,
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "AssertionError",
"evalue": "split_vertical assumes the intervening slice has no alternative nodes",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mAssertionError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-84-3fa46b7eb133>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 20\u001b[0m \u001b[1;32massert\u001b[0m \u001b[0muse_merging\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtwo\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmerge_vertical_threesome\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m{\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m2\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m3\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m4\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 21\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 22\u001b[1;33m \u001b[0mhap3\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0muse_merging\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtwo\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msplit_vertical\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# Test case where merging into 1 node with no alternatives\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 23\u001b[0m \u001b[0mhap3\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m<ipython-input-84-3fa46b7eb133>\u001b[0m in \u001b[0;36muse_merging\u001b[1;34m(graph, merge_fn, start_index)\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[0mmerge_horizontal\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;36m2\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m split_vertical:3} [merge_fn]\n\u001b[1;32m----> 9\u001b[1;33m \u001b[0mnew_graph\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmerge_fn\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0mgraph\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mstart_index\u001b[0m\u001b[1;33m:\u001b[0m\u001b[0mstart_index\u001b[0m\u001b[1;33m+\u001b[0m\u001b[0msize\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 10\u001b[0m \u001b[0mnew_graph\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mgraph\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mstart_index\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0msize\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 11\u001b[0m \u001b[1;31m#TODO do more than one merge before building a new summary graph for performance\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m<ipython-input-80-827fa02bb056>\u001b[0m in \u001b[0;36msplit_vertical\u001b[1;34m(N0, anchor, N2)\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mN0\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"and\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mN2\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"don't match!\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;31m# Match\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 9\u001b[1;33m \u001b[1;32massert\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0manchor\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"split_vertical assumes the intervening slice has no alternative nodes\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 10\u001b[0m \u001b[1;31m# Future TODO allow merging across alternative nodes by finding anchor node, subtracting commonality,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 11\u001b[0m \u001b[1;31m# then preserve SNP node -> complex logic for merging sequence in N0 and N2 for third haplotype\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mAssertionError\u001b[0m: split_vertical assumes the intervening slice has no alternative nodes"
]
}
],
"source": [
"def use_merging(graph, merge_fn, start_index):\n",
" \"\"\"Function that executes a given merge in the context of the whole graph then \n",
Expand All @@ -271,6 +341,7 @@
" size = {merge_vertical_threesome: 3, # dictionary of functions = number of slices processed\n",
" merge_vertical: 1,\n",
" merge_horizontal: 2,\n",
" propagate_split: 2,\n",
" split_vertical:3} [merge_fn]\n",
" new_graph.extend(merge_fn(*graph[start_index:start_index+size]))\n",
" new_graph.extend(graph[start_index + size:])\n",
Expand All @@ -279,14 +350,14 @@
"base_graph = [ [{1,2,3,4}], [{1,2,4},{3}], [{1,2,3,4}], [{1,2,4},{3}], [{1,2,3},{4}], [{1,2,3,4}] ]\n",
"one = use_merging(base_graph, split_vertical, 1)\n",
"assert one == [[{1, 2, 3, 4}], [{1, 2, 4}, {3}], [{1, 2, 3}, {4}], [{1, 2, 3, 4}]]\n",
"two = use_merging(one, merge_horizontal, 1)\n",
"assert two == [[{1, 2, 3, 4}], [{1, 2}, {3, 4}], [{1, 2, 3, 4}]]\n",
"two = use_merging(one, merge_horizontal, 2)\n",
"assert two == [[{1, 2, 3, 4}], [{1, 2, 4}, {3}], [{1, 2, 3, 4}]]\n",
"three = use_merging(two, merge_vertical, 1)\n",
"assert three == [[{1, 2, 3, 4}], [{1, 2, 3, 4}], [{1, 2, 3, 4}]] #verifiying intermediate\n",
"assert use_merging(two, merge_vertical_threesome, 0) == [[{1, 2, 3, 4}]]\n",
"\n",
"# hap2 = use_merging(one, merge_horizontal, 2)\n",
"# hap2"
"hap3 = use_merging(two, split_vertical, 0) # Test case where merging into 1 node with no alternatives\n",
"hap3"
]
},
{
Expand Down

0 comments on commit 081629f

Please sign in to comment.