Skip to content
This repository has been archived by the owner on Mar 20, 2020. It is now read-only.

Commit

Permalink
#3 new object notation done. merge_vertical working with sequence. Next: add Node.horizontal_merge()
Browse files Browse the repository at this point in the history
  • Loading branch information
josiahseaman committed Jun 20, 2019
1 parent fbd8a5b commit 981b780
Showing 1 changed file with 74 additions and 49 deletions.
123 changes: 74 additions & 49 deletions Graph_Summarization_Prototype.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": 115,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -113,33 +113,40 @@
" else:\n",
" print(\"Warn: comparing Slice and \", type(other), other)\n",
" return False\n",
" \n",
" def __iter__(self):\n",
" return iter(self.nodes)\n",
" def primary(self):\n",
" return max(self.nodes, key=len) # When they're the same size, take the other\n",
" biggest = primary # alias\n",
" def secondary(self):\n",
" biggest = self.primary()\n",
" return max((x for x in self.nodes if x != biggest), key=len) # When they're the same size, take the next one\n",
" def smallest(self):\n",
" return min(reversed(self.nodes), key=len) # when they're the same size it will take the last listed\n",
" biggest = self.primary()\n",
" return min((x for x in self.nodes if x != biggest), key=len) # when they're the same size it will take the last listed\n",
"\n",
"class Graph:\n",
" def __init__(self, cmd: List):\n",
" \"\"\"Factory for generating graphs from a representation\"\"\"\n",
" self.slices = []\n",
" if cmd[0] and isinstance(next(iter(cmd[0])), Node):\n",
" self.slices = cmd # doesn't need to be parsed\n",
" else:\n",
" if isinstance(cmd, str):\n",
" cmd = eval(cmd)\n",
" #print('c', cmd)\n",
" for sl in cmd:\n",
" current_slice = []\n",
"# if cmd[0] and isinstance(next(iter(cmd[0])), Node):\n",
"# self.slices = cmd # doesn't need to be parsed\n",
"# else:\n",
" if isinstance(cmd, str):\n",
" cmd = eval(cmd)\n",
" #print('c', cmd)\n",
" for sl in cmd:\n",
" current_slice = []\n",
" if isinstance(sl[0], Node): # already Nodes, don't need to build\n",
" current_slice = sl\n",
" else:\n",
" try:\n",
" for i in range(0, len(sl), 2):\n",
" current_slice.append(Node(sl[i], sl[i+1]))\n",
" except IndexError:\n",
" print(\"Expecting two terms: \", sl[i:i+2])\n",
" self.slices.append(Slice(current_slice))\n",
" raise IndexError(\"Expecting two terms: \", sl[0])#sl[i:i+2])\n",
"\n",
" self.slices.append(Slice(current_slice))\n",
" \n",
" def __repr__(self):\n",
" \"\"\"Warning: the representation strings are very sensitive to whitespace\"\"\"\n",
Expand All @@ -154,21 +161,28 @@
" \n",
" \n",
"#base_graph = [ [{1,2,3,4}], [{1,2,4},{3}], [{1,2,3,4}], [{1,2,4},{3}], [{1,2,3},{4}], [{1,2,3,4}] ]\n",
"factory_input = [['ACGT',{1,2,3,4}], ['C',{1,2,4},'T',{3}], ['GGA',{1,2,3,4}], \n",
" ['C',{1,2,4},'',{3}], ['AGTACG',{1,2,3},'CGTACT',{4}], ['TTG',{1,2,3,4}] ]\n",
"base_graph = [Slice([Node('ACGT', {1,2,3,4})]), \n",
" Slice([Node('C',{1,2,4}),Node('T', {3})]), \n",
" Slice([Node('GGA',{1,2,3,4})]), \n",
" Slice([Node('C',{1,2,4}),Node('', {3})]),\n",
" Slice([Node('AGTACG',{1,2,3}), Node('CGTACT',{4})]),\n",
" Slice([Node('TTG',{1,2,3,4})]) ]\n",
"factory_input = [['ACGT',{1,2,3,4}], \n",
" ['C',{1,2,4},'T',{3}], # SNP\n",
" ['GGA',{1,2,3,4}], # anchor\n",
" ['C',{1,2,4},'',{3}], # repeated from [1] SNP\n",
" ['AGTACG',{1,2,3},'CGTACT',{4}], # different membership from [3]\n",
" ['TTG',{1,2,3,4}], # anchor\n",
" ['A',{1,2}, 'C',{4,5},'T',{3}], # third allele\n",
" ['GG',{1,2}, 'TT',{3,4}], #equal size nodes\n",
" ['TATA',{1,2,3,4}] ] # anchor\n",
"# [Slice([Node('ACGT', {1,2,3,4})]), \n",
"# Slice([Node('C',{1,2,4}),Node('T', {3})]), \n",
"# Slice([Node('GGA',{1,2,3,4})]), \n",
"# Slice([Node('C',{1,2,4}),Node('', {3})]),\n",
"# Slice([Node('AGTACG',{1,2,3}), Node('CGTACT',{4})]),\n",
"# Slice([Node('TTG',{1,2,3,4})]) ]\n",
"\n",
"g = Graph(factory_input)\n",
"g\n",
"assert g == str(factory_input), ('\\n' + repr(g) + '\\n' + str(factory_input))\n",
"g_double = Graph(eval(str(g)))\n",
"str(g_double) == str(g) # WARN: could be order sensitive, don't worry if it fails\n",
"assert g_double == g\n",
"base_graph = Graph(factory_input)\n",
"base_graph\n",
"assert base_graph == str(factory_input), ('\\n' + repr(base_graph) + '\\n' + str(factory_input))\n",
"g_double = Graph(eval(str(base_graph)))\n",
"str(g_double) == str(base_graph) # WARN: could be order sensitive, don't worry if it fails\n",
"assert g_double == base_graph\n",
"assert g_double == factory_input\n",
"assert g_double == str(factory_input)"
]
Expand All @@ -180,6 +194,21 @@
"**Developers Note**: If you need sets with preserved ordering use Python 3.6 dictionaries instead of sets: `dict.fromkeys([67, 46, 55, 39, 94, 63, 34, 32, 57, 54, 67, 36, 63]).keys()`"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {},
"outputs": [],
"source": [
"def G(rep):\n",
" \"\"\"Short hand for Graph construction that returns a slice\"\"\"\n",
" if len(rep) > 1:\n",
" raise ValueError(\"Warning: only the first slice will be returned.\", rep)\n",
" return Graph(rep)[0]\n",
"with test.assertRaises(ValueError):\n",
" G([['C', {1, 2, 3, 4}],['T',{12,16}]])"
]
},
{
"attachments": {
"merge_vertical.jpg": {
Expand All @@ -199,13 +228,13 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 136,
"metadata": {},
"outputs": [],
"source": [
"def merge_vertical(one_slice: Slice) -> List[Slice]:\n",
" \"\"\"Merges the smallest node into the biggest node. Preserves bystander nodes as well.\"\"\"\n",
" assert len(one_slice) > 1\n",
" assert len(one_slice) > 1 and isinstance(one_slice, Slice), (type(one_slice), len(one_slice))\n",
" smallest_node = one_slice.smallest()\n",
" biggest_node = one_slice.primary()\n",
" merger = biggest_node.merge(smallest_node)\n",
Expand All @@ -214,23 +243,14 @@
" else: #Allows for third possibilities\n",
" return Graph([[merger, *one_slice.bystanders(smallest_node, biggest_node)]])\n",
"assert merge_vertical(base_graph[1]) == [['C', {1, 2, 3, 4}]]\n",
"merge_vertical(Graph([['A',{1,2,4}, 'C',{3}, 'T',{12,16}]])[0])\n",
"# assert merge_vertical([{1,2,4}, {3}, {12,16}]) == [[{1, 2, 3, 4}, {12, 16}]]\n",
"# assert merge_vertical([{1, 2}, {3, 4}]) == [[{1, 2, 3, 4}]]"
"assert merge_vertical(base_graph[6])== [['A', {1, 2, 3}, 'C', {4, 5}]]\n",
"# # when nodes are the same size which sequence is preserved is arbitrary \n",
"assert merge_vertical(base_graph[7]) == [['GG', {1, 2, 3, 4}]]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"base_graph[1], type(base_graph[1])"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 141,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -239,17 +259,22 @@
" \"\"\"Merges three slices together by dropping out the one variant node that is holding them apart. \n",
" Requires the surrounding anchor nodes to 100% match. Returns new subgraph.\"\"\"\n",
" merged = merge_vertical(target_slice)[0] # [0] selects the only slice present\n",
" left, middle, right = anchor_left.primary(), merged.primary(), anchor_right.primary()\n",
" if len(merged) > 1: # check to ensure there was not a third option\n",
" return [anchor_left, merged, anchor_right] # we were not able to collapse the threesome\n",
" elif anchor_left == anchor_right: # TODO: set equality, not same objects\n",
" if anchor_left == merged:\n",
" return [anchor_left] # all three are equivalent so you can just return one.\n",
" #TODO: include sequence penalties\n",
" elif left.paths == middle.paths == right.paths:\n",
"# step1 = left.horizontal_merge(middle)\n",
"# step2 = step1.horizontal_merge(right)\n",
" #print(left.seq, middle.seq, right.seq)\n",
" n = Node(''.join([left.seq, middle.seq, right.seq]), anchor_left.primary().paths)\n",
" return Graph([[n]]) # all three are equivalent so you can just return one.\n",
" #TODO: include sequence penalties\n",
" raise ValueError(\"All three sets must be equal in order to collapse the slices\")\n",
"assert merge_vertical_threesome(*base_graph[0:3]) == [[{1, 2, 3, 4}]]\n",
"with test.assertRaises(ValueError):\n",
"\n",
"assert merge_vertical_threesome(*base_graph[0:3]) == [['ACGTCGGA', {1, 2, 3, 4}]]\n",
"with test.assertRaises((ValueError, AssertionError)):\n",
" merge_vertical_threesome(*base_graph[1:4])\n",
"with test.assertRaises(ValueError):\n",
"with test.assertRaises((ValueError, AssertionError)):\n",
" merge_vertical_threesome([{1,2,3,4}], [{1,2,4},{3}], [{1}])"
]
},
Expand Down

0 comments on commit 981b780

Please sign in to comment.