#3 Graph object equality checks use a call hierarchy. Sorted frustrations with unordered set equality in string representation.
graph-genome · Jun 20, 2019 · fbd8a5b · fbd8a5b
1 parent e8c11ac
commit fbd8a5b
Showing 1 changed file with 71 additions and 89 deletions.
diff --git a/Graph_Summarization_Prototype.ipynb b/Graph_Summarization_Prototype.ipynb
@@ -45,38 +45,45 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 9,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "c [['ACGT', {1, 2, 3, 4}], ['C', {1, 2, 4}, 'T', {3}], ['GGA', {1, 2, 3, 4}], ['C', {1, 2, 4}, '', {3}], ['AGTACG', {1, 2, 3}, 'CGTACT', {4}], ['TTG', {1, 2, 3, 4}]]\n",
-      "c [['ACGT', [1, 2, 3, 4]], ['C', [1, 2, 4], 'T', [3]], ['GGA', [1, 2, 3, 4]], ['C', [1, 2, 4], '', [3]], ['AGTACG', [1, 2, 3], 'CGTACT', [4]], ['TTG', [1, 2, 3, 4]]]\n",
-      "c [['ACGT', [1, 2, 3, 4]], ['C', [1, 2, 4], 'T', [3]], ['GGA', [1, 2, 3, 4]], ['C', [1, 2, 4], '', [3]], ['AGTACG', [1, 2, 3], 'CGTACT', [4]], ['TTG', [1, 2, 3, 4]]]\n",
-      "c [['ACGT', [1, 2, 3, 4]], ['C', [1, 2, 4], 'T', [3]], ['GGA', [1, 2, 3, 4]], ['C', [1, 2, 4], '', [3]], ['AGTACG', [1, 2, 3], 'CGTACT', [4]], ['TTG', [1, 2, 3, 4]]]\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import unittest \n",
     "test = unittest.TestCase()  # just using it for assertRaises\n",
     "from typing import Callable, Iterator, Union, Optional, List, Iterable\n",
     "from collections import namedtuple\n",
-    "# %debug\n",
-    "\n",
+    "from itertools import zip_longest\n",
+    "# %debug"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "#Node = namedtuple('Node', ['seq', 'paths'])\n",
     "class Node:\n",
-    "    def __init__(self, seq: str, paths: List[int]):\n",
+    "    def __init__(self, seq: str, paths: Iterable[int]):\n",
     "        assert isinstance(seq, str), seq\n",
     "        assert not isinstance(paths, str) and isinstance(paths, Iterable), paths\n",
     "        self.seq = seq\n",
-    "        self.paths = paths\n",
+    "        self.paths = set(paths)\n",
     "    def __len__(self):\n",
     "        return len(self.paths)\n",
     "    def __repr__(self):\n",
-    "        return repr(self.seq) + ', ' + repr(sorted(list(self.paths)))\n",
+    "        \"\"\"Paths representation is sorted because set ordering is not guaranteed.\"\"\"\n",
+    "        return repr(self.seq) + \\\n",
+    "        ', {' + ', '.join(str(i) for i in sorted(list(self.paths))) + '}' \n",
+    "    def __eq__(self, other):\n",
+    "        if not isinstance(other, Node):\n",
+    "            print(\"Warn: comparing Node and \", type(other), other)\n",
+    "            return False\n",
+    "        return self.seq == other.seq and self.paths == other.paths\n",
+    "    def __hash__(self):\n",
+    "        return hash(self.seq)\n",
+    "    \n",
     "\n",
     "def merge(self, smaller: Node) -> Node:\n",
     "    m = Node(self.seq, self.paths.union(smaller.paths))\n",
@@ -86,16 +93,26 @@
     "\n",
     "        \n",
     "class Slice:\n",
-    "    def __init__(self, nodes: List[Node]):\n",
-    "        self.nodes = nodes #[nodes] if isinstance(nodes, Node) else nodes\n",
+    "    def __init__(self, nodes: Iterable[Node]):\n",
+    "        self.nodes = set(nodes) \n",
     "    def alternatives(self, main):\n",
     "        return self.nodes.difference({main})\n",
     "    def bystanders(self, first,second):\n",
     "        return self.nodes.difference({first,second})\n",
     "    def __len__(self):\n",
     "        return len(self.nodes)\n",
     "    def __repr__(self):\n",
-    "        return self.nodes.__repr__()# '['+ ','.join(self.paths)+']'\n",
+    "        #return '{' + ', '.join(str(i) for i in sorted(list(self.nodes))) + '}' \n",
+    "        return list(self.nodes).__repr__()# '['+ ','.join(self.paths)+']'\n",
+    "    def __eq__(self, other):\n",
+    "        if isinstance(other, Slice):\n",
+    "            #all(a==b for a,b in zip_longest(self.nodes,other.nodes)) # order dependent\n",
+    "            if not self.nodes == other.nodes:\n",
+    "                print(self.nodes, other.nodes, sep='\\n')\n",
+    "            return self.nodes == other.nodes\n",
+    "        else:\n",
+    "            print(\"Warn: comparing Slice and \", type(other), other)\n",
+    "            return False\n",
     "    \n",
     "    def primary(self):\n",
     "        return max(self.nodes, key=len)  # When they're the same size, take the other\n",
@@ -109,20 +126,20 @@
     "    def __init__(self, cmd: List):\n",
     "        \"\"\"Factory for generating graphs from a representation\"\"\"\n",
     "        self.slices = []\n",
-    "        if cmd[0] and isinstance(cmd[0][0], Node):\n",
+    "        if cmd[0] and isinstance(next(iter(cmd[0])), Node):\n",
     "            self.slices = cmd  # doesn't need to be parsed\n",
     "        else:\n",
     "            if isinstance(cmd, str):\n",
     "                cmd = eval(cmd)\n",
-    "            print('c', cmd)\n",
+    "            #print('c', cmd)\n",
     "            for sl in cmd:\n",
     "                current_slice = []\n",
     "                try:\n",
     "                    for i in range(0, len(sl), 2):\n",
     "                        current_slice.append(Node(sl[i], sl[i+1]))\n",
     "                except IndexError:\n",
     "                    print(\"Expecting two terms: \", sl[i:i+2])\n",
-    "                self.slices.append(current_slice)\n",
+    "                self.slices.append(Slice(current_slice))\n",
     "            \n",
     "    def __repr__(self):\n",
     "        \"\"\"Warning: the representation strings are very sensitive to whitespace\"\"\"\n",
@@ -131,28 +148,36 @@
     "        return self.slices[i]\n",
     "    def __eq__(self, representation):\n",
     "        if isinstance(representation, Graph):\n",
-    "            return str(self) == str(representation)\n",
-    "        return str(self.slices) == str(Graph(representation).slices)\n",
+    "            return all(slice_a == slice_b for slice_a, slice_b in zip_longest(self.slices, representation.slices))\n",
+    "        return self == Graph(representation)  # build a graph then compare it\n",
     "            \n",
     "            \n",
     "            \n",
     "#base_graph = [ [{1,2,3,4}], [{1,2,4},{3}],  [{1,2,3,4}],  [{1,2,4},{3}],  [{1,2,3},{4}],  [{1,2,3,4}] ]\n",
-    "factory_input = [ ['ACGT',{1,2,3,4}], ['C',{1,2,4},'T',{3}],  ['GGA',{1,2,3,4}],  \n",
+    "factory_input = [['ACGT',{1,2,3,4}], ['C',{1,2,4},'T',{3}],  ['GGA',{1,2,3,4}],  \n",
     "              ['C',{1,2,4},'',{3}],  ['AGTACG',{1,2,3},'CGTACT',{4}],  ['TTG',{1,2,3,4}] ]\n",
-    "base_graph = [Slice(Node('ACGT', {1,2,3,4})), \n",
+    "base_graph = [Slice([Node('ACGT', {1,2,3,4})]), \n",
     "              Slice([Node('C',{1,2,4}),Node('T', {3})]),  \n",
     "              Slice([Node('GGA',{1,2,3,4})]),  \n",
     "              Slice([Node('C',{1,2,4}),Node('', {3})]),\n",
     "              Slice([Node('AGTACG',{1,2,3}), Node('CGTACT',{4})]),\n",
-    "              Slice(Node('TTG',{1,2,3,4})) ]\n",
+    "              Slice([Node('TTG',{1,2,3,4})]) ]\n",
     "\n",
     "g = Graph(factory_input)\n",
-    "repr(g) == str([['ACGT',[1,2,3,4]],['C',[1,2,4],'T',[3]],['GGA',[1,2,3,4]],['C',[1,2,4],'',[3]],['AGTACG',[1,2,3],'CGTACT',[4]],['TTG',[1,2,3,4]]])\n",
+    "g\n",
+    "assert g == str(factory_input), ('\\n' + repr(g) + '\\n' + str(factory_input))\n",
     "g_double = Graph(eval(str(g)))\n",
-    "str(g_double) == str(g)\n",
+    "str(g_double) == str(g)  # WARN: could be order sensitive, don't worry if it fails\n",
     "assert g_double == g\n",
-    "assert g_double == [['ACGT',[1,2,3,4]],['C',[1,2,4],'T',[3]],['GGA',[1,2,3,4]],['C',[1,2,4],'',[3]],['AGTACG',[1,2,3],'CGTACT',[4]],['TTG',[1,2,3,4]]]\n",
-    "assert g_double == \"[['ACGT',[1,2,3,4]],['C',[1,2,4],'T',[3]],['GGA',[1,2,3,4]],['C',[1,2,4],'',[3]],['AGTACG',[1,2,3],'CGTACT',[4]],['TTG',[1,2,3,4]]]\""
+    "assert g_double == factory_input\n",
+    "assert g_double == str(factory_input)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
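+    "**Developer's Note**: If you need a set-like collection that preserves insertion order, use dictionary keys instead of a set (dictionaries keep insertion order in CPython 3.6 and, by language guarantee, in Python 3.7+): `dict.fromkeys([67, 46, 55, 39, 94, 63, 34, 32, 57, 54, 67, 36, 63]).keys()`"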
    ]
   },
   {
@@ -174,20 +199,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[['C', [1, 2, 3, 4]]]"
-      ]
-     },
-     "execution_count": 27,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "def merge_vertical(one_slice: Slice) -> List[Slice]:\n",
     "    \"\"\"Merges the smallest node into the biggest node.  Preserves bystander nodes as well.\"\"\"\n",
@@ -199,34 +213,24 @@
     "        return Graph([[merger]])\n",
     "    else:  #Allows for third possibilities\n",
     "        return Graph([[merger, *one_slice.bystanders(smallest_node, biggest_node)]])\n",
-    "(merge_vertical(base_graph[1]))# == str([{1, 2, 3, 4}])\n",
+    "assert merge_vertical(base_graph[1]) == [['C', {1, 2, 3, 4}]]\n",
+    "merge_vertical(Graph([['A',{1,2,4}, 'C',{3}, 'T',{12,16}]])[0])\n",
     "# assert merge_vertical([{1,2,4}, {3}, {12,16}]) == [[{1, 2, 3, 4}, {12, 16}]]\n",
     "# assert merge_vertical([{1, 2}, {3, 4}]) == [[{1, 2, 3, 4}]]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "(['C', [1, 2, 4], 'T', [3]], __main__.Slice)"
-      ]
-     },
-     "execution_count": 22,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "base_graph[1], type(base_graph[1])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -273,7 +277,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -333,7 +337,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -373,20 +377,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[[{1, 2}, {3}]]"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "def propagate_split(first_slice, second_slice):\n",
     "    \"\"\"When one slice has all paths in one node, and the next slice has the paths split, there are two options.\n",
@@ -426,20 +419,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[[{1, 2, 4}, {3}]]"
-      ]
-     },
-     "execution_count": 9,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "def use_merging(graph, merge_fn, start_index):\n",
     "    \"\"\"Function that executes a given merge in the context of the whole graph then \n",