diff --git a/DB_investigation.ipynb b/DB_investigation.ipynb
new file mode 100644
index 0000000..025dc89
--- /dev/null
+++ b/DB_investigation.ipynb
@@ -0,0 +1,798 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9368bb63",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Want to start by checking which molecules are duplicates.\n",
+    "# For example, we have molecule_ids 241905 and 1497."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "e28921c8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import psycopg2\n",
+    "import pandas as pd\n",
+    "from rdkit import Chem"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "214277f9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tqdm import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "a128c9e3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_556/3175014960.py:16: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n",
+      "  df = pd.read_sql_query(query, connection)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load the molecule_id, smiles and molecular_weight columns of the `molecule` table into `df`.\n",
+    "# You must have the DB container running to run this cell successfully.\n",
+    "# Connection parameters\n",
+    "db_params = {\n",
+    "    'dbname': 'postgres',\n",
+    "    'user': 'postgres',\n",
+    "    'password': '',\n",
+    "    'host': '127.0.0.1',\n",
+    "    'port': '5432'\n",
+    "}\n",
+    "query = \"SELECT molecule_id, smiles, molecular_weight FROM molecule\"\n",
+    "\n",
+    "# Establish a connection to the PostgreSQL database\n",
+    "connection = psycopg2.connect(**db_params)\n",
+    "try:\n",
+    "    df = pd.read_sql_query(query, connection)\n",
+    "finally:\n",
+    "    # Close the connection even if the query fails, so a failed cell does not\n",
+    "    # leave an open connection behind in the kernel.\n",
+    "    connection.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "686a1a35",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>molecule_id</th>\n",
+       "      <th>smiles</th>\n",
+       "      <th>molecular_weight</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>331406</td>\n",
+       "      <td>COc1cccc(c1c1ccccc1P(c1ccccc1)c1ccccc1)OC</td>\n",
+       "      <td>398.441986</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>140360</td>\n",
+       "      <td>COc1ccc(P(c2ccccc2SC)c2ccccc2SC)c(C)c1</td>\n",
+       "      <td>398.532990</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>331409</td>\n",
+       "      <td>C1CCC(CC1)P(c1ccccc1)C1CCCCC1</td>\n",
+       "      <td>274.388000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>2027</td>\n",
+       "      <td>CN(c1ccccc1c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12C...</td>\n",
+       "      <td>497.707001</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>2036</td>\n",
+       "      <td>CCC1(CC)O[C@@H]2[C@@H](O1)C(c1cc(C(C)(C)C)cc(C...</td>\n",
+       "      <td>1049.558960</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>330962</th>\n",
+       "      <td>608</td>\n",
+       "      <td>Cc1cc(C)cc(P(c2cc(C)cc(C)c2)c2ccccc2C=O)c1</td>\n",
+       "      <td>346.410004</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>330963</th>\n",
+       "      <td>461</td>\n",
+       "      <td>CCO[Si](CCP(c1ccccc1)c1ccccc1)(OCC)OCC</td>\n",
+       "      <td>376.509003</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>330964</th>\n",
+       "      <td>1064</td>\n",
+       "      <td>Cc1c(C)n(C(C)C)c(=NP(N=c2n(C(C)C)c(C)c(C)n2C(C...</td>\n",
+       "      <td>462.666992</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>330965</th>\n",
+       "      <td>523</td>\n",
+       "      <td>CN(C)/N=C/c1ccc(P(c2ccc(/C=N/N(C)C)s2)c2ccc(/C...</td>\n",
+       "      <td>490.664001</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>330966</th>\n",
+       "      <td>1817</td>\n",
+       "      <td>COc1cccc(P(c2cccc(OC)c2OC)c2cccc(OC)c2OC)c1OC</td>\n",
+       "      <td>442.447998</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>330967 rows × 3 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "        molecule_id                                             smiles  \\\n",
+       "0            331406          COc1cccc(c1c1ccccc1P(c1ccccc1)c1ccccc1)OC   \n",
+       "1            140360             COc1ccc(P(c2ccccc2SC)c2ccccc2SC)c(C)c1   \n",
+       "2            331409                      C1CCC(CC1)P(c1ccccc1)C1CCCCC1   \n",
+       "3              2027  CN(c1ccccc1c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12C...   \n",
+       "4              2036  CCC1(CC)O[C@@H]2[C@@H](O1)C(c1cc(C(C)(C)C)cc(C...   \n",
+       "...             ...                                                ...   \n",
+       "330962          608         Cc1cc(C)cc(P(c2cc(C)cc(C)c2)c2ccccc2C=O)c1   \n",
+       "330963          461             CCO[Si](CCP(c1ccccc1)c1ccccc1)(OCC)OCC   \n",
+       "330964         1064  Cc1c(C)n(C(C)C)c(=NP(N=c2n(C(C)C)c(C)c(C)n2C(C...   \n",
+       "330965          523  CN(C)/N=C/c1ccc(P(c2ccc(/C=N/N(C)C)s2)c2ccc(/C...   \n",
+       "330966         1817      COc1cccc(P(c2cccc(OC)c2OC)c2cccc(OC)c2OC)c1OC   \n",
+       "\n",
+       "        molecular_weight  \n",
+       "0             398.441986  \n",
+       "1             398.532990  \n",
+       "2             274.388000  \n",
+       "3             497.707001  \n",
+       "4            1049.558960  \n",
+       "...                  ...  \n",
+       "330962        346.410004  \n",
+       "330963        376.509003  \n",
+       "330964        462.666992  \n",
+       "330965        490.664001  \n",
+       "330966        442.447998  \n",
+       "\n",
+       "[330967 rows x 3 columns]"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "149e8d9b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>molecule_id</th>\n",
+       "      <th>smiles</th>\n",
+       "      <th>molecular_weight</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>143223</th>\n",
+       "      <td>241905</td>\n",
+       "      <td>[H]P([H])C</td>\n",
+       "      <td>48.025002</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>329868</th>\n",
+       "      <td>1497</td>\n",
+       "      <td>CP</td>\n",
+       "      <td>48.025002</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "        molecule_id      smiles  molecular_weight\n",
+       "143223       241905  [H]P([H])C         48.025002\n",
+       "329868         1497          CP         48.025002"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Sanity check: display the rows for the two molecule_ids suspected to be duplicates\n",
+    "df[df[\"molecule_id\"].isin([241905, 1497])]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "9596cb12",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# These two molecules should be the same, so let's check whether RDKit returns the same SMILES string when canonicalizing them.\n",
+    "mol_241905 = df[df[\"molecule_id\"]==241905][\"smiles\"].to_list()[0]\n",
+    "mol_1497 = df[df[\"molecule_id\"]==1497][\"smiles\"].to_list()[0]\n",
+    "\n",
+    "# Double-check with the molecular weight: the difference should be below a small tolerance (1e-6).\n",
+    "mol_241905_weight = df[df[\"molecule_id\"]==241905][\"molecular_weight\"].to_list()[0]\n",
+    "mol_1497_weight = df[df[\"molecule_id\"]==1497][\"molecular_weight\"].to_list()[0]\n",
+    "\n",
+    "a = Chem.CanonSmiles(mol_241905)\n",
+    "b = Chem.CanonSmiles(mol_1497)  # must be the *other* molecule; canonicalizing mol_241905 twice would make `a == b` trivially True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "9c94d293",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CP 48.025001525878906\n",
+      "CP 48.025001525878906\n",
+      "True\n",
+      "True\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(a, mol_241905_weight)\n",
+    "print(b, mol_1497_weight)\n",
+    "print(a == b)\n",
+    "print(abs(mol_241905_weight - mol_1497_weight) < 0.000001)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "5cca5c81",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_data_dict = df.to_dict(orient=\"records\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "7ea5a56b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  0%|                                                                                        | 0/330967 [00:00<?, ?it/s][23:43:15] Explicit valence for atom # 7 B, 6, is greater than permitted\n",
+      "  0%|▏                                                                           | 653/330967 [00:00<01:38, 3339.89it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "c1ccc(cc1)P([B]1234[BH]567[BH]891[BH]1%102[BH]2%114[BH]435[CH]357[BH]768[CH]691[BH]1%102[BH]%1143[BH]5761)c1ccccc1\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  1%|▌                                                                          | 2331/330967 [00:00<01:20, 4099.64it/s][23:43:15] Explicit valence for atom # 7 C, 6, is greater than permitted\n",
+      "  1%|▋                                                                          | 3150/330967 [00:00<01:24, 3897.22it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "c1ccc(cc1)P([C]1234[BH]567[BH]891[BH]1%103[BH]3%112[BH]245[BH]456[CH]678[BH]791[BH]1%10%11[BH]324[BH]5671)c1ccccc1\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  2%|█▎                                                                         | 5580/330967 [00:01<01:22, 3940.69it/s][23:43:16] Explicit valence for atom # 7 B, 6, is greater than permitted\n",
+      "[23:43:16] Explicit valence for atom # 7 C, 6, is greater than permitted\n",
+      "  2%|█▍                                                                         | 6369/330967 [00:01<01:23, 3905.70it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "C1CCC(CC1)P([B]1234[BH]567[BH]891[BH]1%102[BH]2%114[BH]435[CH]357[BH]768[CH]691[BH]1%102[BH]%1143[BH]5761)C1CCCCC1\n",
+      "c1ccc(cc1)P([C]1234[CH]567[BH]893[BH]3%102[BH]2%111[BH]145[BH]456[BH]678[BH]79%10[BH]83%11[BH]214[BH]5678)c1ccccc1\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|█████████████████████████████████████████████████████████████████████████| 330967/330967 [02:02<00:00, 2711.21it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Group records by CANONICAL SMILES: the first record seen for each canonical string is stored in `hashmap`\n",
+    "# as a (molecule_id, smiles, molecular_weight) tuple. Any later record that canonicalizes to the same string\n",
+    "# is appended to `duplicates` as a (first_seen_record, current_record) pair of such tuples.\n",
+    "hashmap = {}\n",
+    "duplicates = []\n",
+    "for entry in tqdm(all_data_dict):\n",
+    "    try:\n",
+    "        # Canonicalize smiles (only this call is guarded, so unrelated errors are not hidden)\n",
+    "        canonical_smile = Chem.CanonSmiles(entry[\"smiles\"])\n",
+    "    except Exception:\n",
+    "        # See which molecules cause issues if any, deal with them later.\n",
+    "        # Not a bare `except:`, so interrupting the kernel still stops this long-running loop.\n",
+    "        print(entry[\"smiles\"])\n",
+    "        continue\n",
+    "    record = (entry[\"molecule_id\"], entry[\"smiles\"], entry[\"molecular_weight\"])\n",
+    "    if canonical_smile not in hashmap:\n",
+    "        hashmap[canonical_smile] = record\n",
+    "    else:\n",
+    "        # Pair the first record stored for this canonical SMILES with the current one\n",
+    "        duplicates.append((hashmap[canonical_smile], record))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "id": "361ec9af",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "78"
+      ]
+     },
+     "execution_count": 43,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(duplicates)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "id": "158c8d2c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[((1519, 'COP', 64.02400207519531),\n",
+       "  (241851, '[H]P([H])OC', 64.02400207519531)),\n",
+       " ((241739, '[H]P([H])C(C)C', 76.0790023803711),\n",
+       "  (1495, 'CC(C)P', 76.0790023803711)),\n",
+       " ((2045, 'C1CC2CCCC(C1)P2B1Nc2ccccc2c2c1cccc2', 319.1969909667969),\n",
+       "  (1006, 'c1ccc2c(c1)NB(P1C3CCCC1CCC3)c1ccccc1-2', 319.1969909667969)),\n",
+       " ((1521, 'CSP', 80.09200286865234),\n",
+       "  (241909, '[H]P([H])SC', 80.09200286865234)),\n",
+       " ((2063,\n",
+       "   'COC1=CC=C(OC)C(C2=C(C(C)C)C=C(C(C)C)C=C2C(C)C)=C1P3C(C)(C)CC4(OCCO4)CC3(C)C',\n",
+       "   554.7520141601562),\n",
+       "  (1900,\n",
+       "   'COc1ccc(OC)c(P2C(C)(C)CC3(CC2(C)C)OCCO3)c1-c1c(C(C)C)cc(C(C)C)cc1C(C)C',\n",
+       "   554.7520141601562)),\n",
+       " ((1494, 'CC(C)PC(C)C', 118.16000366210938),\n",
+       "  (41951, '[H]P(C(C)C)C(C)C', 118.16000366210938)),\n",
+       " ((2054, 'PC12CC3CC(C2)CC(C1)C3', 168.22000122070312),\n",
+       "  (1296, 'PC12CC3CC(CC(C3)C1)C2', 168.22000122070312)),\n",
+       " ((820,\n",
+       "   'COC1=[C@@](C2=CC=CC=C2C=C1)[C@@]3=C4C(C=CC=C4)=CC=C3P(OC5=CC=C(C=CC=C6)C6=[C@]57)OC8=[C@]7C(C=CC=C9)=C9C=C8',\n",
+       "   598.6380004882812),\n",
+       "  (821,\n",
+       "   'COC1=[C@@](C2=CC=CC=C2C=C1)[C@@]3=C4C(C=CC=C4)=CC=C3P(OC5=CC=C(C=CC=C6)C6=[C@@]57)OC8=[C@@]7C(C=CC=C9)=C9C=C8',\n",
+       "   598.6380004882812)),\n",
+       " ((2053, 'C1C2CC3CC1CC(C2)(C3)PC12CC3CC(C2)CC(C1)C3', 302.4419860839844),\n",
+       "  (1295, 'C1C2CC3CC1CC(PC14CC5CC(CC(C5)C1)C4)(C2)C3', 302.4419860839844)),\n",
+       " ((213971, '[H]P(F)F', 69.97799682617188), (1293, 'FPF', 69.97799682617188)),\n",
+       " ((150328, '[H]P([Si](C)(C)C)[Si](C)(C)C', 178.36399841308594),\n",
+       "  (1520, 'C[Si](C)(C)P[Si](C)(C)C', 178.36399841308594)),\n",
+       " ((2020,\n",
+       "   'c1ccc(cc1)c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+       "   454.63800048828125),\n",
+       "  (306,\n",
+       "   'c1ccc(-c2ccccc2P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)cc1',\n",
+       "   454.63800048828125)),\n",
+       " ((2053, 'C1C2CC3CC1CC(C2)(C3)PC12CC3CC(C2)CC(C1)C3', 302.4419860839844),\n",
+       "  (252420,\n",
+       "   '[H]P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+       "   302.4419860839844)),\n",
+       " ((1491, 'Pc1ccco1', 100.05699920654297),\n",
+       "  (242228, '[H]P([H])c1ccco1', 100.05699920654297)),\n",
+       " ((2033,\n",
+       "   'CCCCc1c(F)c(F)c(c(c1F)F)c1c(cc(c(c1C(C)C)c1cccc(c1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)OC)C(C)C)C(C)C',\n",
+       "   815.072998046875),\n",
+       "  (369,\n",
+       "   'CCCCc1c(F)c(F)c(-c2c(C(C)C)cc(C(C)C)c(-c3cccc(OC)c3P(C34CC5CC(CC(C5)C3)C4)C34CC5CC(CC(C5)C3)C4)c2C(C)C)c(F)c1F',\n",
+       "   815.072998046875)),\n",
+       " ((1297, 'CC(C)(C)PC(C)(C)C', 146.21400451660156),\n",
+       "  (20221, '[H]P(C(C)(C)C)C(C)(C)C', 146.21400451660156)),\n",
+       " ((2044, 'CC(P(C(C)(C)C)C1C2CC3CC1CC(C2)C3)(C)C', 280.4360046386719),\n",
+       "  (952, 'CC(C)(C)P(C1C2CC3CC(C2)CC1C3)C(C)(C)C', 280.4360046386719)),\n",
+       " ((331415, 'P(CCCCCCCC)(CCCCCCCC)CCCCCCCC', 370.64599609375),\n",
+       "  (239, 'CCCCCCCCP(CCCCCCCC)CCCCCCCC', 370.64599609375)),\n",
+       " ((1493, 'CCP', 62.051998138427734),\n",
+       "  (241815, '[H]P([H])CC', 62.051998138427734)),\n",
+       " ((1518, 'CN(C)PN(C)C', 120.13600158691406),\n",
+       "  (91817, '[H]P(N(C)C)N(C)C', 120.13600158691406)),\n",
+       " ((1294, 'FP', 51.987998962402344),\n",
+       "  (242039, '[H]P([H])F', 51.987998962402344)),\n",
+       " ((2038, 'C1C2CC3CC1CC(C2)(C3)P(c1ccccc1)c1ccccc1', 320.4159851074219),\n",
+       "  (576, 'c1ccc(P(c2ccccc2)C23CC4CC(CC(C4)C2)C3)cc1', 320.4159851074219)),\n",
+       " ((1487, 'Pc1ccc2c(c1)CCCC2', 164.18800354003906),\n",
+       "  (242188, '[H]P([H])c1ccc2c(c1)CCCC2', 164.18800354003906)),\n",
+       " ((2034,\n",
+       "   'O1CCN(CC1)c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+       "   463.64599609375),\n",
+       "  (371,\n",
+       "   'c1ccc(P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)c(N2CCOCC2)c1',\n",
+       "   463.64599609375)),\n",
+       " ((331412, 'CC(C=CC=C1C)=C1P(C2=CC=CC=C2)C3=CC=CC=C3', 290.34600830078125),\n",
+       "  (203, 'Cc1cccc(C)c1P(c1ccccc1)c1ccccc1', 290.34600830078125)),\n",
+       " ((2043, 'CC(P(C12CC3CC(C2)CC(C1)C3)C(C)(C)C)(C)C', 280.4360046386719),\n",
+       "  (951, 'CC(C)(C)P(C(C)(C)C)C12CC3CC(CC(C3)C1)C2', 280.4360046386719)),\n",
+       " ((1492, 'CCPCC', 90.10600280761719),\n",
+       "  (85517, '[H]P(CC)CC', 90.10600280761719)),\n",
+       " ((1488, 'PC12c3ccccc3C(c3ccccc31)c1ccccc12', 286.3139953613281),\n",
+       "  (242107, '[H]P([H])C12c3ccccc3C(c3ccccc31)c1ccccc12', 286.3139953613281)),\n",
+       " ((2064,\n",
+       "   'COC1=CC=C(C2=C(OC)C=CC3=C2C=CC=C3)C(OC)=C1P(C4CCCCC4)C5CCCCC5',\n",
+       "   490.6239929199219),\n",
+       "  (1901,\n",
+       "   'COc1ccc(-c2c(OC)ccc3ccccc23)c(OC)c1P(C1CCCCC1)C1CCCCC1',\n",
+       "   490.6239929199219)),\n",
+       " ((331406, 'COc1cccc(c1c1ccccc1P(c1ccccc1)c1ccccc1)OC', 398.4419860839844),\n",
+       "  (7, 'COc1cccc(OC)c1-c1ccccc1P(c1ccccc1)c1ccccc1', 398.4419860839844)),\n",
+       " ((1485, 'Pc1ccc2ccccc2c1', 160.156005859375),\n",
+       "  (242192, '[H]P([H])c1ccc2ccccc2c1', 160.156005859375)),\n",
+       " ((1496, 'CPC', 62.051998138427734), (137130, '[H]P(C)C', 62.051998138427734)),\n",
+       " ((2052,\n",
+       "   'CC(c1cccc(c1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)C(C)C)C',\n",
+       "   462.7019958496094),\n",
+       "  (1274,\n",
+       "   'CC(C)c1cccc(C(C)C)c1P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+       "   462.7019958496094)),\n",
+       " ((2025,\n",
+       "   'O=C(P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)c1cccc2c1cccc2',\n",
+       "   456.6099853515625),\n",
+       "  (348,\n",
+       "   'O=C(c1cccc2ccccc12)P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+       "   456.6099853515625)),\n",
+       " ((575, 'Pc1ccccc1', 110.09600067138672),\n",
+       "  (242207, '[H]P([H])c1ccccc1', 110.09600067138672)),\n",
+       " ((2026,\n",
+       "   'CC([Si](C(C)C)(C(C)C)Oc1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)C',\n",
+       "   550.8839721679688),\n",
+       "  (349,\n",
+       "   'CC(C)[Si](Oc1ccccc1P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2)(C(C)C)C(C)C',\n",
+       "   550.8839721679688)),\n",
+       " ((2054, 'PC12CC3CC(C2)CC(C1)C3', 168.22000122070312),\n",
+       "  (242106, '[H]P([H])C12CC3CC(CC(C3)C1)C2', 168.22000122070312)),\n",
+       " ((2048, 'COCCP(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3', 360.5220031738281),\n",
+       "  (1140,\n",
+       "   'COCCP(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+       "   360.5220031738281)),\n",
+       " ((2051, 'O=Cc1ccc(cc1)P(c1ccc(cc1)C=O)C1C2CC3CC1CC(C2)C3', 376.4360046386719),\n",
+       "  (1241,\n",
+       "   'O=Cc1ccc(P(c2ccc(C=O)cc2)C2C3CC4CC(C3)CC2C4)cc1',\n",
+       "   376.4360046386719)),\n",
+       " ((2024,\n",
+       "   'COc1ccc(c(c1c1c(cc(cc1C(C)C)C(C)C)C(C)C)P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)OC',\n",
+       "   640.9329833984375),\n",
+       "  (347,\n",
+       "   'COc1ccc(OC)c(P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)c1-c1c(C(C)C)cc(C(C)C)cc1C(C)C',\n",
+       "   640.9329833984375)),\n",
+       " ((2047,\n",
+       "   'COc1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+       "   408.5660095214844),\n",
+       "  (1137,\n",
+       "   'COc1ccccc1P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+       "   408.5660095214844)),\n",
+       " ((2016,\n",
+       "   'C1C2CC3CC1CC(C2)(C3)P(C12CC3CC(C2)CC(C1)C3)Cc1ccccc1',\n",
+       "   392.5669860839844),\n",
+       "  (64,\n",
+       "   'c1ccc(CP(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)cc1',\n",
+       "   392.5669860839844)),\n",
+       " ((2022,\n",
+       "   'c1ccc(cc1)c1nn(c(c1n1nccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)c1ccccc1)c1ccccc1',\n",
+       "   662.8619995117188),\n",
+       "  (340,\n",
+       "   'c1ccc(-c2nn(-c3ccccc3)c(-c3ccccc3)c2-n2nccc2P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)cc1',\n",
+       "   662.8619995117188)),\n",
+       " ((2014,\n",
+       "   'C1C2CC3CC1CC(C2)(C3)P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+       "   436.66400146484375),\n",
+       "  (6,\n",
+       "   'C1C2CC3CC1CC(P(C14CC5CC(CC(C5)C1)C4)C14CC5CC(CC(C5)C1)C4)(C2)C3',\n",
+       "   436.66400146484375)),\n",
+       " ((2037,\n",
+       "   'CN(c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)C',\n",
+       "   421.6090087890625),\n",
+       "  (527,\n",
+       "   'CN(C)c1ccccc1P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+       "   421.6090087890625)),\n",
+       " ((1490, 'Pc1cccs1', 116.125), (242229, '[H]P([H])c1cccs1', 116.125)),\n",
+       " ((331421, 'CCO[Si](OCC)(OCC)CCCP(c1ccccc1)c2ccccc2', 390.5360107421875),\n",
+       "  (728, 'CCO[Si](CCCP(c1ccccc1)c1ccccc1)(OCC)OCC', 390.5360107421875)),\n",
+       " ((1486, 'Pc1cccc2ccccc12', 160.156005859375),\n",
+       "  (242206, '[H]P([H])c1cccc2ccccc12', 160.156005859375)),\n",
+       " ((2021,\n",
+       "   'C1CCN(CC1)c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+       "   461.67401123046875),\n",
+       "  (338,\n",
+       "   'c1ccc(P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)c(N2CCCCC2)c1',\n",
+       "   461.67401123046875)),\n",
+       " ((2015,\n",
+       "   'CCCCP(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+       "   358.54998779296875),\n",
+       "  (10, 'CCCCP(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2', 358.54998779296875)),\n",
+       " ((2065,\n",
+       "   'C12=CC=CC=C1C(C3=C(C4=C(P(C5CCCCC5)C6CCCCC6)C=CC=C4)C(C=CC=C7)=C7C=C3)=C8C(C=CC=C8)=C2',\n",
+       "   576.7639770507812),\n",
+       "  (1902,\n",
+       "   'c1ccc(P(C2CCCCC2)C2CCCCC2)c(-c2c(-c3c4ccccc4cc4ccccc34)ccc3ccccc23)c1',\n",
+       "   576.7639770507812)),\n",
+       " ((2027,\n",
+       "   'CN(c1ccccc1c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)C',\n",
+       "   497.7070007324219),\n",
+       "  (350,\n",
+       "   'CN(C)c1ccccc1-c1ccccc1P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+       "   497.7070007324219)),\n",
+       " ((331409, 'C1CCC(CC1)P(c1ccccc1)C1CCCCC1', 274.38800048828125),\n",
+       "  (68, 'c1ccc(P(C2CCCCC2)C2CCCCC2)cc1', 274.38800048828125)),\n",
+       " ((2017,\n",
+       "   'O=C(P(C12CC3CC(C2)CC(C1)C3)C(=O)Nc1ccccc1)Nc1ccccc1',\n",
+       "   406.46600341796875),\n",
+       "  (259,\n",
+       "   'O=C(Nc1ccccc1)P(C(=O)Nc1ccccc1)C12CC3CC(CC(C3)C1)C2',\n",
+       "   406.46600341796875)),\n",
+       " ((2023,\n",
+       "   'O=C(P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)c1ccccc1',\n",
+       "   406.54998779296875),\n",
+       "  (346,\n",
+       "   'O=C(c1ccccc1)P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+       "   406.54998779296875)),\n",
+       " ((574, 'c1ccc(Pc2ccccc2)cc1', 186.19400024414062),\n",
+       "  (310083, '[H]P(c1ccccc1)c1ccccc1', 186.19400024414062)),\n",
+       " ((42, 'CC(C)(C)P(c1ccccc1-c1ccccc1)C(C)(C)C', 298.4100036621094),\n",
+       "  (331407, 'CC(P(C(C)(C)C)c1ccccc1c1ccccc1)(C)C', 298.4100036621094)),\n",
+       " ((103,\n",
+       "   'COc1ccc(C)c(-c2c(C(C)C)cc(C(C)C)cc2C(C)C)c1P(C(C)(C)C)C(C)(C)C',\n",
+       "   468.70599365234375),\n",
+       "  (331410,\n",
+       "   'COc1ccc(c(c1P(C(C)(C)C)C(C)(C)C)c1c(cc(cc1C(C)C)C(C)C)C(C)C)C',\n",
+       "   468.70599365234375)),\n",
+       " ((116, 'CN(C)c1ccc(P(c2ccccc2)c2ccccc2)cc1', 305.3609924316406),\n",
+       "  (331411, 'CN(c1ccc(cc1)P(c1ccccc1)c1ccccc1)C', 305.3609924316406)),\n",
+       " ((246,\n",
+       "   'COc1c(C)cc(P(c2cc(C)c(OC)c(C)c2)c2cc(C)c(OC)c(C)c2)cc1C',\n",
+       "   436.5320129394531),\n",
+       "  (331416,\n",
+       "   'P(c1cc(c(c(c1)C)OC)C)(c1cc(c(c(c1)C)OC)C)c1cc(c(c(c1)C)OC)C',\n",
+       "   436.5320129394531)),\n",
+       " ((234,\n",
+       "   'Fc1c(F)c(F)c(P(c2ccccc2)c2c(F)c(F)c(F)c(F)c2F)c(F)c1F',\n",
+       "   442.1919860839844),\n",
+       "  (331414,\n",
+       "   'Fc1c(c(c(c(c1F)F)P(c1c(c(c(c(c1F)F)F)F)F)c1ccccc1)F)F',\n",
+       "   442.1919860839844)),\n",
+       " ((356, 'C/C=C/CP(C(C)(C)C)C(C)(C)C', 200.30599975585938),\n",
+       "  (331419, 'P(C(C)(C)C)(C(C)(C)C)C/C=C/C', 200.30599975585938)),\n",
+       " ((487,\n",
+       "   'FC(F)(F)C(F)(F)C(F)(F)C(Cc1ccc(P(c2ccccc2)c2ccc(CC(C(F)(F)F)(C(F)(F)F)C(F)(F)C(F)(F)C(F)(F)F)cc2)cc1)(C(F)(F)F)C(F)(F)F',\n",
+       "   926.4099731445312),\n",
+       "  (331420,\n",
+       "   'FC(F)(F)C(F)(F)C(F)(F)C(C(F)(F)F)(C(F)(F)F)Cc1ccc(cc1)P(c3ccc(cc3)CC(C(F)(F)F)(C(F)(F)F)C(F)(F)C(F)(F)C(F)(F)F)c2ccccc2',\n",
+       "   926.4099731445312)),\n",
+       " ((216, 'CN(C)c1ccc(P(C(C)(C)C)C(C)(C)C)cc1', 265.3810119628906),\n",
+       "  (331413, 'CC(P(C1=CC=C(N(C)C)C=C1)C(C)(C)C)(C)C', 265.3810119628906)),\n",
+       " ((298, 'c1ccc(P2Cc3cccc4c3C3(CC4)CCc4cccc(c43)C2)cc1', 354.4330139160156),\n",
+       "  (331417, 'P1(Cc2c3c(ccc2)CCC23CCc3c2c(ccc3)C1)c1ccccc1', 354.4330139160156)),\n",
+       " ((320, 'COc1cc(OC)c(-c2ccccc2P(C2CCCCC2)C2CCCCC2)c(OC)c1', 440.5639953613281),\n",
+       "  (331418,\n",
+       "   'P(C1CCCCC1)(C1CCCCC1)c1c(cccc1)c1c(cc(cc1OC)OC)OC',\n",
+       "   440.5639953613281)),\n",
+       " ((3, 'COc1cccc(OC)c1-c1ccccc1P(C1CCCCC1)C1CCCCC1', 410.5379943847656),\n",
+       "  (331405, 'COc1cccc(c1c1ccccc1P(C1CCCCC1)C1CCCCC1)OC', 410.5379943847656)),\n",
+       " ((60,\n",
+       "   'Cc1cc(C)c(P(c2c(C)cc(C)cc2C)c2c(C)cc(C)cc2C)c(C)c1',\n",
+       "   388.5350036621094),\n",
+       "  (331408,\n",
+       "   'Cc1cc(C)cc(c1P(c1c(C)cc(cc1C)C)c1c(C)cc(cc1C)C)C',\n",
+       "   388.5350036621094)),\n",
+       " ((241905, '[H]P([H])C', 48.025001525878906),\n",
+       "  (1497, 'CP', 48.025001525878906)),\n",
+       " ((771,\n",
+       "   'c1ccc(P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)cc1',\n",
+       "   378.5400085449219),\n",
+       "  (2040,\n",
+       "   'C1C2CC3CC1CC(C2)(C3)P(C12CC3CC(C2)CC(C1)C3)c1ccccc1',\n",
+       "   378.5400085449219)),\n",
+       " ((779,\n",
+       "   'c1ccc(-n2cccc2P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)cc1',\n",
+       "   443.614990234375),\n",
+       "  (2041,\n",
+       "   'c1ccc(cc1)n1cccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+       "   443.614990234375)),\n",
+       " ((729, 'CCP(CC)c1ccccc1-n1c2ccccc2c2ccccc21', 331.39898681640625),\n",
+       "  (331422, 'CCP(CC)c1ccccc1n2c3ccccc3c4c2cccc4', 331.39898681640625)),\n",
+       " ((417,\n",
+       "   'CC(C)(C)P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+       "   358.54998779296875),\n",
+       "  (2035,\n",
+       "   'CC(P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)(C)C',\n",
+       "   358.54998779296875)),\n",
+       " ((783,\n",
+       "   'c1ccc(-n2c(P(C34CC5CC(CC(C5)C3)C4)C34CC5CC(CC(C5)C3)C4)cc3ccccc32)cc1',\n",
+       "   493.67498779296875),\n",
+       "  (2042,\n",
+       "   'c1ccc(cc1)n1c2ccccc2cc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+       "   493.67498779296875)),\n",
+       " ((1201,\n",
+       "   'Cc1cc(C)c(-c2cccc3c2CP(C2CCCCC2)Cc2c-3cccc2-c2c(C)cc(C)cc2C)c(C)c1',\n",
+       "   530.7360229492188),\n",
+       "  (2049,\n",
+       "   'Cc1cc(C)cc(c1c1cccc2c1CP(Cc1c2cccc1c1c(C)cc(cc1C)C)C1CCCCC1)C',\n",
+       "   530.7360229492188)),\n",
+       " ((1202,\n",
+       "   'Cc1cc(C)c(-c2cccc3c2CP(C(C)C)Cc2c-3cccc2-c2c(C)cc(C)cc2C)c(C)c1',\n",
+       "   490.6709899902344),\n",
+       "  (2050,\n",
+       "   'CC(P1Cc2c(c3c(C1)c(ccc3)c1c(C)cc(cc1C)C)cccc2c1c(C)cc(cc1C)C)C',\n",
+       "   490.6709899902344)),\n",
+       " ((241701, '[H]P([H])C(C)(C)C', 90.10600280761719),\n",
+       "  (1298, 'CC(C)(C)P', 90.10600280761719)),\n",
+       " ((242088, '[H]P([H])[H]', 33.99800109863281), (1299, 'P', 33.99800109863281))]"
+      ]
+     },
+     "execution_count": 44,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "duplicates"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "id": "1f1bd039",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save the object\n",
+    "import pickle"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "id": "3c5ac97b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open('duplicates.pkl', 'wb') as file:\n",
+    "    pickle.dump(duplicates, file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a187e06d",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/duplicates.pkl b/duplicates.pkl
new file mode 100644
index 0000000..a878a6a
Binary files /dev/null and b/duplicates.pkl differ

	molecule_id	smiles	molecular_weight
0	331406	COc1cccc(c1c1ccccc1P(c1ccccc1)c1ccccc1)OC	398.441986
1	140360	COc1ccc(P(c2ccccc2SC)c2ccccc2SC)c(C)c1	398.532990
2	331409	C1CCC(CC1)P(c1ccccc1)C1CCCCC1	274.388000
3	2027	CN(c1ccccc1c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12C...	497.707001
4	2036	CCC1(CC)O[C@@H]2[C@@H](O1)C(c1cc(C(C)(C)C)cc(C...	1049.558960
...	...	...	...
330962	608	Cc1cc(C)cc(P(c2cc(C)cc(C)c2)c2ccccc2C=O)c1	346.410004
330963	461	CCO[Si](CCP(c1ccccc1)c1ccccc1)(OCC)OCC	376.509003
330964	1064	Cc1c(C)n(C(C)C)c(=NP(N=c2n(C(C)C)c(C)c(C)n2C(C...	462.666992
330965	523	CN(C)/N=C/c1ccc(P(c2ccc(/C=N/N(C)C)s2)c2ccc(/C...	490.664001
330966	1817	COc1cccc(P(c2cccc(OC)c2OC)c2cccc(OC)c2OC)c1OC	442.447998