diff --git a/DB_investigation.ipynb b/DB_investigation.ipynb
new file mode 100644
index 0000000..025dc89
--- /dev/null
+++ b/DB_investigation.ipynb
@@ -0,0 +1,798 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9368bb63",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Start by checking which molecules in the database are duplicates of each other.\n",
+ "# For example, molecule_ids 241905 and 1497 appear to be the same molecule."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "e28921c8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import psycopg2\n",
+ "import pandas as pd\n",
+ "from rdkit import Chem"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "214277f9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from tqdm import tqdm"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "a128c9e3",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_556/3175014960.py:16: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n",
+ " df = pd.read_sql_query(query, connection)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Load the molecule table from the local PostgreSQL instance into `df`.\n",
+ "# You must have the DB container running to run this cell successfully.\n",
+ "import os\n",
+ "\n",
+ "# Connection parameters. Each value can be overridden through the standard libpq\n",
+ "# environment variable so real credentials never need to be committed with the\n",
+ "# notebook; the fallbacks are the values this notebook originally hard-coded.\n",
+ "db_params = {\n",
+ "    'dbname': os.environ.get('PGDATABASE', 'postgres'),\n",
+ "    'user': os.environ.get('PGUSER', 'postgres'),\n",
+ "    'password': os.environ.get('PGPASSWORD', ''),\n",
+ "    'host': os.environ.get('PGHOST', '127.0.0.1'),\n",
+ "    'port': os.environ.get('PGPORT', '5432')\n",
+ "}\n",
+ "\n",
+ "# SQL statement to execute\n",
+ "query = \"SELECT molecule_id, smiles, molecular_weight FROM molecule\"\n",
+ "\n",
+ "# Establish a connection to the PostgreSQL database.\n",
+ "# NOTE: psycopg2's `with connection:` only scopes a transaction and does not close\n",
+ "# the connection, so close it explicitly in `finally` even when the query raises.\n",
+ "connection = psycopg2.connect(**db_params)\n",
+ "try:\n",
+ "    df = pd.read_sql_query(query, connection)\n",
+ "finally:\n",
+ "    connection.close()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "686a1a35",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " molecule_id | \n",
+ " smiles | \n",
+ " molecular_weight | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 331406 | \n",
+ " COc1cccc(c1c1ccccc1P(c1ccccc1)c1ccccc1)OC | \n",
+ " 398.441986 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 140360 | \n",
+ " COc1ccc(P(c2ccccc2SC)c2ccccc2SC)c(C)c1 | \n",
+ " 398.532990 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 331409 | \n",
+ " C1CCC(CC1)P(c1ccccc1)C1CCCCC1 | \n",
+ " 274.388000 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2027 | \n",
+ " CN(c1ccccc1c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12C... | \n",
+ " 497.707001 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2036 | \n",
+ " CCC1(CC)O[C@@H]2[C@@H](O1)C(c1cc(C(C)(C)C)cc(C... | \n",
+ " 1049.558960 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 330962 | \n",
+ " 608 | \n",
+ " Cc1cc(C)cc(P(c2cc(C)cc(C)c2)c2ccccc2C=O)c1 | \n",
+ " 346.410004 | \n",
+ "
\n",
+ " \n",
+ " 330963 | \n",
+ " 461 | \n",
+ " CCO[Si](CCP(c1ccccc1)c1ccccc1)(OCC)OCC | \n",
+ " 376.509003 | \n",
+ "
\n",
+ " \n",
+ " 330964 | \n",
+ " 1064 | \n",
+ " Cc1c(C)n(C(C)C)c(=NP(N=c2n(C(C)C)c(C)c(C)n2C(C... | \n",
+ " 462.666992 | \n",
+ "
\n",
+ " \n",
+ " 330965 | \n",
+ " 523 | \n",
+ " CN(C)/N=C/c1ccc(P(c2ccc(/C=N/N(C)C)s2)c2ccc(/C... | \n",
+ " 490.664001 | \n",
+ "
\n",
+ " \n",
+ " 330966 | \n",
+ " 1817 | \n",
+ " COc1cccc(P(c2cccc(OC)c2OC)c2cccc(OC)c2OC)c1OC | \n",
+ " 442.447998 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
330967 rows × 3 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " molecule_id smiles \\\n",
+ "0 331406 COc1cccc(c1c1ccccc1P(c1ccccc1)c1ccccc1)OC \n",
+ "1 140360 COc1ccc(P(c2ccccc2SC)c2ccccc2SC)c(C)c1 \n",
+ "2 331409 C1CCC(CC1)P(c1ccccc1)C1CCCCC1 \n",
+ "3 2027 CN(c1ccccc1c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12C... \n",
+ "4 2036 CCC1(CC)O[C@@H]2[C@@H](O1)C(c1cc(C(C)(C)C)cc(C... \n",
+ "... ... ... \n",
+ "330962 608 Cc1cc(C)cc(P(c2cc(C)cc(C)c2)c2ccccc2C=O)c1 \n",
+ "330963 461 CCO[Si](CCP(c1ccccc1)c1ccccc1)(OCC)OCC \n",
+ "330964 1064 Cc1c(C)n(C(C)C)c(=NP(N=c2n(C(C)C)c(C)c(C)n2C(C... \n",
+ "330965 523 CN(C)/N=C/c1ccc(P(c2ccc(/C=N/N(C)C)s2)c2ccc(/C... \n",
+ "330966 1817 COc1cccc(P(c2cccc(OC)c2OC)c2cccc(OC)c2OC)c1OC \n",
+ "\n",
+ " molecular_weight \n",
+ "0 398.441986 \n",
+ "1 398.532990 \n",
+ "2 274.388000 \n",
+ "3 497.707001 \n",
+ "4 1049.558960 \n",
+ "... ... \n",
+ "330962 346.410004 \n",
+ "330963 376.509003 \n",
+ "330964 462.666992 \n",
+ "330965 490.664001 \n",
+ "330966 442.447998 \n",
+ "\n",
+ "[330967 rows x 3 columns]"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "149e8d9b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " molecule_id | \n",
+ " smiles | \n",
+ " molecular_weight | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 143223 | \n",
+ " 241905 | \n",
+ " [H]P([H])C | \n",
+ " 48.025002 | \n",
+ "
\n",
+ " \n",
+ " 329868 | \n",
+ " 1497 | \n",
+ " CP | \n",
+ " 48.025002 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " molecule_id smiles molecular_weight\n",
+ "143223 241905 [H]P([H])C 48.025002\n",
+ "329868 1497 CP 48.025002"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Sanity check: look up the two suspected-duplicate records by molecule_id\n",
+ "df[df[\"molecule_id\"].isin([241905, 1497])]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "9596cb12",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# These two records describe the same molecule, so check that RDKit returns the\n",
+ "# same SMILES string for both of them after canonicalization.\n",
+ "mol_241905 = df[df[\"molecule_id\"]==241905][\"smiles\"].to_list()[0]\n",
+ "mol_1497 = df[df[\"molecule_id\"]==1497][\"smiles\"].to_list()[0]\n",
+ "\n",
+ "# Double check with the molecular weight: the two stored weights should differ by\n",
+ "# less than a small tolerance (1e-6). `.to_list()` is used so the values are plain\n",
+ "# Python floats.\n",
+ "mol_241905_weight = df[df[\"molecule_id\"]==241905][\"molecular_weight\"].to_list()[0]\n",
+ "mol_1497_weight = df[df[\"molecule_id\"]==1497][\"molecular_weight\"].to_list()[0]\n",
+ "\n",
+ "# Canonicalize each record's own SMILES. Both calls used to be given mol_241905,\n",
+ "# which made the later `a == b` comparison trivially True.\n",
+ "a = Chem.CanonSmiles(mol_241905)\n",
+ "b = Chem.CanonSmiles(mol_1497)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "9c94d293",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CP 48.025001525878906\n",
+ "CP 48.025001525878906\n",
+ "True\n",
+ "True\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Canonical SMILES next to the stored molecular weight, one line per record\n",
+ "for canon_smiles, stored_weight in ((a, mol_241905_weight), (b, mol_1497_weight)):\n",
+ "    print(canon_smiles, stored_weight)\n",
+ "# True when both records canonicalize to the same SMILES string\n",
+ "print(a == b)\n",
+ "# True when the stored weights agree to within 1e-6\n",
+ "print(abs(mol_241905_weight - mol_1497_weight) < 1e-6)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "5cca5c81",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "all_data_dict = df.to_dict(orient=\"records\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "id": "7ea5a56b",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 0%| | 0/330967 [00:00, ?it/s][23:43:15] Explicit valence for atom # 7 B, 6, is greater than permitted\n",
+ " 0%|▏ | 653/330967 [00:00<01:38, 3339.89it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "c1ccc(cc1)P([B]1234[BH]567[BH]891[BH]1%102[BH]2%114[BH]435[CH]357[BH]768[CH]691[BH]1%102[BH]%1143[BH]5761)c1ccccc1\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 1%|▌ | 2331/330967 [00:00<01:20, 4099.64it/s][23:43:15] Explicit valence for atom # 7 C, 6, is greater than permitted\n",
+ " 1%|▋ | 3150/330967 [00:00<01:24, 3897.22it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "c1ccc(cc1)P([C]1234[BH]567[BH]891[BH]1%103[BH]3%112[BH]245[BH]456[CH]678[BH]791[BH]1%10%11[BH]324[BH]5671)c1ccccc1\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 2%|█▎ | 5580/330967 [00:01<01:22, 3940.69it/s][23:43:16] Explicit valence for atom # 7 B, 6, is greater than permitted\n",
+ "[23:43:16] Explicit valence for atom # 7 C, 6, is greater than permitted\n",
+ " 2%|█▍ | 6369/330967 [00:01<01:23, 3905.70it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "C1CCC(CC1)P([B]1234[BH]567[BH]891[BH]1%102[BH]2%114[BH]435[CH]357[BH]768[CH]691[BH]1%102[BH]%1143[BH]5761)C1CCCCC1\n",
+ "c1ccc(cc1)P([C]1234[CH]567[BH]893[BH]3%102[BH]2%111[BH]145[BH]456[BH]678[BH]79%10[BH]83%11[BH]214[BH]5678)c1ccccc1\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|█████████████████████████████████████████████████████████████████████████| 330967/330967 [02:02<00:00, 2711.21it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Group records by their CANONICAL SMILES.\n",
+ "# `hashmap` maps each canonical SMILES to the first record seen with it, stored as a\n",
+ "# (molecule_id, original smiles, molecular_weight) tuple. Every later record that\n",
+ "# canonicalizes to the same string is appended to `duplicates` as a\n",
+ "# (first_seen_record, repeated_record) pair. The molecular weight is carried along\n",
+ "# for inspection only; it is not compared here.\n",
+ "hashmap = {}\n",
+ "duplicates = []\n",
+ "failed_smiles = []  # SMILES that could not be canonicalized; deal with them later\n",
+ "for entry in tqdm(all_data_dict):\n",
+ "    record = (entry[\"molecule_id\"], entry[\"smiles\"], entry[\"molecular_weight\"])\n",
+ "    try:\n",
+ "        # Canonicalize smiles\n",
+ "        canonical_smile = Chem.CanonSmiles(entry[\"smiles\"])\n",
+ "    except Exception:\n",
+ "        # Catch Exception rather than using a bare `except:` so that interrupting\n",
+ "        # this long-running loop (KeyboardInterrupt) is not silently swallowed.\n",
+ "        print(entry[\"smiles\"])\n",
+ "        failed_smiles.append(entry[\"smiles\"])\n",
+ "        continue\n",
+ "    if canonical_smile in hashmap:\n",
+ "        # Pair the first record seen for this canonical SMILES with the repeat\n",
+ "        duplicates.append((hashmap[canonical_smile], record))\n",
+ "    else:\n",
+ "        hashmap[canonical_smile] = record"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "id": "361ec9af",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "78"
+ ]
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(duplicates)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "id": "158c8d2c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[((1519, 'COP', 64.02400207519531),\n",
+ " (241851, '[H]P([H])OC', 64.02400207519531)),\n",
+ " ((241739, '[H]P([H])C(C)C', 76.0790023803711),\n",
+ " (1495, 'CC(C)P', 76.0790023803711)),\n",
+ " ((2045, 'C1CC2CCCC(C1)P2B1Nc2ccccc2c2c1cccc2', 319.1969909667969),\n",
+ " (1006, 'c1ccc2c(c1)NB(P1C3CCCC1CCC3)c1ccccc1-2', 319.1969909667969)),\n",
+ " ((1521, 'CSP', 80.09200286865234),\n",
+ " (241909, '[H]P([H])SC', 80.09200286865234)),\n",
+ " ((2063,\n",
+ " 'COC1=CC=C(OC)C(C2=C(C(C)C)C=C(C(C)C)C=C2C(C)C)=C1P3C(C)(C)CC4(OCCO4)CC3(C)C',\n",
+ " 554.7520141601562),\n",
+ " (1900,\n",
+ " 'COc1ccc(OC)c(P2C(C)(C)CC3(CC2(C)C)OCCO3)c1-c1c(C(C)C)cc(C(C)C)cc1C(C)C',\n",
+ " 554.7520141601562)),\n",
+ " ((1494, 'CC(C)PC(C)C', 118.16000366210938),\n",
+ " (41951, '[H]P(C(C)C)C(C)C', 118.16000366210938)),\n",
+ " ((2054, 'PC12CC3CC(C2)CC(C1)C3', 168.22000122070312),\n",
+ " (1296, 'PC12CC3CC(CC(C3)C1)C2', 168.22000122070312)),\n",
+ " ((820,\n",
+ " 'COC1=[C@@](C2=CC=CC=C2C=C1)[C@@]3=C4C(C=CC=C4)=CC=C3P(OC5=CC=C(C=CC=C6)C6=[C@]57)OC8=[C@]7C(C=CC=C9)=C9C=C8',\n",
+ " 598.6380004882812),\n",
+ " (821,\n",
+ " 'COC1=[C@@](C2=CC=CC=C2C=C1)[C@@]3=C4C(C=CC=C4)=CC=C3P(OC5=CC=C(C=CC=C6)C6=[C@@]57)OC8=[C@@]7C(C=CC=C9)=C9C=C8',\n",
+ " 598.6380004882812)),\n",
+ " ((2053, 'C1C2CC3CC1CC(C2)(C3)PC12CC3CC(C2)CC(C1)C3', 302.4419860839844),\n",
+ " (1295, 'C1C2CC3CC1CC(PC14CC5CC(CC(C5)C1)C4)(C2)C3', 302.4419860839844)),\n",
+ " ((213971, '[H]P(F)F', 69.97799682617188), (1293, 'FPF', 69.97799682617188)),\n",
+ " ((150328, '[H]P([Si](C)(C)C)[Si](C)(C)C', 178.36399841308594),\n",
+ " (1520, 'C[Si](C)(C)P[Si](C)(C)C', 178.36399841308594)),\n",
+ " ((2020,\n",
+ " 'c1ccc(cc1)c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+ " 454.63800048828125),\n",
+ " (306,\n",
+ " 'c1ccc(-c2ccccc2P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)cc1',\n",
+ " 454.63800048828125)),\n",
+ " ((2053, 'C1C2CC3CC1CC(C2)(C3)PC12CC3CC(C2)CC(C1)C3', 302.4419860839844),\n",
+ " (252420,\n",
+ " '[H]P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+ " 302.4419860839844)),\n",
+ " ((1491, 'Pc1ccco1', 100.05699920654297),\n",
+ " (242228, '[H]P([H])c1ccco1', 100.05699920654297)),\n",
+ " ((2033,\n",
+ " 'CCCCc1c(F)c(F)c(c(c1F)F)c1c(cc(c(c1C(C)C)c1cccc(c1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)OC)C(C)C)C(C)C',\n",
+ " 815.072998046875),\n",
+ " (369,\n",
+ " 'CCCCc1c(F)c(F)c(-c2c(C(C)C)cc(C(C)C)c(-c3cccc(OC)c3P(C34CC5CC(CC(C5)C3)C4)C34CC5CC(CC(C5)C3)C4)c2C(C)C)c(F)c1F',\n",
+ " 815.072998046875)),\n",
+ " ((1297, 'CC(C)(C)PC(C)(C)C', 146.21400451660156),\n",
+ " (20221, '[H]P(C(C)(C)C)C(C)(C)C', 146.21400451660156)),\n",
+ " ((2044, 'CC(P(C(C)(C)C)C1C2CC3CC1CC(C2)C3)(C)C', 280.4360046386719),\n",
+ " (952, 'CC(C)(C)P(C1C2CC3CC(C2)CC1C3)C(C)(C)C', 280.4360046386719)),\n",
+ " ((331415, 'P(CCCCCCCC)(CCCCCCCC)CCCCCCCC', 370.64599609375),\n",
+ " (239, 'CCCCCCCCP(CCCCCCCC)CCCCCCCC', 370.64599609375)),\n",
+ " ((1493, 'CCP', 62.051998138427734),\n",
+ " (241815, '[H]P([H])CC', 62.051998138427734)),\n",
+ " ((1518, 'CN(C)PN(C)C', 120.13600158691406),\n",
+ " (91817, '[H]P(N(C)C)N(C)C', 120.13600158691406)),\n",
+ " ((1294, 'FP', 51.987998962402344),\n",
+ " (242039, '[H]P([H])F', 51.987998962402344)),\n",
+ " ((2038, 'C1C2CC3CC1CC(C2)(C3)P(c1ccccc1)c1ccccc1', 320.4159851074219),\n",
+ " (576, 'c1ccc(P(c2ccccc2)C23CC4CC(CC(C4)C2)C3)cc1', 320.4159851074219)),\n",
+ " ((1487, 'Pc1ccc2c(c1)CCCC2', 164.18800354003906),\n",
+ " (242188, '[H]P([H])c1ccc2c(c1)CCCC2', 164.18800354003906)),\n",
+ " ((2034,\n",
+ " 'O1CCN(CC1)c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+ " 463.64599609375),\n",
+ " (371,\n",
+ " 'c1ccc(P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)c(N2CCOCC2)c1',\n",
+ " 463.64599609375)),\n",
+ " ((331412, 'CC(C=CC=C1C)=C1P(C2=CC=CC=C2)C3=CC=CC=C3', 290.34600830078125),\n",
+ " (203, 'Cc1cccc(C)c1P(c1ccccc1)c1ccccc1', 290.34600830078125)),\n",
+ " ((2043, 'CC(P(C12CC3CC(C2)CC(C1)C3)C(C)(C)C)(C)C', 280.4360046386719),\n",
+ " (951, 'CC(C)(C)P(C(C)(C)C)C12CC3CC(CC(C3)C1)C2', 280.4360046386719)),\n",
+ " ((1492, 'CCPCC', 90.10600280761719),\n",
+ " (85517, '[H]P(CC)CC', 90.10600280761719)),\n",
+ " ((1488, 'PC12c3ccccc3C(c3ccccc31)c1ccccc12', 286.3139953613281),\n",
+ " (242107, '[H]P([H])C12c3ccccc3C(c3ccccc31)c1ccccc12', 286.3139953613281)),\n",
+ " ((2064,\n",
+ " 'COC1=CC=C(C2=C(OC)C=CC3=C2C=CC=C3)C(OC)=C1P(C4CCCCC4)C5CCCCC5',\n",
+ " 490.6239929199219),\n",
+ " (1901,\n",
+ " 'COc1ccc(-c2c(OC)ccc3ccccc23)c(OC)c1P(C1CCCCC1)C1CCCCC1',\n",
+ " 490.6239929199219)),\n",
+ " ((331406, 'COc1cccc(c1c1ccccc1P(c1ccccc1)c1ccccc1)OC', 398.4419860839844),\n",
+ " (7, 'COc1cccc(OC)c1-c1ccccc1P(c1ccccc1)c1ccccc1', 398.4419860839844)),\n",
+ " ((1485, 'Pc1ccc2ccccc2c1', 160.156005859375),\n",
+ " (242192, '[H]P([H])c1ccc2ccccc2c1', 160.156005859375)),\n",
+ " ((1496, 'CPC', 62.051998138427734), (137130, '[H]P(C)C', 62.051998138427734)),\n",
+ " ((2052,\n",
+ " 'CC(c1cccc(c1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)C(C)C)C',\n",
+ " 462.7019958496094),\n",
+ " (1274,\n",
+ " 'CC(C)c1cccc(C(C)C)c1P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+ " 462.7019958496094)),\n",
+ " ((2025,\n",
+ " 'O=C(P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)c1cccc2c1cccc2',\n",
+ " 456.6099853515625),\n",
+ " (348,\n",
+ " 'O=C(c1cccc2ccccc12)P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+ " 456.6099853515625)),\n",
+ " ((575, 'Pc1ccccc1', 110.09600067138672),\n",
+ " (242207, '[H]P([H])c1ccccc1', 110.09600067138672)),\n",
+ " ((2026,\n",
+ " 'CC([Si](C(C)C)(C(C)C)Oc1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)C',\n",
+ " 550.8839721679688),\n",
+ " (349,\n",
+ " 'CC(C)[Si](Oc1ccccc1P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2)(C(C)C)C(C)C',\n",
+ " 550.8839721679688)),\n",
+ " ((2054, 'PC12CC3CC(C2)CC(C1)C3', 168.22000122070312),\n",
+ " (242106, '[H]P([H])C12CC3CC(CC(C3)C1)C2', 168.22000122070312)),\n",
+ " ((2048, 'COCCP(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3', 360.5220031738281),\n",
+ " (1140,\n",
+ " 'COCCP(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+ " 360.5220031738281)),\n",
+ " ((2051, 'O=Cc1ccc(cc1)P(c1ccc(cc1)C=O)C1C2CC3CC1CC(C2)C3', 376.4360046386719),\n",
+ " (1241,\n",
+ " 'O=Cc1ccc(P(c2ccc(C=O)cc2)C2C3CC4CC(C3)CC2C4)cc1',\n",
+ " 376.4360046386719)),\n",
+ " ((2024,\n",
+ " 'COc1ccc(c(c1c1c(cc(cc1C(C)C)C(C)C)C(C)C)P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)OC',\n",
+ " 640.9329833984375),\n",
+ " (347,\n",
+ " 'COc1ccc(OC)c(P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)c1-c1c(C(C)C)cc(C(C)C)cc1C(C)C',\n",
+ " 640.9329833984375)),\n",
+ " ((2047,\n",
+ " 'COc1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+ " 408.5660095214844),\n",
+ " (1137,\n",
+ " 'COc1ccccc1P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+ " 408.5660095214844)),\n",
+ " ((2016,\n",
+ " 'C1C2CC3CC1CC(C2)(C3)P(C12CC3CC(C2)CC(C1)C3)Cc1ccccc1',\n",
+ " 392.5669860839844),\n",
+ " (64,\n",
+ " 'c1ccc(CP(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)cc1',\n",
+ " 392.5669860839844)),\n",
+ " ((2022,\n",
+ " 'c1ccc(cc1)c1nn(c(c1n1nccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)c1ccccc1)c1ccccc1',\n",
+ " 662.8619995117188),\n",
+ " (340,\n",
+ " 'c1ccc(-c2nn(-c3ccccc3)c(-c3ccccc3)c2-n2nccc2P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)cc1',\n",
+ " 662.8619995117188)),\n",
+ " ((2014,\n",
+ " 'C1C2CC3CC1CC(C2)(C3)P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+ " 436.66400146484375),\n",
+ " (6,\n",
+ " 'C1C2CC3CC1CC(P(C14CC5CC(CC(C5)C1)C4)C14CC5CC(CC(C5)C1)C4)(C2)C3',\n",
+ " 436.66400146484375)),\n",
+ " ((2037,\n",
+ " 'CN(c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)C',\n",
+ " 421.6090087890625),\n",
+ " (527,\n",
+ " 'CN(C)c1ccccc1P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+ " 421.6090087890625)),\n",
+ " ((1490, 'Pc1cccs1', 116.125), (242229, '[H]P([H])c1cccs1', 116.125)),\n",
+ " ((331421, 'CCO[Si](OCC)(OCC)CCCP(c1ccccc1)c2ccccc2', 390.5360107421875),\n",
+ " (728, 'CCO[Si](CCCP(c1ccccc1)c1ccccc1)(OCC)OCC', 390.5360107421875)),\n",
+ " ((1486, 'Pc1cccc2ccccc12', 160.156005859375),\n",
+ " (242206, '[H]P([H])c1cccc2ccccc12', 160.156005859375)),\n",
+ " ((2021,\n",
+ " 'C1CCN(CC1)c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+ " 461.67401123046875),\n",
+ " (338,\n",
+ " 'c1ccc(P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)c(N2CCCCC2)c1',\n",
+ " 461.67401123046875)),\n",
+ " ((2015,\n",
+ " 'CCCCP(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+ " 358.54998779296875),\n",
+ " (10, 'CCCCP(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2', 358.54998779296875)),\n",
+ " ((2065,\n",
+ " 'C12=CC=CC=C1C(C3=C(C4=C(P(C5CCCCC5)C6CCCCC6)C=CC=C4)C(C=CC=C7)=C7C=C3)=C8C(C=CC=C8)=C2',\n",
+ " 576.7639770507812),\n",
+ " (1902,\n",
+ " 'c1ccc(P(C2CCCCC2)C2CCCCC2)c(-c2c(-c3c4ccccc4cc4ccccc34)ccc3ccccc23)c1',\n",
+ " 576.7639770507812)),\n",
+ " ((2027,\n",
+ " 'CN(c1ccccc1c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)C',\n",
+ " 497.7070007324219),\n",
+ " (350,\n",
+ " 'CN(C)c1ccccc1-c1ccccc1P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+ " 497.7070007324219)),\n",
+ " ((331409, 'C1CCC(CC1)P(c1ccccc1)C1CCCCC1', 274.38800048828125),\n",
+ " (68, 'c1ccc(P(C2CCCCC2)C2CCCCC2)cc1', 274.38800048828125)),\n",
+ " ((2017,\n",
+ " 'O=C(P(C12CC3CC(C2)CC(C1)C3)C(=O)Nc1ccccc1)Nc1ccccc1',\n",
+ " 406.46600341796875),\n",
+ " (259,\n",
+ " 'O=C(Nc1ccccc1)P(C(=O)Nc1ccccc1)C12CC3CC(CC(C3)C1)C2',\n",
+ " 406.46600341796875)),\n",
+ " ((2023,\n",
+ " 'O=C(P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)c1ccccc1',\n",
+ " 406.54998779296875),\n",
+ " (346,\n",
+ " 'O=C(c1ccccc1)P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+ " 406.54998779296875)),\n",
+ " ((574, 'c1ccc(Pc2ccccc2)cc1', 186.19400024414062),\n",
+ " (310083, '[H]P(c1ccccc1)c1ccccc1', 186.19400024414062)),\n",
+ " ((42, 'CC(C)(C)P(c1ccccc1-c1ccccc1)C(C)(C)C', 298.4100036621094),\n",
+ " (331407, 'CC(P(C(C)(C)C)c1ccccc1c1ccccc1)(C)C', 298.4100036621094)),\n",
+ " ((103,\n",
+ " 'COc1ccc(C)c(-c2c(C(C)C)cc(C(C)C)cc2C(C)C)c1P(C(C)(C)C)C(C)(C)C',\n",
+ " 468.70599365234375),\n",
+ " (331410,\n",
+ " 'COc1ccc(c(c1P(C(C)(C)C)C(C)(C)C)c1c(cc(cc1C(C)C)C(C)C)C(C)C)C',\n",
+ " 468.70599365234375)),\n",
+ " ((116, 'CN(C)c1ccc(P(c2ccccc2)c2ccccc2)cc1', 305.3609924316406),\n",
+ " (331411, 'CN(c1ccc(cc1)P(c1ccccc1)c1ccccc1)C', 305.3609924316406)),\n",
+ " ((246,\n",
+ " 'COc1c(C)cc(P(c2cc(C)c(OC)c(C)c2)c2cc(C)c(OC)c(C)c2)cc1C',\n",
+ " 436.5320129394531),\n",
+ " (331416,\n",
+ " 'P(c1cc(c(c(c1)C)OC)C)(c1cc(c(c(c1)C)OC)C)c1cc(c(c(c1)C)OC)C',\n",
+ " 436.5320129394531)),\n",
+ " ((234,\n",
+ " 'Fc1c(F)c(F)c(P(c2ccccc2)c2c(F)c(F)c(F)c(F)c2F)c(F)c1F',\n",
+ " 442.1919860839844),\n",
+ " (331414,\n",
+ " 'Fc1c(c(c(c(c1F)F)P(c1c(c(c(c(c1F)F)F)F)F)c1ccccc1)F)F',\n",
+ " 442.1919860839844)),\n",
+ " ((356, 'C/C=C/CP(C(C)(C)C)C(C)(C)C', 200.30599975585938),\n",
+ " (331419, 'P(C(C)(C)C)(C(C)(C)C)C/C=C/C', 200.30599975585938)),\n",
+ " ((487,\n",
+ " 'FC(F)(F)C(F)(F)C(F)(F)C(Cc1ccc(P(c2ccccc2)c2ccc(CC(C(F)(F)F)(C(F)(F)F)C(F)(F)C(F)(F)C(F)(F)F)cc2)cc1)(C(F)(F)F)C(F)(F)F',\n",
+ " 926.4099731445312),\n",
+ " (331420,\n",
+ " 'FC(F)(F)C(F)(F)C(F)(F)C(C(F)(F)F)(C(F)(F)F)Cc1ccc(cc1)P(c3ccc(cc3)CC(C(F)(F)F)(C(F)(F)F)C(F)(F)C(F)(F)C(F)(F)F)c2ccccc2',\n",
+ " 926.4099731445312)),\n",
+ " ((216, 'CN(C)c1ccc(P(C(C)(C)C)C(C)(C)C)cc1', 265.3810119628906),\n",
+ " (331413, 'CC(P(C1=CC=C(N(C)C)C=C1)C(C)(C)C)(C)C', 265.3810119628906)),\n",
+ " ((298, 'c1ccc(P2Cc3cccc4c3C3(CC4)CCc4cccc(c43)C2)cc1', 354.4330139160156),\n",
+ " (331417, 'P1(Cc2c3c(ccc2)CCC23CCc3c2c(ccc3)C1)c1ccccc1', 354.4330139160156)),\n",
+ " ((320, 'COc1cc(OC)c(-c2ccccc2P(C2CCCCC2)C2CCCCC2)c(OC)c1', 440.5639953613281),\n",
+ " (331418,\n",
+ " 'P(C1CCCCC1)(C1CCCCC1)c1c(cccc1)c1c(cc(cc1OC)OC)OC',\n",
+ " 440.5639953613281)),\n",
+ " ((3, 'COc1cccc(OC)c1-c1ccccc1P(C1CCCCC1)C1CCCCC1', 410.5379943847656),\n",
+ " (331405, 'COc1cccc(c1c1ccccc1P(C1CCCCC1)C1CCCCC1)OC', 410.5379943847656)),\n",
+ " ((60,\n",
+ " 'Cc1cc(C)c(P(c2c(C)cc(C)cc2C)c2c(C)cc(C)cc2C)c(C)c1',\n",
+ " 388.5350036621094),\n",
+ " (331408,\n",
+ " 'Cc1cc(C)cc(c1P(c1c(C)cc(cc1C)C)c1c(C)cc(cc1C)C)C',\n",
+ " 388.5350036621094)),\n",
+ " ((241905, '[H]P([H])C', 48.025001525878906),\n",
+ " (1497, 'CP', 48.025001525878906)),\n",
+ " ((771,\n",
+ " 'c1ccc(P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)cc1',\n",
+ " 378.5400085449219),\n",
+ " (2040,\n",
+ " 'C1C2CC3CC1CC(C2)(C3)P(C12CC3CC(C2)CC(C1)C3)c1ccccc1',\n",
+ " 378.5400085449219)),\n",
+ " ((779,\n",
+ " 'c1ccc(-n2cccc2P(C23CC4CC(CC(C4)C2)C3)C23CC4CC(CC(C4)C2)C3)cc1',\n",
+ " 443.614990234375),\n",
+ " (2041,\n",
+ " 'c1ccc(cc1)n1cccc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+ " 443.614990234375)),\n",
+ " ((729, 'CCP(CC)c1ccccc1-n1c2ccccc2c2ccccc21', 331.39898681640625),\n",
+ " (331422, 'CCP(CC)c1ccccc1n2c3ccccc3c4c2cccc4', 331.39898681640625)),\n",
+ " ((417,\n",
+ " 'CC(C)(C)P(C12CC3CC(CC(C3)C1)C2)C12CC3CC(CC(C3)C1)C2',\n",
+ " 358.54998779296875),\n",
+ " (2035,\n",
+ " 'CC(P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3)(C)C',\n",
+ " 358.54998779296875)),\n",
+ " ((783,\n",
+ " 'c1ccc(-n2c(P(C34CC5CC(CC(C5)C3)C4)C34CC5CC(CC(C5)C3)C4)cc3ccccc32)cc1',\n",
+ " 493.67498779296875),\n",
+ " (2042,\n",
+ " 'c1ccc(cc1)n1c2ccccc2cc1P(C12CC3CC(C2)CC(C1)C3)C12CC3CC(C2)CC(C1)C3',\n",
+ " 493.67498779296875)),\n",
+ " ((1201,\n",
+ " 'Cc1cc(C)c(-c2cccc3c2CP(C2CCCCC2)Cc2c-3cccc2-c2c(C)cc(C)cc2C)c(C)c1',\n",
+ " 530.7360229492188),\n",
+ " (2049,\n",
+ " 'Cc1cc(C)cc(c1c1cccc2c1CP(Cc1c2cccc1c1c(C)cc(cc1C)C)C1CCCCC1)C',\n",
+ " 530.7360229492188)),\n",
+ " ((1202,\n",
+ " 'Cc1cc(C)c(-c2cccc3c2CP(C(C)C)Cc2c-3cccc2-c2c(C)cc(C)cc2C)c(C)c1',\n",
+ " 490.6709899902344),\n",
+ " (2050,\n",
+ " 'CC(P1Cc2c(c3c(C1)c(ccc3)c1c(C)cc(cc1C)C)cccc2c1c(C)cc(cc1C)C)C',\n",
+ " 490.6709899902344)),\n",
+ " ((241701, '[H]P([H])C(C)(C)C', 90.10600280761719),\n",
+ " (1298, 'CC(C)(C)P', 90.10600280761719)),\n",
+ " ((242088, '[H]P([H])[H]', 33.99800109863281), (1299, 'P', 33.99800109863281))]"
+ ]
+ },
+ "execution_count": 44,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "duplicates"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "1f1bd039",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Save the object\n",
+ "import pickle"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "id": "3c5ac97b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Persist the list of duplicate pairs to disk\n",
+ "with open('duplicates.pkl', 'wb') as pkl_out:\n",
+ "    pickle.dump(duplicates, pkl_out)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a187e06d",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/duplicates.pkl b/duplicates.pkl
new file mode 100644
index 0000000..a878a6a
Binary files /dev/null and b/duplicates.pkl differ