diff --git a/DB_investigation.ipynb b/DB_investigation.ipynb new file mode 100644 index 0000000..025dc89 --- /dev/null +++ b/DB_investigation.ipynb @@ -0,0 +1,798 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "9368bb63", + "metadata": {}, + "outputs": [], + "source": [ + "# Want to start by checking which molecules are duplicates.\n", + "# For example, we have 241905 and 1497" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e28921c8", + "metadata": {}, + "outputs": [], + "source": [ + "import psycopg2\n", + "import pandas as pd\n", + "from rdkit import Chem" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "214277f9", + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm import tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a128c9e3", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_556/3175014960.py:16: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. 
Please consider using SQLAlchemy.\n", + " df = pd.read_sql_query(query, connection)\n" + ] + } + ], + "source": [ + "# Establish a connection\n", + "# You must have the DB container running to run this cell successfully.\n", + "# Connection parameters\n", + "db_params = {\n", + " 'dbname': 'postgres',\n", + " 'user': 'postgres',\n", + " 'password': '',\n", + " 'host': '127.0.0.1',\n", + " 'port': '5432'\n", + "}\n", + "\n", + "# Establish a connection to the PostgreSQL database\n", + "connection = psycopg2.connect(**db_params)\n", + "\n", + "# Execute an SQL statement\n", + "query = \"SELECT molecule_id, smiles, molecular_weight FROM molecule\"\n", + "try:\n", + "    df = pd.read_sql_query(query, connection)\n", + "finally:\n", + "    connection.close()  # always release the connection, even if the query fails" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "686a1a35", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
molecule_idsmilesmolecular_weight
0331406COc1cccc(c1c1ccccc1P(c1ccccc1)c1ccccc1)OC398.441986
1140360COc1ccc(P(c2ccccc2SC)c2ccccc2SC)c(C)c1398.532990
2331409C1CCC(CC1)P(c1ccccc1)C1CCCCC1274.388000
32027CN(c1ccccc1c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12C...497.707001
42036CCC1(CC)O[C@@H]2[C@@H](O1)C(c1cc(C(C)(C)C)cc(C...1049.558960
............
330962608Cc1cc(C)cc(P(c2cc(C)cc(C)c2)c2ccccc2C=O)c1346.410004
330963461CCO[Si](CCP(c1ccccc1)c1ccccc1)(OCC)OCC376.509003
3309641064Cc1c(C)n(C(C)C)c(=NP(N=c2n(C(C)C)c(C)c(C)n2C(C...462.666992
330965523CN(C)/N=C/c1ccc(P(c2ccc(/C=N/N(C)C)s2)c2ccc(/C...490.664001
3309661817COc1cccc(P(c2cccc(OC)c2OC)c2cccc(OC)c2OC)c1OC442.447998
\n", + "

330967 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " molecule_id smiles \\\n", + "0 331406 COc1cccc(c1c1ccccc1P(c1ccccc1)c1ccccc1)OC \n", + "1 140360 COc1ccc(P(c2ccccc2SC)c2ccccc2SC)c(C)c1 \n", + "2 331409 C1CCC(CC1)P(c1ccccc1)C1CCCCC1 \n", + "3 2027 CN(c1ccccc1c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12C... \n", + "4 2036 CCC1(CC)O[C@@H]2[C@@H](O1)C(c1cc(C(C)(C)C)cc(C... \n", + "... ... ... \n", + "330962 608 Cc1cc(C)cc(P(c2cc(C)cc(C)c2)c2ccccc2C=O)c1 \n", + "330963 461 CCO[Si](CCP(c1ccccc1)c1ccccc1)(OCC)OCC \n", + "330964 1064 Cc1c(C)n(C(C)C)c(=NP(N=c2n(C(C)C)c(C)c(C)n2C(C... \n", + "330965 523 CN(C)/N=C/c1ccc(P(c2ccc(/C=N/N(C)C)s2)c2ccc(/C... \n", + "330966 1817 COc1cccc(P(c2cccc(OC)c2OC)c2cccc(OC)c2OC)c1OC \n", + "\n", + " molecular_weight \n", + "0 398.441986 \n", + "1 398.532990 \n", + "2 274.388000 \n", + "3 497.707001 \n", + "4 1049.558960 \n", + "... ... \n", + "330962 346.410004 \n", + "330963 376.509003 \n", + "330964 462.666992 \n", + "330965 490.664001 \n", + "330966 442.447998 \n", + "\n", + "[330967 rows x 3 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "149e8d9b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
molecule_idsmilesmolecular_weight
143223241905[H]P([H])C48.025002
3298681497CP48.025002
\n", + "
" + ], + "text/plain": [ + " molecule_id smiles molecular_weight\n", + "143223 241905 [H]P([H])C 48.025002\n", + "329868 1497 CP 48.025002" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Sanity check to see if the data is correct\n", + "df[(df[\"molecule_id\"]==241905) | (df[\"molecule_id\"]==1497)]" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "9596cb12", + "metadata": {}, + "outputs": [], + "source": [ + "# These two molecules are the same, so let's check whether rdkit returns the same SMILES string when canonicalizing each of them\n", + "mol_241905 = df[df[\"molecule_id\"]==241905][\"smiles\"].to_list()[0]\n", + "mol_1497 = df[df[\"molecule_id\"]==1497][\"smiles\"].to_list()[0]\n", + "\n", + "# Double check with the molecular weight: the difference should be below a tolerance of 1e-6.\n", + "mol_241905_weight = df[df[\"molecule_id\"]==241905][\"molecular_weight\"].to_list()[0]\n", + "mol_1497_weight = df[df[\"molecule_id\"]==1497][\"molecular_weight\"].to_list()[0]\n", + "\n", + "a = Chem.CanonSmiles(mol_241905)\n", + "b = Chem.CanonSmiles(mol_1497)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "9c94d293", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CP 48.025001525878906\n", + "CP 48.025001525878906\n", + "True\n", + "True\n" + ] + } + ], + "source": [ + "print(a, mol_241905_weight)\n", + "print(b, mol_1497_weight)\n", + "print(a == b)\n", + "print(abs(mol_241905_weight - mol_1497_weight) < 0.000001)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "5cca5c81", + "metadata": {}, + "outputs": [], + "source": [ + "all_data_dict = df.to_dict(orient=\"records\")" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "7ea5a56b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/330967 [00:00