-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSDF2lmdb.py
105 lines (95 loc) · 4.11 KB
/
SDF2lmdb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import multiprocessing
from rdkit import Chem
from tqdm import tqdm
import pickle
import lmdb
import numpy as np
import os
from rdkit.Chem import AllChem
# Make RDKit include every molecule property when a Mol is pickled. Molecules
# read from the SDF are sent to worker processes by multiprocessing (which
# pickles them), and the workers read ID properties from them via GetProp, so
# the SD-file properties need to survive that round trip.
Chem.SetDefaultPickleProperties(Chem.PropertyPickleOptions.AllProps)
def write_lmdb(data, lmdb_path, num):
    """Write records to an LMDB file under consecutive integer keys.

    Each item of ``data`` is pickled and stored under the ASCII-encoded
    decimal string of a running counter that starts at ``num``.

    Args:
        data: Iterable of picklable records.
        lmdb_path: Path of the LMDB database. It is opened with
            ``subdir=False``, i.e. as a single file rather than a directory.
        num: Key to use for the first record.

    Returns:
        The next unused key: ``num`` plus the number of records written.
        Passing it to the next call continues the numbering.
    """
    # map_size=1099511627776 is 2**40 bytes (1 TiB), the maximum size the
    # database may grow to. Using the environment as a context manager closes
    # it on exit, so repeated calls do not accumulate open handles.
    with lmdb.open(
        lmdb_path,
        subdir=False,
        readonly=False,
        lock=False,
        readahead=False,
        meminit=False,
        map_size=1099511627776,
    ) as env:
        # All records go into one write transaction, which is committed when
        # the inner ``with`` block exits without an exception.
        with env.begin(write=True) as txn:
            for record in data:
                txn.put(str(num).encode('ascii'), pickle.dumps(record))
                num += 1
    return num
def gen_conf(mol):
    """Embed and MMFF-optimise a 3D conformation for one molecule.

    Hydrogens are added before embedding and removed again afterwards, so the
    returned coordinates and atom symbols describe the molecule as returned
    by ``Chem.RemoveHs``.

    Args:
        mol: RDKit molecule carrying one of the ID properties read by
            ``getID``.

    Returns:
        A dict with keys ``'coordinates'`` (list of numpy position arrays,
        one per conformer), ``'atoms'`` (list of element symbols), ``'smi'``
        (SMILES string) and ``'IDs'`` (the molecule ID), or ``None`` when no
        conformer was produced or any step raised an exception.
    """
    try:
        hydrogenated = Chem.AddHs(mol)
        # Request a single conformer; everything runs single-threaded because
        # parallelism is handled by the caller's process pool.
        AllChem.EmbedMultipleConfs(
            hydrogenated,
            numConfs=1,
            numThreads=1,
            pruneRmsThresh=1,
            maxAttempts=1000,
            useRandomCoords=False,
        )
        AllChem.MMFFOptimizeMoleculeConfs(hydrogenated, numThreads=1)
        stripped = Chem.RemoveHs(hydrogenated)

        n_confs = stripped.GetNumConformers()
        if n_confs == 0:
            # Embedding failed: nothing to report for this molecule.
            return None

        coords = [
            np.array(stripped.GetConformer(idx).GetPositions())
            for idx in range(n_confs)
        ]
        symbols = [atom.GetSymbol() for atom in stripped.GetAtoms()]
        smiles = Chem.MolToSmiles(stripped)
        mol_id = getID(stripped)
        return {
            'coordinates': coords,
            'atoms': symbols,
            'smi': smiles,
            'IDs': mol_id,
        }
    except Exception as err:
        # Best effort: report the failure and let the caller drop this entry.
        print(f"Error processing molecule: {err}")
        return None
def getID(mol):
    """Return the identifier property stored on a molecule.

    The property names ``'IDNUMBER'``, ``'Catalog ID'`` and ``'ID'`` are tried
    in that order and the value of the first one present is returned.

    Args:
        mol: Object exposing ``GetProp(name)`` that raises ``KeyError`` for a
            missing property (as RDKit molecules do).

    Returns:
        The value of the first ID property found.

    Raises:
        ValueError: If none of the known ID properties is set.
    """
    for id_key in ('IDNUMBER', 'Catalog ID', 'ID'):
        try:
            return mol.GetProp(id_key)
        except KeyError:
            # Property not set on this molecule; try the next candidate name.
            # Only KeyError is swallowed so that interrupts and unrelated
            # errors are not hidden.
            continue
    raise ValueError('No ID found in molecule')
def process_sdf_file(sdf_file, n_cpu=32):
    """Generate conformer records for every molecule in one SDF file.

    Args:
        sdf_file: Path to the SDF file.
        n_cpu: Number of worker processes used for conformer generation.

    Returns:
        A list of the dicts produced by ``gen_conf`` (molecules for which it
        returned ``None`` are dropped), each extended with a ``'subset'`` key
        holding the file's base name up to its first dot.

    Raises:
        ValueError: If the file yields no readable molecules, or if the first
            molecule carries none of the ID properties known to ``getID``.
    """
    subset = os.path.basename(sdf_file).split('.')[0]

    # Read the whole file up front; entries RDKit could not parse come back
    # as None and are skipped.
    suppl = Chem.SDMolSupplier(sdf_file)
    molecules = [mol for mol in suppl if mol is not None]
    if not molecules:
        # Report the real problem instead of letting molecules[0] fail and be
        # mislabelled as a missing ID.
        raise ValueError(f'No readable molecules found in {sdf_file}')

    # Fail fast, before the expensive parallel step, when the file does not
    # use a recognised ID property. getID raises ValueError itself.
    getID(molecules[0])

    # Process the molecules in parallel; imap preserves input order and tqdm
    # shows progress.
    with multiprocessing.Pool(n_cpu) as pool:
        processed = list(tqdm(pool.imap(gen_conf, molecules), total=len(molecules)))

    # Discard molecules that failed and tag the rest with their source subset.
    mol_data = [entry for entry in processed if entry is not None]
    for entry in mol_data:
        entry['subset'] = subset
    return mol_data
if __name__ == "__main__":
    # Command-line entry point: convert every file in a directory of SDF
    # files into one LMDB database.
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('sdf_dir', type=str, help='a directory containing SDF files')
    parser.add_argument('lmdb_path', type=str, help='output lmdb path')
    args = parser.parse_args()
    home = args.sdf_dir
    output = args.lmdb_path
    # Running record key; write_lmdb returns the next free key after each file.
    num = 0
    # Keep regular files only (os.listdir also returns sub-directories, which
    # cannot be opened as SDF files), then process from smallest to largest.
    filelist = [
        name for name in os.listdir(home)
        if os.path.isfile(os.path.join(home, name))
    ]
    filelist.sort(key=lambda x: os.path.getsize(os.path.join(home, x)))
    for f in filelist:
        mol_data = process_sdf_file(os.path.join(home, f))
        num = write_lmdb(mol_data, output, num)
        print(f'Finished processing {f}, {len(mol_data)} molecules added, currently {num} molecules in the LMDB.')

# The bare string below is never read by the code; it is kept as a note.
# NOTE(review): presumably the number of molecules per subset (SDF base name)
# from a previous run - confirm before relying on these figures.
"""
{'ChemBridge_CORE_Library_Stock_Part1_202312': 439762,
'ChemBridge_CORE_Library_Stock_Part2_202312': 431710,
'ChemBridge_EXPRESS_Library_Stock_202312': 501317,
'ChemDiv_3D_Biodiversity_Library': 27658,
'ChemDiv_3D_Diversity_Natural_Product_Like_Library': 17653,
'ChemDiv_3D_Pharm_Dversity_Library': 47486,
'ChemDiv_BMS_300k': 299963,
'ChemDiv_Fast_follow_up_SAR_library': 155636,
'ChemDiv_MCE_18_Trends_Medicinal_Chemistry_Library': 50789,
'ChemDiv_SmartTM_Library': 50213,
'ChemDiv_Soluble_Diversity_Library': 15496,
'ChemDiv_Targeted_Diversity_Library': 39646,
'ChemDiv_diversity_100k': 100000,
'ChemDiv_diversity_150k': 149981,
'ChemDiv_diversity_50k': 50000,
'Enamine_Hit_Locator_Library_HLL460k_20220221': 460129,
'LC_10k_Pre_Plated_Diversity_Set_PS6': 9920,
'LC_15k_Pre_Plated_Diversity_Set_PS4': 15040,
'LC_15k_Pre_Plated_Diversity_Set_PS5': 15040,
'LC_15k_Pre_Plated_Diversity_Set_PS6': 15040,
'LC_50k_Pre_Plated_Diversity_Set_PS7': 50240}
"""