This repository has been archived by the owner on Apr 26, 2023. It is now read-only.
forked from zhenglz/onionnet
-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate_features.py
331 lines (257 loc) · 10.1 KB
/
generate_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
#!/usr/bin/env python
import numpy as np
import pandas as pd
import mdtraj as mt
import math
import itertools
import sys
from collections import OrderedDict
import argparse
from argparse import RawDescriptionHelpFormatter
import multiprocessing as mp
ALL_ELEMENTS = ["H", "C", "O", "N", "P", "S", "HAX", "DU"]
class AtomTypeCounts(object):
"""Featurization of Protein-Ligand Complex based on
onion-shape distance counts of atom-types.
Parameters
----------
pdb_fn : str
The input pdb file name.
lig_code : str
The ligand residue name in the input pdb file.
Attributes
----------
pdb : mdtraj.Trajectory
The mdtraj.trajectory object containing the pdb.
receptor_indices : np.ndarray
The receptor (protein) atom indices in mdtraj.Trajectory
ligand_indices : np.ndarray
The ligand (protein) atom indices in mdtraj.Trajectory
rec_ele : np.ndarray
The element types of each of the atoms in the receptor
lig_ele : np.ndarray
The element types of each of the atoms in the ligand
lig_code : str
The ligand residue name in the input pdb file
pdb_parsed_ : bool
Whether the pdb file has been parsed.
distance_computed : bool
Whether the distances between atoms in receptor and ligand has been computed.
distance_matrix_ : np.ndarray, shape = [ N1 * N2, ]
The distances between all atom pairs
N1 and N2 are the atom numbers in receptor and ligand respectively.
counts_: np.ndarray, shape = [ N1 * N2, ]
The contact numbers between all atom pairs
N1 and N2 are the atom numbers in receptor and ligand respectively.
"""
def __init__(self, pdb_fn, lig_code):
self.pdb = mt.load_pdb(pdb_fn)
self.receptor_indices = np.array([])
self.ligand_indices = np.array([])
self.rec_ele = np.array([])
self.lig_ele = np.array([])
self.lig_code = lig_code
self.pdb_parsed_ = False
self.distance_computed_ = False
self.distance_matrix_ = np.array([])
self.counts_ = np.array([])
def parsePDB(self, rec_sele="protein", lig_sele="UNK"):
"""
Parse PDB file using mdtraj
Parameters
----------
rec_sele: str, default is protein.
The topology selection for the receptor
lig_sele: str, default is resname LIG
The topology selection for the ligand
Returns
-------
self: an instance of itself
"""
top = self.pdb.topology
self.receptor_indices = top.select(rec_sele)
self.ligand_indices = top.select("resname " + lig_sele)
table, bond = top.to_dataframe()
self.rec_ele = table['element'][self.receptor_indices]
self.lig_ele = table['element'][self.ligand_indices]
self.pdb_parsed_ = True
return self
def distance_pairs(self):
"""Calculate all distance pairs between atoms in the receptor and in the ligand
Returns
-------
self: an instance of itself
"""
if not self.pdb_parsed_:
self.parsePDB()
# all combinations of the atom indices from the receptor and the ligand
all_pairs = itertools.product(self.receptor_indices, self.ligand_indices)
# if distance matrix is not calculated
if not self.distance_computed_:
self.distance_matrix_ = mt.compute_distances(self.pdb, atom_pairs=all_pairs)[0]
self.distance_computed_ = True
return self
def switchFuction(self, x, d0=7.0, m=12, n=6):
"""Implement a rational switch function
to enable a smooth transition
Parameters
----------
x : float
the input distance value
d0 : float
the distance cutoff, it is usually 2 times of
the real distance cutoff
m : int
the exponential index of higher order
n : int
the exponential index of lower order
Returns
-------
switched_dist : float
the switched continuous distance
Notes
-----
the function is like:
s= [1 - (x/d0)^6] / [1 - (x/d0)^12]
d0 is a cutoff, should be twice larger than the distance cutoff
"""
count = 0.0
try:
count = (1.0 - math.pow((x / d0), n)) / (1.0 - math.pow((x / d0), m))
except ZeroDivisionError:
print("Divide by zero, ", x, d0)
return count
def cutoff_count(self, cutoff=0.35, switch=False):
"""
Get the atom contact matrix
Parameters
----------
cutoff: float, default is 0.35 angstrom
The distance cuntoff for the contacts
Returns
-------
self: return an instance of itself
"""
# get the inter-molecular atom contacts
if switch:
vect_switch = np.vectorize(self.switchFuction)
self.distance_matrix_ = vect_switch(self.distance_matrix_)
self.counts_ = (self.distance_matrix_ <= cutoff) * 1.0
return self
def get_elementtype(e):
#all_elements = ["H", "C", "O", "N", "P", "S", "Cl", "DU"]
if e in ALL_ELEMENTS:
return e
elif e in ['Cl', 'Br', 'I', 'F']:
return 'HAX'
else:
return "DU"
def generate_features(complex_fn, lig_code, ncutoffs):
keys = ["_".join(x) for x in list(itertools.product(ALL_ELEMENTS, ALL_ELEMENTS))]
# parse the pdb file and get the atom element information
cplx = AtomTypeCounts(complex_fn, lig_code)
cplx.parsePDB(rec_sele="protein", lig_sele=lig_code)
# element types of all atoms in the proteins and ligands
new_lig = list(map(get_elementtype, cplx.lig_ele))
new_rec = list(map(get_elementtype, cplx.rec_ele))
# the element-type combinations for all atom-atom pairs
rec_lig_element_combines = ["_".join(x) for x in list(itertools.product(new_rec, new_lig))]
cplx.distance_pairs()
counts = []
onion_counts = []
# calculate all contacts for all shells
for i, cutoff in enumerate(ncutoffs):
cplx.cutoff_count(cutoff)
if i == 0:
onion_counts.append(cplx.counts_)
else:
onion_counts.append(cplx.counts_ - counts[-1])
counts.append(cplx.counts_)
results = []
for n in range(len(ncutoffs)):
# count_dict = dict.fromkeys(keys, 0.0)
d = OrderedDict()
d = d.fromkeys(keys, 0.0)
# now sort the atom-pairs and accumulate the element-type to a dict
for e_e, c in zip(rec_lig_element_combines, onion_counts[n]):
d[e_e] += c
results += d.values()
return results, keys
def genfeat_mp(args):
f, l, nc = args
r, e = generate_features(f, l, nc)
print(f)
return r, e
if __name__ == "__main__":
print("Start Now ... ")
d = """
Predicting protein-ligand binding affinities (pKa) with OnionNet model.
Citation: Zheng L, Fan J, Mu Y. arXiv preprint arXiv:1906.02418, 2019.
Author: Liangzhen Zheng ([email protected])
This script is used to generate inter-molecular element-type specific
contact features. Installation instructions should be refered to
https://github.com/zhenglz/onionnet
Examples:
Show help information
python generate_features.py -h
Run the script
python generate_features.py -inp input_samples.dat -out features_samples.csv
# tutorial example
cd tuttorials/PDB_samples
python ../../generate_features.py -inp input_PDB_testing.dat -out PDB_testing_features.csv
"""
parser = argparse.ArgumentParser(description=d, formatter_class=RawDescriptionHelpFormatter)
parser.add_argument("-inp", type=str, default="input.dat",
help="Input. The input file containg the file path of each \n"
"of the protein-ligand complexes files (in pdb format.)\n"
"There should be only 1 column, each row or line containing\n"
"the input file path, relative or absolute path.")
parser.add_argument("-out", type=str, default="output.csv",
help="Output. Default is output.csv \n"
"The output file name containing the features, each sample\n"
"per row. ")
parser.add_argument("-lig", type=str, default="LIG",
help="Input, optional. Default is LIG. \n"
"The ligand molecule residue name (code, 3 characters) in the \n"
"complex pdb file. ")
parser.add_argument("-nt", type=int, default=1, help="Input, optional. Default is 1. Use how many of cpu cores.")
args = parser.parse_args()
if len(sys.argv) < 2:
parser.print_help()
sys.exit(0)
keys = ["_".join(x) for x in list(itertools.product(ALL_ELEMENTS, ALL_ELEMENTS))]
with open(args.inp) as lines:
lines = [x for x in lines if ("#" not in x and len(x.split()) >= 1)].copy()
inputs = [x.split()[0] for x in lines]
# defining the shell structures ... (do not change)
n_shells = 60
n_cutoffs = np.linspace(0.1, 3.1, n_shells)
results = []
ele_pairs = []
# computing the features now ...
l = len(inputs)
lig_code = args.lig
if args.nt <= 1:
for i, fn in enumerate(inputs):
# the main function for featurization ...
r, ele_pairs = generate_features(fn, lig_code, n_cutoffs)
results.append(r)
# success.append(1.)
print(fn, i, l)
else:
pool = mp.Pool(args.nt)
_args = list(zip(inputs, l * [lig_code,], l*[n_cutoffs]))
results_all = pool.map(genfeat_mp, _args)
results = [ x[0] for x in results_all]
# saving features to a file now ...
df = pd.DataFrame(results)
try:
df.index = inputs
except IndexError:
df.index = np.arange(df.shape[0])
col_n = []
for i, n in enumerate(keys * len(n_cutoffs)):
col_n.append(n + "_" + str(i))
df.columns = col_n
df.to_csv(args.out, sep=",", float_format="%.1f", index=True)
print("Feature extraction completed ...... ")