-
Notifications
You must be signed in to change notification settings - Fork 0
/
TL_Functions.py
457 lines (408 loc) · 19.2 KB
/
TL_Functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
import xml.etree.ElementTree as ET #For reading XML files
from rdkit import Chem #The module with the RDKit functions we will need
import os #For the function to read in the molecules from the .mol2 file
import numpy as np #For arrays and NumPy function
from math import sqrt, atan2, pi #For calculating dihedral angles
from math import ceil #Ceiling function
# Parse the torsion library XML once, at import time. `root` is the
# module-level element that TL_lookup() searches for "hierarchyClass"
# elements and their "torsionRule" children.
# NOTE: the file name is a relative path, so it is resolved against the
# current working directory of the importing process.
# ElementTree usage follows this guide:
# https://docs.python.org/3/library/xml.etree.elementtree.html
tree = ET.parse("TL_2.1_VERSION_6.xml")
root = tree.getroot()
# We will turn the procedure for estimating torsion strain energy in
# TL_Lookup_Test.py into a function that we can call on every molecule
# in the list, which will return an object of the following class:
class TP_list(object):
    """Parallel lists describing the torsion patterns matched in a molecule.

    Every constructor argument is a list with one entry per torsion pattern;
    entry ``k`` of each list refers to the same pattern.

    Attributes:
        indeces: list of lists of atom indeces, one list per pattern.
        angles: list of angles.
        smarts: list of SMARTS strings.
        hc: list of hierarchy-class strings.
        methods: list of strings naming the energy-estimation method.
        E: list of energy estimates.
        CI_l: list of lower bounds of the 95% CI of the energy estimates.
        CI_u: list of upper bounds of the 95% CI of the energy estimates.
        flags: list of Booleans flagging whether an angle whose method is
            "approximate" is not observed.
        TP_indeces: ``0 .. len(indeces) - 1``; indeces of the torsion
            patterns themselves, not of the atoms.
    """

    def __init__(self, indeces, angles, smarts, hc, methods, E,
                 CI_l, CI_u, flags):
        self.indeces = indeces
        self.angles = angles
        self.smarts = smarts
        self.hc = hc
        self.methods = methods
        self.E = E
        self.CI_l = CI_l
        self.CI_u = CI_u
        self.flags = flags
        # Indeces of the torsion patterns, not the atoms!
        self.TP_indeces = list(range(len(indeces)))

    # Getters and setters (kept so existing callers continue to work):
    def get_indeces(self):
        return self.indeces

    def set_indeces(self, inds):
        self.indeces = inds

    def get_angles(self):
        return self.angles

    def set_angles(self, angs):
        self.angles = angs

    def get_smarts(self):
        return self.smarts

    def set_smarts(self, sms):
        self.smarts = sms

    def get_hc(self):
        return self.hc

    def set_hc(self, hcs):
        self.hc = hcs

    def get_methods(self):
        return self.methods

    def set_methods(self, meths):
        self.methods = meths

    def get_E(self):
        return self.E

    def set_E(self, Es):
        self.E = Es

    def get_CI_l(self):
        return self.CI_l

    def set_CI_l(self, ls):
        self.CI_l = ls

    def get_CI_u(self):
        return self.CI_u

    def set_CI_u(self, us):
        self.CI_u = us

    def get_flags(self):
        return self.flags

    def set_flags(self, fs):
        self.flags = fs

    def get_TP_indeces(self):  # Don't need a setter for this
        return self.TP_indeces

    def get_TPs(self, inds=None):
        """Return the information for a subset of the torsion patterns.

        Args:
            inds: indeces of the torsion patterns (not the atoms!) to
                report. ``None`` (the default) selects every pattern.

        Returns:
            A list with one entry per requested pattern, each of the form
            ``[TP index, E, CI_l, CI_u, atom indeces, angle, smarts, hc,
            method, flag]``.
        """
        # "is None" rather than "== None": the latter misbehaves for inputs
        # (such as NumPy arrays) that overload equality.
        if inds is None:
            inds = range(len(self.indeces))
        return [
            [self.TP_indeces[j], self.E[j],
             self.CI_l[j], self.CI_u[j], self.indeces[j], self.angles[j],
             self.smarts[j], self.hc[j], self.methods[j], self.flags[j]]
            for j in inds
        ]

    def sum(self, cutoff=None):
        """Sum the energy estimates and the bounds of their 95% CIs.

        Args:
            cutoff: only patterns whose energy estimate is ``>= cutoff``
                contribute. ``None`` (the default) means a cutoff of 0.

        Returns:
            ``[sum of E, sum of CI_l, sum of CI_u]`` over the contributing
            patterns. If at least one pattern is flagged, nothing is summed
            and ``[-1 * (number flagged), 0, 0]`` is returned instead.
        """
        # Inside a method body the name "sum" still resolves to the builtin
        # (class attributes are not in scope here). True counts as 1.
        flagged = sum(self.flags)
        if flagged > 0:
            return [-1 * flagged, 0, 0]  # We do not try to calculate the CI
        if cutoff is None:
            # This used to read "cutoff == 0", a no-op comparison, which
            # left cutoff as None and broke the ">=" test below.
            cutoff = 0
        totals = [0, 0, 0]
        for energy, lower, upper in zip(self.E, self.CI_l, self.CI_u):
            if energy >= cutoff:
                totals[0] += energy
                totals[1] += lower
                totals[2] += upper
        return totals
def Mol2MolSupplier (file = None):
    """Read every molecule from a multi-molecule .mol2 file.

    The file is split at lines starting with "@<TRIPOS>MOLECULE". Each
    molecule's text is the header line plus every following line up to the
    next header (or the end of the file). Lines before the first header are
    ignored.

    A molecule is named after the first whitespace-separated token of the
    line following its header. If that line is blank, or the name was
    already used, the name "mol2Number<N>" is used instead, where N is the
    1-based position of the molecule in the file. Each name is printed.

    Args:
        file: path of the .mol2 file to read.

    Returns:
        A tuple ``(names, mols)``: the list of names in file order, and a
        dict mapping each name to the result of
        ``Chem.rdmolfiles.MolFromMol2Block`` for that molecule.

    Raises:
        TypeError: if no file path is given.
    """
    if file is None:
        raise TypeError("Mol2MolSupplier() requires the path of a .mol2 file")

    header = "@<TRIPOS>MOLECULE"

    def records(lines):
        # Group the lines into one list per molecule, starting at each header
        record = None
        for line in lines:
            if line.startswith(header):
                if record is not None:
                    yield record
                record = [line]
            elif record is not None:
                record.append(line)
            # else: text before the first molecule, nothing to collect
        if record is not None:
            yield record

    names = []  # Make a list to hold the molecule names
    mols = {}  # Make a dictionary
    # Iterating over the file (instead of comparing f.tell() with the file
    # size) cannot loop forever on unexpected leading lines and does not
    # depend on text-mode tell() returning byte offsets.
    with open(file, 'r') as f:
        for count, record in enumerate(records(f), start=1):
            # The name line may be missing or contain only whitespace
            fields = record[1].split() if len(record) > 1 else []
            if fields and fields[0] not in names:
                name = fields[0]
            else:
                name = "mol2Number" + str(count)
            print(name)
            # As before, commas are removed from the text handed to RDKit
            block = "".join(record).replace(',', '')
            m = Chem.rdmolfiles.MolFromMol2Block(block, sanitize=False, removeHs = False)
            names.append(name)
            mols[name] = m
    return(names, mols)
# Here is an updated version to use with the output "file" string buffer object
# created by the db2_file_like function in the db2_to_mol2.py script
def db2MolSupplier(file):
    """Read every molecule from an in-memory mol2 text buffer.

    The buffer is split at lines starting with "@<TRIPOS>MOLECULE". Each
    molecule's text is the header line plus every following line up to the
    next header (or the end of the buffer). Lines before the first header
    are ignored. Molecules are named "db2Number<N>", where N is the 1-based
    position of the molecule in the buffer.

    Args:
        file: an already-open text buffer that supports iteration over its
            lines and the context-manager protocol. It is used in a
            ``with`` statement, so it is closed when this function returns.

    Returns:
        A tuple ``(names, mols)``: the list of names in buffer order, and a
        dict mapping each name to the result of
        ``Chem.rdmolfiles.MolFromMol2Block`` for that molecule.
    """
    header = "@<TRIPOS>MOLECULE"
    names = []  # Make a list to hold the molecule names
    mols = {}  # Make a dictionary
    records = []  # One list of lines per molecule
    # Iterating over the lines (instead of comparing f.tell() with the
    # buffer length) cannot loop forever on unexpected leading lines.
    with file as f:  # file is already opened as a string buffer
        for line in f:
            if line.startswith(header):
                records.append([line])
            elif records:
                records[-1].append(line)
            # else: text before the first molecule, nothing to collect
    for count, record in enumerate(records, start=1):
        name = "db2Number" + str(count)
        # As before, commas are removed from the text handed to RDKit
        block = "".join(record).replace(',', '')
        m = Chem.rdmolfiles.MolFromMol2Block(block, sanitize=False, removeHs = False)
        names.append(name)
        mols[name] = m
    return(names, mols)
def unit(a):
    """Return `a` scaled by its Euclidean norm.

    Args:
        a: a 1-axis NumPy array, or any array-like (list, tuple) that
            ``np.asarray`` can convert to one.

    Returns:
        A NumPy array pointing in the direction of `a`. No check is made
        for a zero-length vector.
    """
    a = np.asarray(a)  # Lets lists and tuples through; arrays are unchanged
    return(a / sqrt(np.dot(a, a)))  # Scales a by its norm
def dihedral(a_1, a_2, a_3, a_4):
    """Return the dihedral angle, in degrees, defined by four points.

    Args:
        a_1, a_2, a_3, a_4: the four positions, in order, each a length-3
            NumPy array or array-like. a_2 and a_3 define the bond of
            interest.

    Returns:
        The angle in degrees, computed as ``-atan2(y, x) * 180 / pi``.
    """
    # Accept lists/tuples as well as arrays; arrays pass through unchanged
    a_1, a_2, a_3, a_4 = (np.asarray(p) for p in (a_1, a_2, a_3, a_4))
    # The 3 displacement vectors:
    b_1 = a_2 - a_1
    b_2 = a_3 - a_2
    b_3 = a_4 - a_3
    # Unit normals of the planes (a_1, a_2, a_3) and (a_2, a_3, a_4)
    n_1 = unit(np.cross(b_1, b_2))
    n_2 = unit(np.cross(b_2, b_3))
    # Imagine the first atom (a_1) is above the middle bond (from a_2 to
    # a_3), so that b_1 points downward. Then n_1 points out of the page.
    # Normalizing after the cross product gives the same result as
    # normalizing b_2 first, because the cross product commutes with scalar
    # multiplication and ||n_1|| = 1
    m = unit(np.cross(n_1, b_2))
    # Looking down b_2, we can consider n_1 to be the x-axis and m to be
    # the y-axis. Then the dihedral angle is the angle that n_2 makes with
    # the x-axis when projected into this plane. Since dihedral angles are
    # measured going clockwise, we negate the angle returned by atan2
    x = np.dot(n_1, n_2)  # Project n_2 onto n_1
    y = np.dot(m, n_2)  # Project n_2 onto m
    return(-atan2(y, x) * 180 / pi)  # Return the angle in degrees
# The next function we will need will calculate angular differences,
# in degrees. This function will calculate theta_1 - theta_2. See my notes
# from 9/19/19
def ang_diff(theta_1, theta_2):
    """Return theta_1 - theta_2, in degrees, wrapped into (-180, 180]."""
    # Shift negative inputs up by one full turn: (-180, 180] -> [0, 360)
    first = theta_1 + 360 if theta_1 < 0 else theta_1
    second = theta_2 + 360 if theta_2 < 0 else theta_2
    # Angular difference, reduced to [0, 360)
    wrapped = (first - second) % 360
    # Fold the upper half back down: [0, 360) -> (-180, 180]
    return wrapped - 360 if wrapped > 180 else wrapped
# Test this works: ang_diff(0, 179), ang_diff(0, -179),
# and ang_diff(-179, 179)
# This function will allow us to do the matching for each torsion rule
def tp_match(tp, hc, j, mol, pos, bi):
    """Match one torsion rule against a molecule and record every hit.

    For each 4-atom substructure match of the rule's SMARTS whose two end
    atoms are not hydrogens, the dihedral angle is measured and an energy
    estimate is computed with the rule's method ("exact": interpolation in
    the rule's histogram; anything else: the approximate method using the
    rule's list of angle peaks). One entry per match is appended to `bi`.

    Args:
        tp: the torsion rule element; its "smarts" and "method" attributes
            and its "histogram_converted" / "angleList" children are read.
        hc: type of hierarchy class, "general" or "specific".
        j: running index of the torsion rule (the caller's counter).
        mol: the molecule to search (must provide GetSubstructMatches and
            GetAtomWithIdx).
        pos: atom coordinates, indexable by atom index.
        bi: list that receives the results; it is modified in place.

    Each appended entry is the list
    ``[atom indeces, theta, smarts, hc, method, energy, lower, upper,
    not_observed, j]``.

    Returns:
        None.
    """
    smarts = tp.get("smarts")
    exact = tp.get("method") == "exact"  # Read once instead of per match
    # Create the histograms for energy estimates and bounds of confidence
    # intervals, if available
    hist_E = []  # Energy estimates
    hist_l = []  # Lower bounds of CIs
    hist_u = []  # Upper bounds of CIs
    if exact:
        # "hist_bin" rather than "bin", which would shadow the builtin
        for hist_bin in tp.find("histogram_converted").findall("bin"):
            hist_E.append(float(hist_bin.get("energy")))
            hist_l.append(float(hist_bin.get("lower")))
            hist_u.append(float(hist_bin.get("upper")))
    matches = mol.GetSubstructMatches(Chem.MolFromSmarts(smarts))
    for match in matches:  # Each match is a sequence of atom indeces
        # Some of the SMARTS for the torsion patterns actually have 5
        # atoms. We need to ignore these. Anything that is not exactly 4
        # atoms cannot define a dihedral, so shorter matches are skipped
        # too instead of failing on match[3].
        if len(match) != 4:
            continue
        # Skip torsions that end in a hydrogen
        if mol.GetAtomWithIdx(match[0]).GetSymbol() == 'H' \
                or mol.GetAtomWithIdx(match[3]).GetSymbol() == 'H':
            continue
        # First get the atom locations
        # Based on https://github.com/rdkit/rdkit/issues/1982
        r_1 = np.array(pos[match[0]])
        r_2 = np.array(pos[match[1]])
        r_3 = np.array(pos[match[2]])
        r_4 = np.array(pos[match[3]])
        theta = dihedral(r_1, r_2, r_3, r_4)  # Dihedral angle
        if exact:
            # First figure out what bin we are in in the histogram. The
            # bins are defined by their right endpoints, spaced 10 degrees
            # apart and starting with the -170 deg bin at index 0.
            # theta == -180 gives bin_num == -1, which Python resolves to
            # the last bin; the offset below is then 0.
            bin_num = ceil(theta / 10) + 17
            prev_bin = (bin_num + 35) % 36  # Bin to the left, wrapping around
            # Signed distance from the bin's right endpoint (never positive)
            offset = theta - (bin_num - 17) * 10
            # Linear interpolation between the two neighbouring bins
            energy = (hist_E[bin_num] - hist_E[prev_bin]) / 10.0 * offset + hist_E[bin_num]
            lower = (hist_l[bin_num] - hist_l[prev_bin]) / 10.0 * offset + hist_l[bin_num]
            upper = (hist_u[bin_num] - hist_u[prev_bin]) / 10.0 * offset + hist_u[bin_num]
            bi.append(
                [
                    list(match),  # Convert tuple to list
                    theta,
                    smarts,
                    hc,  # "general" or "specific"
                    "exact",
                    energy,
                    lower,
                    upper,
                    False,  # This only could apply for the approximate method
                    j  # Taken out when the final object is created
                ]
            )
        else:  # If using the approximate method
            not_observed = True
            # Initialize the flag for not observing that angle
            energy = 100
            # Initialize the energy, in case we cannot estimate it
            # Loop over all the possible angle peaks
            for angle in tp.find("angleList").findall("angle"):
                theta_0 = float(angle.get("theta_0"))  # Peak location
                delta = ang_diff(theta, theta_0)  # Angular displacement
                if abs(delta) <= float(angle.get("tolerance2")):
                    # If within the tolerance range for that peak
                    beta_1 = float(angle.get("beta_1"))
                    beta_2 = float(angle.get("beta_2"))
                    # The coefficients for the regression
                    energy = beta_1*(delta ** 2) + beta_2*(delta ** 4)
                    # Using the "not-as-small angle approximation"
                    not_observed = False
                    break
                    # Break the for loop to avoid problems if the
                    # observed angle sits at the border between two
                    # peaks
            bi.append(
                [
                    list(match),  # Convert tuple to list
                    theta,
                    smarts,
                    hc,  # "general" or "specific"
                    "approximate",
                    energy,
                    energy,  # Lower bound for CI, which we cannot find for approx. method
                    energy,  # Upper bound for CI, which we cannot find for approx. method
                    not_observed,
                    j  # Taken out when the final object is created
                ]
            )
# This most general function will automate what we did in TL_Lookup_Test.py
def TL_lookup(mol):  #mol is read in from the .mol2 file
    """Estimate the torsion energies of a molecule from the torsion library.

    Every torsion rule found under the module-level ``root`` element is
    matched against `mol` with ``tp_match``: first the rules of every
    hierarchy class other than "GG" (recorded as "specific"), then the
    rules of the "GG" class (recorded as "general"). The matches are then
    reduced so that each bond keeps a single entry.

    Args:
        mol: the molecule; must have a conformer providing atom positions.

    Returns:
        A ``TP_list`` with one entry per bond that matched at least one
        torsion rule. If nothing matched, the ``TP_list`` is empty.
    """
    positions = mol.GetConformer().GetPositions()
    # Atom coordinates. Luckily RDKit starts indexing at 0
    bond_info = []  # Filled in place by tp_match, one entry per match
    i = 0  # Count of torsion rules
    # Loop over all of the specific hierarchy classes
    for HC in root.findall("hierarchyClass"):
        if HC.get("name") != "GG":  # Not the general class
            for TP in HC.iter("torsionRule"):  # Loop over each torsion rule
                tp_match(TP, "specific", i, mol, positions, bond_info)
                i += 1  # Increase the count for the torsion rule
    # Now for the general method:
    for TP in root.find("hierarchyClass[@name='GG']").iter("torsionRule"):
        tp_match(TP, "general", i, mol, positions, bond_info)
        i += 1  # Increase the count for the torsion rule

    # Layout of each entry of bond_info at this point:
    # [0] atom indeces, [1] angle, [2] SMARTS, [3] "general"/"specific",
    # [4] method, [5] energy, [6] CI lower, [7] CI upper, [8] flag,
    # [9] torsion-rule count i.

    # To find duplicates we must recognise a pattern matched in reverse.
    # Make every list of indeces have its second index (the first atom of
    # the bond of interest) lower than its third (the second atom of the
    # bond), and remember in position [10] whether we reversed it.
    for bond in bond_info:
        was_reversed = bond[0][1] > bond[0][2]
        if was_reversed:
            bond[0].reverse()
        bond.append(was_reversed)

    # First reduction: one entry per list of 4 atoms. Keep the entry with
    # the lowest value of i, since the torsion rules are arranged (within
    # each hierarchy class or subclass) in decreasing specificity and the
    # specific classes are visited before the general one. A dict keeps
    # first-seen order and replaces in place, exactly like the previous
    # list scan, but without the quadratic search and without requiring
    # at least one match.
    by_atoms = {}
    for bond in bond_info:
        key = tuple(bond[0][:4])
        kept = by_atoms.get(key)
        if kept is None or bond[9] < kept[9]:
            by_atoms[key] = bond

    # Second reduction: one entry per pair of atoms defining the bond.
    # A "specific" entry replaces a "general" one; within the same class
    # the entry with the higher energy estimate wins. The class test
    # compares the first characters of the strings: 's' > 'g' is True and
    # every other combination is False.
    by_bond = {}
    for bond in by_atoms.values():
        key = (bond[0][1], bond[0][2])
        kept = by_bond.get(key)
        if kept is None:
            by_bond[key] = bond
        elif bond[3][0] > kept[3][0] \
                or (bond[5] > kept[5] and bond[3][0] == kept[3][0]):
            by_bond[key] = bond
    b_i_r = list(by_bond.values())

    # Now that we have only the bond information that we want, un-do any
    # reversal of the atom indeces
    for bond in b_i_r:
        if bond[10]:
            bond[0].reverse()

    # Columns 0-8 are, in order, the nine constructor arguments of TP_list;
    # columns 9 (rule count) and 10 (reversed marker) are dropped here.
    return(
        TP_list([bond[0] for bond in b_i_r], [bond[1] for bond in b_i_r],
                [bond[2] for bond in b_i_r], [bond[3] for bond in b_i_r],
                [bond[4] for bond in b_i_r], [bond[5] for bond in b_i_r],
                [bond[6] for bond in b_i_r], [bond[7] for bond in b_i_r],
                [bond[8] for bond in b_i_r])
    )