-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtauRAMD-v2.py
188 lines (164 loc) · 7.41 KB
/
tauRAMD-v2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
"""
#------------------------------------------------------------------------
# Computation of the drug-target relative residence times from RAMD simulations
#> \version{version 1.1 (March. 2020)}
#> <c
#> Copyright (c) 2020
#> Heidelberg Institute of Theoretical Studies (HITS, www.h-its.org)
#> Schloss-Wolfsbrunnenweg 35
#> 69118 Heidelberg, Germany
#>
#> Please send your contact address to get information on updates and
#> new features to "[email protected]". Questions will be
#> answered as soon as possible.
#>
#> Authors: Daria Kokh [email protected]
"""
import os
import sys

from matplotlib import *
from matplotlib import gridspec
import pylab as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
# Simulation engine whose RAMD log lines are parsed further down: the value
# "NAMD" selects the NAMD "EXIT:" line format, any other value (the default
# "Gr") selects the Gromacs "... after N steps" format.
soft = "Gr" # if Gromacs software was used for RAMD simulations; otherwise define soft = 'NAMD'
def printUsage():
    """Print the command-line help (name, usage, input format, output) to stdout.

    The text refers to this script by its current file name, tauRAMD-v2.py.
    """
    print ('''\
NAME
tauRAMD-v2.py - computation of residence times using bootstrapping from RAMD output generated in Gromacs
USAGE
python tauRAMD-v2.py input_file[s]
input files must contain a set of lines extracted from the gromacs (or NAMD) output. Each line contains the number of steps executed before dissociation
and has the following format:
for Gromacs: “XX/YYYY.out:==== RAMD ==== GROMACS will be stopped after 874650 steps.”
for NAMD: "EXIT: XXXXXXX > LIGAND EXIT EVENT DETECTED"
OUTPUT
residence time with the standard deviation computed for each input_file and an image with histogram representation of the bootstrapping output
''')
def bootstrapp(t, rounds=50000, alpha=0.8):
    """Bootstrap the median of random sub-sets of the dissociation times.

    In every round the values are shuffled, the first ``alpha`` fraction is
    taken as a sub-set and the median of that sub-set is recorded.

    Parameters
    ----------
    t : sequence of numbers
        Dissociation times. The input is copied, so the caller's data keeps
        its order.
    rounds : int
        Number of resampling rounds; exactly this many medians are returned.
    alpha : float
        Fraction of the values used in each sub-set (default 0.8). The
        sub-set size is clamped to the range 1 .. len(t).

    Returns
    -------
    list
        One median per round.

    Raises
    ------
    ValueError
        If ``t`` contains no values.
    """
    pool = np.array(t, dtype=float)  # private copy: shuffling must not touch the caller's data
    if pool.size == 0:
        raise ValueError("bootstrapp() needs at least one dissociation time")

    sub_set = min(len(pool), max(1, int(alpha * len(pool))))
    # Indices of the two middle elements of a sorted sub-set; they coincide
    # for an odd sub-set size, so their average is the median in both cases.
    lower_mid = (sub_set - 1) // 2
    upper_mid = sub_set // 2

    tau_bootstr = []
    for _ in range(rounds):
        # generate a sub-set
        np.random.shuffle(pool)
        ordered = np.sort(pool[:sub_set])
        # residence time estimate of this round = median of the sub-set
        tau_bootstr.append((ordered[lower_mid] + ordered[upper_mid]) / 2.0)
    return tau_bootstr
#---- reading data
# Collect the command-line arguments that name existing files into d_list.
# Without any argument the usage text is shown and the script stops.
if len(sys.argv) < 2:
    printUsage()
    sys.exit(1)

d_list = []
for path in sys.argv[1:]:
    if os.path.isfile(path):
        print ("Data found:", path)
        d_list.append(path)
    else:
        print ("Data not found:", path)

# Nothing to analyse: stop here instead of failing later while building a
# figure grid with zero columns.
if not d_list:
    print("None of the given input files could be found")
    printUsage()
    sys.exit(1)
# Parse every input file into an array of dissociation times; times_set gets
# one array per file, in the order of d_list.
times_set = []
for d in d_list:
    with open(d) as f:
        read_data = f.readlines()
    times = []
    for r in read_data:
        if not r.strip():
            # tolerate blank lines (e.g. a trailing newline at the end of the file)
            continue
        if soft == "NAMD":
            # if NAMD was used to generate RAMD trajectories
            # NOTE(review): this slice ends two characters before '>'; confirm
            # against real NAMD output that no digit of the step count is cut off.
            step_field = r[r.find("EXIT:")+6:r.find(">")-2]
        else:
            # if Gromacs was used to generate RAMD trajectories
            step_field = r[r.find("after")+6:r.find("steps")-1]
        try:
            times.append(int(step_field))
        except ValueError:
            # the extracted field is not an integer: the line has the wrong format
            print("Input files dont't have correct format")
            printUsage()
            sys.exit(1)
    # Step counts are divided by 500000; with the plots labelled in ns this
    # presumably assumes a 2 fs integration step - TODO confirm.
    times = np.asarray(times)/500000.
    times_set.append(times)
    print("************ Dissociation times for "+d+" ****************")
    print(times)
# Per-replica analysis figure: three rows and one column per input file.
#   row 0 - cumulative histogram of the dissociation times
#   row 1 - histogram of the bootstrapped tau values with a fitted normal curve
#   row 2 - empirical cumulative distribution on a log10 time axis compared
#           with 1 - exp(-t/tau)
# The fitted mean of every processed replica is collected in mue_set and the
# figure is written to res_times.png.
# NOTE(review): the indentation of this script was lost in the copy under
# review. The nesting below (the whole loop body inside the `if`) is a
# reconstruction - confirm against the upstream tauRAMD script.
fig = plt.figure(figsize = (2*len(d_list), 7))
gs = gridspec.GridSpec(nrows=3, ncols=len(d_list), wspace=0.1,hspace=0.6)
mue_set = []
print("\n ============== Bootstrapping and computation of tau for each replica==================\n")
for t, times in enumerate(times_set):
    #--- do bootstrapping ----
    # Only replicas with more than 8 recorded times are processed.
    if len(times) > 8:
        # Pad the local copy with the value 55.0 until it holds 15 entries
        # (presumably a stand-in time for trajectories without a recorded
        # exit - TODO confirm). The arrays stored in times_set are not changed.
        for ti in range(len(times),15): times = np.concatenate((times,[55.0]))
        print(times)
        # row 0: cumulative histogram; the red line marks half of the entries
        ax0 = fig.add_subplot(gs[0, t])
        ax0.hist(times,bins=int(len(times)/2),cumulative=True,histtype="step",color='k',lw=1)
        plt.title("raw CDF",fontsize=12)
        ax0.set_xlabel('dissociation time [ns]', fontsize=10)
        ax0.plot([min(times), max(times)],[len(times)/2.0,len(times)/2.0], color='red', alpha = 0.5)
        # y tick labels are shown on the first column only
        if (t> 0): ax0.set_yticklabels( [])
        bt2 = bootstrapp(times, rounds=50000)
        #--- make a plot------
        # row 1: distribution of the bootstrapped values and its normal fit
        bins = 6
        ax1 = fig.add_subplot(gs[1, t])
        ax1.hist(x=bt2,bins=bins, alpha=0.8,density=True,histtype="step")
        mu, std = norm.fit(bt2)
        mue_set.append(np.round(mu,1))
        # pylab calls act on the current axes, here the subplot added last (ax1)
        xmin, xmax = plt.xlim()
        # ymin/ymax are not used again inside this loop
        ymin, ymax = plt.ylim()
        #xmax = np.round(max(times))
        x = np.linspace(0.8*xmin, xmax, 100)
        p = norm.pdf(x, mu, std)
        ax1.plot(x, p, 'k', linewidth=2)
        # red guides: vertical line at mu, horizontal line at half of the curve
        # maximum, dashed horizontal line at the maximum up to mu
        ax1.plot([mu,mu],[0, max(p)], color='red', alpha = 0.5)
        ax1.plot([xmin, xmax],[max(p)/2.0,max(p)/2.0], color='red', alpha = 0.5)
        ax1.plot([0.8*xmin, mu],[max(p),max(p)], color='red', linestyle='dashed',alpha = 0.5)
        ax1.set_xlabel('res. time [ns]', fontsize=10)
        plt.title("tau distribution",fontsize=12)
        ax1.set_yticks([])
        # row 2: empirical CDF versus the curve 1 - exp(-t/mu)
        ax2 = fig.add_subplot(gs[2, t])
        xmin = min(times)
        xmax = np.round(max(times))
        tp = np.linspace(xmin*0.5,xmax*1.5,100)
        poisson = 1-np.exp(-tp/mu) #np.cumsum(1-np.exp(-np.linspace(xmin,xmax,10)/mu))
        # `points` is not read anywhere in the visible code
        points=len(times)
        bins = len(times)
        times = np.asarray(times)
        hist, bin_edges = np.histogram(times,bins=bins)
        # centre of every histogram bin (mid-point of consecutive edges)
        hist_center = []
        for i,b in enumerate(bin_edges):
            if i > 0: hist_center.append((bin_edges[i-1]+bin_edges[i])/2.0)
        # cumulative counts normalised so that the last value is 1
        CD = np.cumsum(hist)/np.max(np.cumsum(hist))
        ax2.scatter(np.log10(np.asarray(hist_center)),CD,marker='o')
        ax2.set_xlabel('log(res. time [ns])', fontsize=10)
        ax2.plot(np.log10(tp),poisson,color = 'k')
        ax2.set_ylim(0,1)
        ax2.set_xlim(-1.5,1.5)
        # ax2.set_xlim(np.round(np.log10(np.asarray(hist_center))[0],1)-0.1,np.log10(xmax*1.5)) #max(np.log10(np.asarray(hist_center))[-1],np.log10(tp)[-1]))
        ax2.set_yticks(np.linspace(0,1,5))
        if (t> 0): ax2.set_yticklabels( [])
        plt.grid(linestyle = '--',linewidth=0.5)
        # vertical marker at log10(mu)
        ax2.plot([np.log10(mu),np.log10(mu)],[0, 1], color='red', alpha = 0.5)
        # p_mu = 1-np.exp(-1.0)
        # cd_mu = np.argwhere(np.asarray(hist_center)> p_mu)[0][0]
        # if cd_mu > 1:
        # ks = abs(p_mu - (CD[cd_mu]+CD[cd_mu-1])/2)
        # else:
        # ks = abs(p_mu - CD[cd_mu])
        # value shown as "KS test": largest absolute difference between
        # 1 - exp(-t/mu) evaluated at the bin centres and the empirical CD
        KS = np.round(np.max(np.abs(1-np.exp(-(np.asarray(hist_center))/mu) - CD)),2)
        plt.title("KS test:"+str(KS),fontsize=12)
        print(" Relative res. time and SD: ",np.round(mu,2), np.round(std,2),"KS test:",KS)
        print("-----------------------------------------------------------------")
#fig.align_labels()
plt.savefig('res_times.png', bbox_inches='tight',dpi=300)
# Summary figure: one box per input file built from the dissociation times in
# times_set, titled with the mean and standard deviation of the fitted
# residence times collected in mue_set. Written to res_times_summary.png.
fig = plt.figure(figsize = (2*len(d_list), 2))
meanpointprops = dict(linestyle='--', linewidth=1.5, color='firebrick')
medianpointprops = dict(linestyle='-', linewidth=2.0, color='orange')
#plt.yticks(np.linspace(0,ymax,10),np.linspace(0,ymax,10))
plt.boxplot(times_set,showmeans=True, meanline=True,meanprops=meanpointprops,medianprops = medianpointprops, bootstrap=5000) #labels = mue_set)
ymin, ymax = plt.ylim()
# Start the y axis at zero. This must be a call: assigning to plt.ylim would
# only replace the pylab function with a tuple and leave the axis unchanged.
plt.ylim(0, ymax)
plt.grid(linestyle = '--',linewidth=0.5)
# at most 11 ticks between 0 and the integer part of the upper limit
plt.yticks(np.linspace(0,int(ymax),min(int(ymax)+1,11)), fontsize=9)
plt.ylabel('residence time [ns]', fontsize=10)
# The same summary line is used as the figure title and printed to stdout;
# the replica count is taken from times_set rather than from a leftover loop index.
summary = ("Residence times for "+str(len(times_set))+" replicas, mean: "
           +str(np.round(np.mean(mue_set),2))+" std: "+str(np.round(np.std(mue_set),2)))
plt.title(summary,fontsize=10)
plt.savefig('res_times_summary.png', bbox_inches='tight',dpi=300)
print(summary)