This repository has been archived by the owner on Mar 16, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 32
/
Copy pathanalysis.py
69 lines (56 loc) · 3.62 KB
/
analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import os
import argparse
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from scipy.stats import spearmanr
from scipy.stats import kendalltau
from matplotlib import pyplot as plt
import seaborn as sns
def to_scatter_plot(file_name: str, docking_scores: np.ndarray, experimental_scores: np.ndarray):
sns.set()
plt.scatter(docking_scores, experimental_scores, color='g')
plt.title(file_name + ' Scatter Plot')
plt.xlabel('Docking Score (-kcal/mol)')
plt.ylabel('Experimental Binding (-kcal/mol)')
plt.savefig(file_name.replace('csv', ''), dpi=300)
plt.figure()
def output_analysis(data_dir: str, exp_data_path: str, output_dir: str) -> dict:
docking_results = [file for file in os.listdir(data_dir) if file.endswith('.csv')]
analysis_results = {}
for file_name in docking_results:
data_path = os.path.join(data_dir, file_name)
# load the docking results
docking_data = pd.read_csv(data_path)
# load the experimental comparison data
experimental_data = pd.read_csv(exp_data_path)
# merge the docking results and experimental binding data based on their 'smiles' identifiers.
# this is to ensure that data pertaining to the correct 'smiles' ligands are compared irrespective
# of the order in which the user provides the experimental data
comparison_df = docking_data.merge(experimental_data, on='smiles')
# extract the DockStream docking scores and experimental potency parameter columns
docking_scores = comparison_df['score'].to_numpy().reshape(-1,1)
experimental_data = comparison_df['exp_binding'].to_numpy()
# fit a linear model using least squares
model = LinearRegression().fit(docking_scores, experimental_data)
coeff_determination = model.score(docking_scores, experimental_data)
# perform Spearman correlation analysis
spearman_correlation = spearmanr(docking_scores, experimental_data)
# perform Kendall correlation analysis
kendall_correlation = kendalltau(docking_scores, experimental_data)
# store the linear model and spearman quantities in a dictionary
analysis_results[file_name] = {'coefficient of determination': coeff_determination, 'Spearman correlation':
spearman_correlation, 'Kendall correlation': kendall_correlation}
# call to_scatter_plot to create a scatter plot of docking_scores against experimental_data
output_path = os.path.join(output_dir, file_name)
to_scatter_plot(output_path, docking_scores, experimental_data)
return analysis_results
if __name__ == '__main__':
# take user specified input parameters to run the analysis script
parser = argparse.ArgumentParser(description='Implements entry point to results analysis.')
parser.add_argument('-data_dir', type=str, required=True, help='The path to the output csv files to be analyzed.')
parser.add_argument('-exp_data_path', type=str, required=True, help='The path to the experimental binding data of your ligand library. This will be used for regression analysis.')
parser.add_argument('-output_dir', type=str, required=True, help='The desired output folder path to store the analyzed results and plots.')
#parser.add_argument('-regression_threshold', type=int, default=0.75, help='The desired regression threshold to classify good/poor backend performances based on your receptor-ligand system. The default value if not specified is 0.75.')
args = parser.parse_args()
analysis_results = output_analysis(args.data_dir, args.exp_data_path, args.output_dir)