-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbagofwords.py
66 lines (64 loc) · 2.95 KB
/
bagofwords.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from __future__ import print_function
from rdkit import Chem
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from mol2vec.features import mol2alt_sentence
from mol2vec.helpers import depict_identifier, plot_2D_vectors, IdentifierTable, mol_to_svg
# Read the dataset; row 0 of data.csv supplies the column names.
molecules = pd.read_csv('data.csv', header=0)
# Parse every entry of the `smiles` column into an RDKit molecule object.
aa = list(map(Chem.MolFromSmiles, molecules.smiles))
# One mol2vec "sentence" per molecule: its sequence of Morgan substructure
# identifiers, generated with radius 1.
aa_sentences = [mol2alt_sentence(mol, 1) for mol in aa]
# Flatten each sentence of Morgan identifiers into one whitespace-separated
# string so it can be fed to a text vectorizer.
# textMol[i] is the text for molecule i: its identifiers in their original
# order, each followed by a single space (so a non-empty string ends with a
# trailing space, and an empty sentence yields '').
textMol = [
    ''.join(identifier + ' ' for identifier in sentence)
    for sentence in aa_sentences
]
# TF-IDF featurization of the molecule texts: n-grams of 1 to 5 consecutive
# identifiers, keeping only terms that occur in at least 1% of the molecules.
vect = TfidfVectorizer(
    ngram_range=(1, 5),
    min_df=0.01,
)
# Dense (n_molecules, n_terms) array; rows follow the order of textMol.
features = vect.fit_transform(textMol).toarray()
# Per-fold ROC AUCs, pooled over all targets (5 entries per target, in
# target order), for the random forests and the logistic regressions.
rfScores = []
lrScores = []
# Per-target hyper-parameters: entry i is used for column 'target<i+1>'.
# The original author describes these values as the optimal settings.
rfTrees = [40, 80, 30, 120, 30, 70, 30, 60, 10, 60, 150, 20]  # trees per random forest
# scikit-learn's C is the INVERSE regularization strength (larger = weaker).
lrC = [4, 4, 28, 49, 21, 49, 36, 34, 9, 48, 40, 49]
coefs = []  # logistic-regression coefficient arrays, one per (target, fold)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

for t, (n_trees, inv_reg) in enumerate(zip(rfTrees, lrC)):  # iterate over targets
    y = np.array(molecules['target' + str(t + 1)])
    # Molecules without a label for this target (NaN) are excluded.
    notNan = ~np.isnan(y)
    X = features[notNan]
    y = y[notNan]

    # Per-fold results for the current target only.
    y_values = []
    predictionsRf = []
    predictionsLr = []
    probasRf = []
    probasLr = []

    for train, test in kf.split(X, y):  # one iteration per CV fold
        # Seed the forest as well: the folds are already seeded, so without
        # this the reported AUCs would still differ from run to run.
        rf = RandomForestClassifier(n_estimators=n_trees, random_state=0)
        lr = LogisticRegression(C=inv_reg)

        rf.fit(X[train], y[train])
        lr.fit(X[train], y[train])

        # Hard label predictions on the held-out fold.
        predictionsRf.append(rf.predict(X[test]))
        predictionsLr.append(lr.predict(X[test]))
        coefs.append(lr.coef_)

        # Probability of the second class in `classes_` (label 1 when the
        # targets are coded 0/1 -- assumed, verify against data.csv).
        probasRf.append(rf.predict_proba(X[test])[:, 1])
        probasLr.append(lr.predict_proba(X[test])[:, 1])
        y_values.append(y[test])  # ground-truth labels of the fold

        # Drop the references so the fitted models can be garbage-collected.
        del rf, lr

    # One AUC per fold. The loop variable is deliberately not called `y`, so
    # it cannot clobber the label array (comprehension variables leak into
    # the enclosing scope on Python 2).
    rfScores.extend(
        roc_auc_score(y_true, proba) for y_true, proba in zip(y_values, probasRf)
    )
    lrScores.extend(
        roc_auc_score(y_true, proba) for y_true, proba in zip(y_values, probasLr)
    )
# Depict the third molecule of the dataset with Morgan identifier 1397494279
# highlighted, using radius 1 (the same radius the sentences were built with).
# NOTE(review): the return value is discarded, so this presumably only shows
# something when run interactively (e.g. in a notebook) -- confirm.
third_molecule = aa[2]
depict_identifier(third_molecule, 1397494279, 1)