# run_decision_tree_mona_ms2deep.py
# (Web-page navigation text and the line-number gutter from the hosted
# file view were removed here; the script source follows.)
import numpy as np
import selfies as sf
from matchms.filtering import default_filters
from matchms.importing import load_from_mgf
from ms2deepscore import MS2DeepScore
from ms2deepscore.models import load_model
from rdkit import Chem
from tqdm.auto import tqdm
from src import run
from src.models import MultiOutputDecisionTree
def main():
    """Fit a multi-output decision tree mapping spectrum embeddings to SELFIES.

    Steps, in order:

    1. Read spectra from ``MONA.mgf`` and pass each through the matchms
       ``default_filters``. Keep only spectra that carry a SMILES string
       without a ``"."`` in it and that RDKit can parse; the stored SMILES
       is replaced by RDKit's re-written form of the same molecule.
    2. Convert every kept SMILES to SELFIES, build a sorted symbol alphabet
       (with ``"[nop]"`` added), and label-encode each SELFIES string padded
       to the length of the longest one.
    3. Compute one MS2DeepScore embedding per kept spectrum using the model
       stored in ``ms2deepscore_model.pt``.
    4. Pass the embeddings ``X`` and the encoded targets ``y`` to
       ``src.run`` together with the symbol/index lookup tables.

    Raises:
        ValueError: If no spectrum survives the filtering in step 1.
    """
    spectra = []
    skipped_unparsable = 0
    for raw_spectrum in tqdm(load_from_mgf("MONA.mgf")):
        spectrum = default_filters(raw_spectrum)
        # NOTE(review): matchms filters can signal a rejected spectrum by
        # returning None; guard so a single one cannot abort the whole run.
        if spectrum is None:
            continue

        smiles = spectrum.get("smiles")
        # A "." in SMILES separates disconnected components; such entries
        # (and spectra with no SMILES at all) are left out.
        if not smiles or "." in smiles:
            continue

        # MolFromSmiles returns None instead of raising when the string
        # cannot be parsed; MolToSmiles(None) would then crash.
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            skipped_unparsable += 1
            continue

        spectrum.set("smiles", Chem.MolToSmiles(mol))
        spectra.append(spectrum)

    if skipped_unparsable:
        print(
            f"Skipped {skipped_unparsable} spectra whose SMILES "
            "could not be parsed by RDKit."
        )
    if not spectra:
        raise ValueError(
            "No spectra with a usable single-component SMILES were found "
            "in MONA.mgf."
        )

    # NOTE(review): sf.encoder may itself fail on SMILES it cannot represent;
    # that case is not handled here and would still stop the script.
    selfies_strings = [sf.encoder(s.get("smiles")) for s in tqdm(spectra)]

    alphabet = sf.get_alphabet_from_selfies(selfies_strings)
    # "[nop]" must be in the vocabulary because the encoding below pads
    # shorter strings up to pad_to_len (presumably with this symbol —
    # confirm against the selfies documentation).
    alphabet.add("[nop]")
    alphabet = sorted(alphabet)  # sorted => stable symbol <-> index mapping

    pad_to_len = max(sf.len_selfies(s) for s in selfies_strings)
    symbol_to_idx = {symbol: idx for idx, symbol in enumerate(alphabet)}
    idx_to_symbol = dict(enumerate(alphabet))

    # Load pretrained model
    model = load_model("ms2deepscore_model.pt")
    X = MS2DeepScore(model, progress_bar=True).get_embedding_array(spectra)

    # One row of integer labels per molecule, in the same order as X.
    y = np.array(
        [
            sf.selfies_to_encoding(
                selfies=selfies_string,
                vocab_stoi=symbol_to_idx,
                pad_to_len=pad_to_len,
                enc_type="label",
            )
            for selfies_string in selfies_strings
        ]
    )

    run(
        X,
        y,
        data_name="mona_ms2deepscore",
        number_of_external_holdouts=10,
        number_of_internal_holdouts=3,
        model_class=MultiOutputDecisionTree,
        max_evals=100,
        SYMBOL_TO_IDX=symbol_to_idx,
        IDX_TO_SYMBOL=idx_to_symbol,
    )
# Run the pipeline only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()