-
Notifications
You must be signed in to change notification settings - Fork 150
/
lipophilicity.py
146 lines (123 loc) · 5.63 KB
/
lipophilicity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# -*- coding: utf-8 -*-
#
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Lipophilicity from MoleculeNet for the prediction of octanol/water
# distribution coefficient (logD at pH 7.4) of 4200 compounds
import pandas as pd
from dgl.data.utils import get_download_dir, download, _get_dgl_url, extract_archive
from .csv_dataset import MoleculeCSVDataset
# Names exported by a star-import of this module; only the dataset class is listed.
__all__ = ['Lipophilicity']
class Lipophilicity(MoleculeCSVDataset):
    r"""Lipophilicity from MoleculeNet for the prediction of octanol/water
    distribution coefficient (logD at pH 7.4)

    Quoting [1], "Lipophilicity is an important feature of drug molecules that affects both
    membrane permeability and solubility. This dataset, curated from ChEMBL database,
    provides experimental results of octanol/water distribution coefficient (logD at pH 7.4)
    of 4200 compounds."

    References:

        * [1] MoleculeNet: A Benchmark for Molecular Machine Learning.
        * [2] ChEMBL Deposited Data Set - AZ dataset; 2015.
        * [3] DeepChem

    Parameters
    ----------
    smiles_to_graph: callable, str -> DGLGraph
        A function turning a SMILES string into a DGLGraph. If None, it uses
        :func:`dgllife.utils.SMILESToBigraph` by default.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to update
        edata for a DGLGraph. Default to None.
    load : bool
        Whether to load the previously pre-processed dataset or pre-process from scratch.
        ``load`` should be False when we want to try different graph construction and
        featurization methods and need to preprocess from scratch. Default to False.
    log_every : int
        Print a message every time ``log_every`` molecules are processed. Default to 1000.
    cache_file_path : str
        Path to the cached DGLGraphs, default to './lipophilicity_dglgraph.bin'.
    n_jobs : int
        The maximum number of concurrently running jobs for graph construction and featurization,
        using joblib backend. Default to 1.

    Attributes
    ----------
    load_full : bool
        When True, indexing the dataset additionally returns the ChEMBL id of the
        compound. Set to False by the constructor.
    chembl_ids : list
        Values of the ``CMPD_CHEMBLID`` column of the CSV file, restricted to the rows
        listed in ``valid_ids``.

    Examples
    --------

    >>> from dgllife.data import Lipophilicity
    >>> from dgllife.utils import SMILESToBigraph, CanonicalAtomFeaturizer

    >>> smiles_to_g = SMILESToBigraph(node_featurizer=CanonicalAtomFeaturizer())
    >>> dataset = Lipophilicity(smiles_to_g)
    >>> # Get size of the dataset
    >>> len(dataset)
    4200
    >>> # Get the 0th datapoint, consisting of SMILES, DGLGraph and logD
    >>> dataset[0]
    ('Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14',
     DGLGraph(num_nodes=24, num_edges=54,
              ndata_schemes={'h': Scheme(shape=(74,), dtype=torch.float32)}
              edata_schemes={}),
     tensor([3.5400]))

    We also provide information for the ChEMBL id of the compound.

    >>> dataset.chembl_ids[0]
    'CHEMBL596271'

    We can also get the ChEMBL id along with SMILES, DGLGraph and logD at once.

    >>> dataset.load_full = True
    >>> dataset[0]
    ('Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14',
     DGLGraph(num_nodes=24, num_edges=54,
              ndata_schemes={'h': Scheme(shape=(74,), dtype=torch.float32)}
              edata_schemes={}),
     tensor([3.5400]),
     'CHEMBL596271')
    """

    def __init__(self,
                 smiles_to_graph=None,
                 node_featurizer=None,
                 edge_featurizer=None,
                 load=False,
                 log_every=1000,
                 cache_file_path='./lipophilicity_dglgraph.bin',
                 n_jobs=1):
        """Download and extract the archive, read the CSV file and build the dataset.

        See the class docstring for the meaning of each parameter.
        """
        self._url = 'dataset/lipophilicity.zip'

        # Resolve the download directory once and derive both paths from it.
        download_dir = get_download_dir()
        data_path = download_dir + '/lipophilicity.zip'
        dir_path = download_dir + '/lipophilicity'

        # ``overwrite=False`` is passed so an existing archive is not overwritten.
        download(_get_dgl_url(self._url), path=data_path, overwrite=False)
        extract_archive(data_path, dir_path)
        df = pd.read_csv(dir_path + '/Lipophilicity.csv')

        # The single regression target is the 'exp' column; SMILES strings are
        # read from the 'smiles' column.
        super(Lipophilicity, self).__init__(df=df,
                                            smiles_to_graph=smiles_to_graph,
                                            node_featurizer=node_featurizer,
                                            edge_featurizer=edge_featurizer,
                                            smiles_column='smiles',
                                            cache_file_path=cache_file_path,
                                            task_names=['exp'],
                                            load=load,
                                            log_every=log_every,
                                            init_mask=False,
                                            n_jobs=n_jobs)

        self.load_full = False

        # ChEMBL ids, kept only for the rows listed in ``self.valid_ids``.
        # NOTE(review): ``valid_ids`` is set by the parent constructor; presumably
        # it indexes the molecules that were successfully processed -- confirm.
        all_chembl_ids = df['CMPD_CHEMBLID'].tolist()
        self.chembl_ids = [all_chembl_ids[i] for i in self.valid_ids]

    def __getitem__(self, item):
        """Get datapoint with index

        Parameters
        ----------
        item : int
            Datapoint index

        Returns
        -------
        str
            SMILES for the ith datapoint
        DGLGraph
            DGLGraph for the ith datapoint
        Tensor of dtype float32 and shape (1)
            Labels of the ith datapoint
        str, optional
            ChEMBL id of the ith datapoint, returned only when
            ``self.load_full`` is True.
        """
        if self.load_full:
            return self.smiles[item], self.graphs[item], self.labels[item], self.chembl_ids[item]
        return self.smiles[item], self.graphs[item], self.labels[item]