# -*- tab-width:4;indent-tabs-mode:nil;show-trailing-whitespace:t;rm-trailing-spaces:t -*-
# vi: set ts=2 noet:
import sys
import argparse
import os
import numpy as np
import pandas as pd
import pyarrow.parquet
import pyarrow as pa
from MPLearn import embedding
from MPLearn.chemoinformatics import chemical_space
DESCRIPTION = """Embed a set of substances in chemical space
version 0.0.1
Example:
cd MPProjects/KCNQ
date_code=$(date '+%Y%m%d')
featurize_substances \
--library_path raw_data/project_substances_${date_code}.sdf \
--output_path intermediate_data/project_substances_${date_code}
# generates:
#   intermediate_data/project_substances_${date_code}/fingerprint_feature_columns.tsv
#   intermediate_data/project_substances_${date_code}/fingerprints.parquet
#   intermediate_data/project_substances_${date_code}/fingerprints_info.tsv
#   intermediate_data/project_substances_${date_code}/substance_info.tsv
"""
def main(argv):
parser = argparse.ArgumentParser(description = DESCRIPTION)
parser.add_argument(
"--library_type",
type = str,
action = "store",
dest = "library_type",
default = None,
help = "Type of library given at library path, options are ['sdf[.gz]', 'mol2', 'tsv', 'csv', 'smi'] or None, in which case it is inferred by the file extension")
parser.add_argument(
"--library_path",
type = str,
action = "store",
dest = "library_path",
help = "'.sdf[.gz]', '.mol2', '.tsv', '.csv', or '.smi' substances sampled from chemical space")
# arguments for .sdf file
# as a convenience, extract fields for substances from the sdf file and store
# in output
parser.add_argument(
"--library_fields",
nargs = "+",
type = str,
action = "store",
dest = "library_fields",
default = None,
help = """Fields from the .sdf or .mol2 library that are stored in the results file (Default: all)""")
parser.add_argument(
"--substance_id_field",
type = str,
action = "store",
dest="substance_id_field",
default = None,
help = """The field for the substance identifier
* If input is .tsv or .csv, this is the field name that contains the substance_id
* If input is .mol2 or .sdf, defaults to the substance name ('_Name')""")
parser.add_argument(
"--smiles_field",
type = str,
action = "store",
dest="smiles_field",
default = "smiles",
help = """If input is .tsv or .csv this is the field name that contains the smiles""")
parser.add_argument(
"--output_path",
type = str,
action = "store",
dest = "output_path",
help = "output path for results")
parser.add_argument(
"--fingerprint_type", type=str, action="store", dest="fingerprint_type", default="ECFP4",
help="""Fingerprint type for the query and library compounds over which to compute the tanimoto similarity""")
parser.add_argument(
"--fingerprint_n_bits",
type = int,
action = "store",
dest = "fingerprint_n_bits",
default = 1024,
help = """Fingerprint number of bits, defaults to 1024""")
parser.add_argument(
"--device",
type = str,
action = "store",
dest = "device",
default = "cpu",
help = """Hardware device e.g. 'cpu', 'gpu:1', etc. DEFAULT: cup""")
parser.add_argument(
"--verbose", action="store_true", dest="verbose",
help="""Give verbose output (Default: False)""")
arguments = parser.parse_args(argv[1:])
######################
# validate arguments #
######################
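# Resolve the library type: use --library_type when given, otherwise infer it
# from the file extension of --library_path. A trailing '.gz' is handled below
# by inspecting the penultimate extension; only '.sdf.gz' is supported.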
if arguments.library_type is None:
library_type = arguments.library_path.split(".")[-1]
else:
library_type = arguments.library_type
if library_type == "sdf":
if arguments.verbose:
print("Parsing library as a structure data (sdf) file")
elif library_type == "mol2":
if arguments.verbose:
print("Parsing library as a structure data (mol2) file")
elif library_type == "tsv":
if arguments.verbose:
print("Parsing library as tab-separated value (tsv) table")
elif library_type == "csv":
if arguments.verbose:
print("Parsing library as comma-separated value (csv) table")
elif library_type == "smi":
if arguments.verbose:
print("Parsing library as .smi file")
elif library_type == "gz":
library_type = arguments.library_path.split(".")[-2]
if library_type == "sdf":
if arguments.verbose:
print("Parsing library as a structure data (sdf) file")
else:
print(f"ERROR: Currently .sdf.gz is the only compressed library type that is supported")
print(f"ERROR: Unrecognized library_path extension or specified file type '{arguments.library_path}'")
exit(1)
else:
print(f"ERROR: Unrecognized library_path extension or specified file type '{arguments.library_path}'")
print(f"Recognized extensions are ['sdf', 'sdf.gz', 'mol2', 'tsv', 'csv', 'smi']")
exit(1)
# check that the output path exists
if os.path.isdir(arguments.output_path):
if arguments.verbose:
print(f"Output directory '{arguments.output_path}' already exists.")
else:
if arguments.verbose:
print(f"Creating output directory '{arguments.output_path}' ...")
os.mkdir(arguments.output_path)
###################################
# Get fingerprints for substances #
###################################
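# Dispatch on the resolved library type: structure files (.sdf, .mol2) are
# fingerprinted with the corresponding chemical_space helpers and the requested
# fields are written to substance_info.tsv, while tabular inputs (.tsv, .csv,
# .smi) are fingerprinted directly from their SMILES strings.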
if library_type == "sdf":
if arguments.library_fields is None:
library_fields = ['_Name']
else:
library_fields = arguments.library_fields
if arguments.substance_id_field is None:
substance_id_field = '_Name'
else:
substance_id_field = arguments.substance_id_field
if substance_id_field not in library_fields:
print(f"ERROR: substance_id_field {substance_id_field} is not in the library_fields [{', '.join(library_fields)}].")
exit(1)
substance_info, fingerprints = chemical_space.generate_fingerprints_sdf(
library_path = arguments.library_path,
fields = library_fields,
fingerprint_type = arguments.fingerprint_type,
device = arguments.device,
verbose = arguments.verbose)
substance_info.to_csv(f"{arguments.output_path}/substance_info.tsv", sep="\t")
substance_ids = substance_info[substance_id_field]
elif library_type == "mol2":
if arguments.library_fields is None:
library_fields = ['_Name']
else:
library_fields = arguments.library_fields
if arguments.substance_id_field is None:
substance_id_field = '_Name'
else:
substance_id_field = arguments.substance_id_field
if substance_id_field not in library_fields:
print(f"ERROR: substance_id_field {substance_id_field} is not in the library_fields [{', '.join(library_fields)}].")
exit(1)
substance_info, fingerprints = chemical_space.generate_fingerprints_mol2(
library_path = arguments.library_path,
fields = library_fields,
fingerprint_type = arguments.fingerprint_type,
device = arguments.device,
verbose = arguments.verbose)
substance_info.to_csv(f"{arguments.output_path}/substance_info.tsv", sep="\t")
substance_ids = substance_info[substance_id_field]
elif library_type == "tsv":
substance_data = pd.read_csv(arguments.library_path, sep = "\t")
if arguments.smiles_field not in substance_data.columns:
print(f"ERROR: The field is not in '{arguments.smiles_field}' the library file '{arguments.library_path}'")
print(f"ERROR: Possible smiles columns are [{', '.join([name for name in substance_data.columns])}]")
exit(1)
if arguments.substance_id_field not in substance_data.columns:
print(f"ERROR: The field is not in '{arguments.substance_id_field}' the library file '{arguments.library_path}'")
print(f"Possible substance_id columns are [{', '.join([name for name in substance_data.columns])}]")
exit(1)
substance_ids, fingerprints = chemical_space.generate_fingerprints_smiles(
smiles = substance_data[arguments.smiles_field],
substance_ids = substance_data[arguments.substance_id_field],
fingerprint_type = arguments.fingerprint_type,
fingerprint_n_bits = arguments.fingerprint_n_bits,
device = arguments.device,
verbose = arguments.verbose)
elif library_type == "csv":
substance_data = pd.read_csv(arguments.library_path, sep = ",")
if arguments.smiles_field not in substance_data.columns:
print(f"ERROR: The field is not in '{arguments.smiles_field}' the library file '{arguments.library_path}'")
print(f"Possible smiles columns are [{', '.join([name for name in substance_data.columns])}]")
exit(1)
if arguments.substance_id_field not in substance_data.columns:
print(f"ERROR: The field is not in '{arguments.substance_id_field}' the library file '{arguments.library_path}'")
print(f"ERROR: Possible smiles columns are [{', '.join([name for name in substance_data.columns])}]")
exit(1)
substance_ids, fingerprints = chemical_space.generate_fingerprints_smiles(
smiles = substance_data[arguments.smiles_field],
substance_ids = substance_data[arguments.substance_id_field],
fingerprint_type = arguments.fingerprint_type,
fingerprint_n_bits = arguments.fingerprint_n_bits,
device = arguments.device,
verbose = arguments.verbose)
elif library_type == "smi":
substance_data = pd.read_csv(
arguments.library_path,
sep = " ",
names = ["smiles", "substance_id"])
substance_ids, fingerprints = chemical_space.generate_fingerprints_smiles(
smiles = substance_data["smiles"],
substance_ids = substance_data["substance_id"],
fingerprint_type = arguments.fingerprint_type,
fingerprint_n_bits = arguments.fingerprint_n_bits,
device = arguments.device,
verbose = arguments.verbose)
else:
print(f"ERROR: Unrecognized library_path extension or specified file type '{arguments.library_path}'")
print(f"ERROR: Recognized extensions are ['sdf', 'sdf.gz', 'mol2', 'tsv', 'csv', 'smi']")
exit(1)
################
# Save results #
################
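# The fingerprints are written as a parquet table with one row per substance
# (a 'substance_id' column followed by one column per fingerprint bit or
# feature), alongside fingerprint_feature_columns.tsv describing the feature
# columns and fingerprints_info.tsv recording the run parameters. The parquet
# file can be read back with e.g. pandas.read_parquet(...) or
# pyarrow.parquet.read_table(...).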
if len(fingerprints) > 0:
feature_columns = pd.DataFrame({
'feature' : [ f"bit_{i}" for i in range(fingerprints.shape[1])],
'transform' : ["identity" for i in range(fingerprints.shape[1])]
})
feature_columns.to_csv(f"{arguments.output_path}/fingerprint_feature_columns.tsv", sep = "\t")
fingerprints_df = pd.DataFrame(fingerprints, columns = feature_columns['feature'])
if type(substance_ids) == np.ndarray and substance_ids.dtype == np.dtype('O'):
substance_ids = substance_ids.astype(str)
fingerprints_df.insert(loc=0, column="substance_id", value=substance_ids)
print(f"Writing fingerprints DataFrame to disk '{arguments.output_path}/fingerprints.parquet")
pa.parquet.write_table(
table=pa.Table.from_pandas(fingerprints_df),
where = f"{arguments.output_path}/fingerprints.parquet")
with open(f"{arguments.output_path}/fingerprints_info.tsv", 'w') as f:
f.write("key\tvalue\n")
f.write(f"library_type\n{arguments.library_type}\n")
f.write(f"library_path\n{arguments.library_path}\n")
if library_type in ["sdf", "mol2"]:
f.write(f"library_fields\n{';'.join(arugments.library_fields)}\n")
elif library_type == "tsv":
f.write(f"substancce_id_field\t{arguments.substance_id_field}\n")
f.write(f"smiles_field\t{arguments.smiles_field}\n")
f.write(f"fingerprint_type\t{arguments.fingerprint_type}\n")
if arguments.fingerprint_type == "ECFP4":
f.write(f"fingerprint_n_bits\t{arguments.fingerprint_n_bits}\n")
elif arguments.fingerprint_type.startswith("huggingface"):
f.write(f"device\t{arguments.device}\n")
f.write(f"n_substances\n{len(fingerprints)}\n")
else:
if arguments.verbose:
print("WARNING: No fingerprints were generated.")
if __name__ == '__main__':
main(argv = sys.argv)