from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from deepnovo_preprocess import *
from deepnovo_postprocess import *
import aa_workflow_step_4_2
import aa_workflow_step_5
data_fasta_dir = "data.fasta/"
patient_id = "Mel16"
data_training_dir = "data.training/aa.hla.bassani.nature_2016.mel_16.class_1/"
num_fractions = 11
model_dir = "train.mel_16.class_1" # before training, create this empty folder at the same level as the Python scripts.
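# A minimal sketch (optional; not part of the original workflow): create the
# model folder programmatically instead of by hand.
# ======================= OPTIONAL SKETCH ========================================
# ~ if not os.path.exists(model_dir):
# ~     os.makedirs(model_dir)
# ================================================================================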
# ================================================================================
# Workflow of neoantigen discovery by personalized de novo sequencing.
# ================================================================================
# Step-by-step instructions based on the following example dataset:
# Patient Mel-16 (Bassani-Sternberg et al., Nature Communication, 2016)
# HLA class 1: 12 raw files, 1 failed to run PEAKS
# 20141212_QEp7_MiBa_SA_HLA-I-p_MM16_1_A
# 20141212_QEp7_MiBa_SA_HLA-I-p_MM16_1_B
# 20141212_QEp7_MiBa_SA_HLA-I-p_MM16_2_A
# 20141212_QEp7_MiBa_SA_HLA-I-p_MM16_2_B
# 20141212_QEp7_MiBa_SA_HLA-I-p_MM16_3_A
# 20141212_QEp7_MiBa_SA_HLA-I-p_MM16_3_B
# 20141213_QEp7_MiBa_SA_HLA-I-p_MM16_1_A_1
# 20141213_QEp7_MiBa_SA_HLA-I-p_MM16_1_B_1, failed
# 20141213_QEp7_MiBa_SA_HLA-I-p_MM16_2_A_1
# 20141213_QEp7_MiBa_SA_HLA-I-p_MM16_2_B_1
# 20141213_QEp7_MiBa_SA_HLA-I-p_MM16_3_A_1
# 20141213_QEp7_MiBa_SA_HLA-I-p_MM16_3_B_1
# ================================================================================
# Step 1: Build the immunopeptidome of the patient.
# ================================================================================
# This step 1 took about ?? hours on a laptop with an i7 CPU (4 cores) and 16 GB memory.
# ================================================================================
# Step 1.1: Run PEAKS X DB search on the raw files with the following parameters:
# ================================================================================
# Enzyme: None
# Instrument: Orbi-Orbi
# Fragment: HCD
# Acquisition: DDA
# Parent Mass Error Tolerance: 15.0 ppm
# Fragment Mass Error Tolerance: 0.05 Da
# Precursor Mass Search Type: monoisotopic
# Digest Mode: Unspecific
# Max Missed Cleavages: 100
# Variable Modifications:
# Oxidation (M): 15.99
# Deamidation (NQ): 0.98
# Max Variable PTM Per Peptide: 3
# Database: uniprot_sprot.human
# Taxon: All
# Contaminant Database: contaminants_maxquant
# Searched Entry: 20488
# FDR Estimation: Enabled
# Merge Options: no merge
# Precursor Options: corrected
# Charge Options: no correction
# Filter Options: no filter
# Process: true
# Associate chimera: no
# ================================================================================
# Step 1.2: Set FDR 1.0%.
# ================================================================================
# The number of MS/MS spectra is "694565", the number of peptide-spectrum matches (PSMs) is "207332", and the number of peptide sequences is "26594".
# ================================================================================
# Step 1.3: Right-click on the DB search node "??" and select "Deep Denovo Export".
# ================================================================================
# We will get the following 11 pairs of csv and mgf files in the PEAKS project folder:
# export_0.csv, export_0.mgf
# export_1.csv, export_1.mgf
# export_2.csv, export_2.mgf
# export_3.csv, export_3.mgf
# export_4.csv, export_4.mgf
# export_5.csv, export_5.mgf
# export_6.csv, export_6.mgf
# export_7.csv, export_7.mgf
# export_8.csv, export_8.mgf
# export_9.csv, export_9.mgf
# export_10.csv, export_10.mgf
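# A minimal pre-flight check (sketch; assumes the export files were copied into
# data_training_dir): verify that all csv/mgf pairs are present before merging.
# ======================= OPTIONAL SKETCH ========================================
# ~ for i in range(num_fractions):
# ~     for ext in [".csv", ".mgf"]:
# ~         path = data_training_dir + "export_" + str(i) + ext
# ~         if not os.path.exists(path):
# ~             print("MISSING:", path)
# ================================================================================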
# ================================================================================
# Step 2: Train personalized DeepNovo model.
# ================================================================================
# This step 2 took about 12 hours on a server with a Titan X GPU and 32 GB memory.
# Note that you will need to specify the paths to your own data and model folders when you run the Python scripts. The following scripts just show examples of my data and model folders.
# ================================================================================
# Step 2.1: Prepare the training data.
# ================================================================================
# Run merge_mgf_file() and merge_feature_file()
# ======================= UNCOMMENT and RUN ======================================
# ~ folder_path = data_training_dir
# ~ fraction_list = range(0, num_fractions)
# ~ merge_mgf_file(
# ~ input_file_list=[folder_path + "export_" + str(i) + ".mgf" for i in fraction_list],
# ~ fraction_list=fraction_list,
# ~ output_file=folder_path + "spectrum.mgf")
# ~ merge_feature_file(
# ~ input_file_list=[folder_path + "export_" + str(i) + ".csv" for i in fraction_list],
# ~ fraction_list=fraction_list,
# ~ output_file=folder_path + "feature.csv")
# ================================================================================
# We will get two output files in the same folder: "spectrum.mgf" and "feature.csv".
# Both functions also report the number of entries that have been processed: "counter = 694565".
# That number should be the same as the total number of MS/MS spectra from the raw files.
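# A quick sanity check (sketch, not part of the original workflow): every
# spectrum in an mgf file starts with a "BEGIN IONS" line, so counting those
# lines in "spectrum.mgf" should reproduce the reported counter.
# ======================= OPTIONAL SKETCH ========================================
# ~ with open(data_training_dir + "spectrum.mgf") as handle:
# ~     num_spectra = sum(1 for line in handle if line.startswith("BEGIN IONS"))
# ~ print("num_spectra =", num_spectra)  # expect 694565
# ================================================================================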
# Run split_feature_unlabel()
# ======================= UNCOMMENT and RUN ======================================
# ~ input_feature_file = data_training_dir + "feature.csv"
# ~ split_feature_unlabel(input_feature_file)
# ================================================================================
# It will split the "feature.csv" into 2 files: "feature.csv.labeled" and "feature.csv.unlabeled".
# It also reports the numbers of labeled and unlabeled features: "num_labeled = 207332" and "num_unlabeled = 487233".
# Note that "207332" is also the number of PSMs reported at FDR 1.0% in Step 1.
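# For intuition: a labeled feature is one that carries a peptide identification
# from the database search, an unlabeled feature does not. A rough recount
# (sketch; assumes the identification column is named "seq"):
# ======================= OPTIONAL SKETCH ========================================
# ~ import csv
# ~ with open(data_training_dir + "feature.csv") as handle:
# ~     num_labeled = sum(1 for row in csv.DictReader(handle) if row["seq"].strip())
# ~ print("num_labeled =", num_labeled)  # expect 207332
# ================================================================================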
# Run calculate_mass_shift_ppm() and correct_mass_shift_ppm()
# ======================= UNCOMMENT and RUN ======================================
# ~ labeled_feature_file = data_training_dir + "feature.csv.labeled"
# ~ ppm = calculate_mass_shift_ppm(labeled_feature_file)
# ~ input_feature_file = data_training_dir + "feature.csv.labeled"
# ~ correct_mass_shift_ppm(input_feature_file, ppm)
# ~ input_feature_file = data_training_dir + "feature.csv"
# ~ correct_mass_shift_ppm(input_feature_file, ppm)
# ================================================================================
# The mass shift is calculated from "feature.csv.labeled".
# The mass shift in ppm (parts per million) is reported as: "mean_precursor_ppm = 7.07514819678".
# The mass is then corrected in 2 output files: "feature.csv.labeled.mass_corrected" and "feature.csv.mass_corrected".
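# The underlying arithmetic, as a sketch of the idea (not the exact library code):
# for each labeled feature, ppm = (observed_mz - theoretical_mz) / theoretical_mz * 1e6,
# and the mean ppm is then used to rescale every precursor m/z.
# ======================= OPTIONAL SKETCH ========================================
# ~ def correct_mz(observed_mz, mean_ppm):
# ~     """Rescale an observed precursor m/z by the mean mass shift in ppm."""
# ~     return observed_mz * (1.0 - mean_ppm * 1e-6)
# ~ print(correct_mz(500.0035, 7.07514819678))  # ~500.0000 after correction
# ================================================================================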
# Run split_feature_training_noshare()
# ======================= UNCOMMENT and RUN ======================================
# ~ input_feature_file = data_training_dir + "feature.csv.labeled.mass_corrected"
# ~ proportion = [0.90, 0.05, 0.05]
# ~ split_feature_training_noshare(input_feature_file, proportion)
# ================================================================================
# It will split "feature.csv.labeled.mass_corrected" into train/valid/test sets with "proportion = [0.9, 0.05, 0.05]".
# Those 3 sets do not share common peptides.
# Their sizes are also reported.
# "num_total = 207332"
# "num_unique = 26656"
# "num_train = 185823"
# "num_valid = 10900"
# "num_test = 10609"
# ================================================================================
# Step 2.2: Train the DeepNovo model.
# ================================================================================
# Run DeepNovo training
# The training will stop after 10 epochs. The model with the best performance on the valid set, "ckpt-16200", is saved in the model folder "train.mel_16.class_1".
# ======================= UNCOMMENT and RUN ======================================
# ~ command = ["LD_PRELOAD=\"/usr/lib/libtcmalloc.so\" /usr/bin/time -v python deepnovo_main.py --train"]
# ~ command += ["--train_dir", model_dir]
# ~ command += ["--train_spectrum", data_training_dir + "spectrum.mgf"]
# ~ command += ["--train_feature", data_training_dir + "feature.csv.labeled.mass_corrected.train.noshare"]
# ~ command += ["--valid_spectrum", data_training_dir + "spectrum.mgf"]
# ~ command += ["--valid_feature", data_training_dir + "feature.csv.labeled.mass_corrected.valid.noshare"]
# ~ command += ["--reset_step"]
# ~ command = " ".join(command)
# ~ print(command)
# ~ os.system(command)
# ================================================================================
# Run DeepNovo testing
# ======================= UNCOMMENT and RUN ======================================
# ~ command = ["LD_PRELOAD=\"/usr/lib/libtcmalloc.so\" /usr/bin/time -v python deepnovo_main.py --test_true_feeding"]
# ~ command += ["--train_dir", model_dir]
# ~ command += ["--test_spectrum", data_training_dir + "spectrum.mgf"]
# ~ command += ["--test_feature", data_training_dir + "feature.csv.labeled.mass_corrected.test.noshare"]
# ~ command = " ".join(command)
# ~ print(command)
# ~ os.system(command)
# ~ command = ["LD_PRELOAD=\"/usr/lib/libtcmalloc.so\" /usr/bin/time -v python deepnovo_main.py --search_denovo"]
# ~ command += ["--train_dir", model_dir]
# ~ command += ["--denovo_spectrum", data_training_dir + "spectrum.mgf"]
# ~ command += ["--denovo_feature", data_training_dir + "feature.csv.labeled.mass_corrected.test.noshare"]
# ~ command = " ".join(command)
# ~ print(command)
# ~ os.system(command)
# ~ command = ["LD_PRELOAD=\"/usr/lib/libtcmalloc.so\" /usr/bin/time -v python deepnovo_main.py --test"]
# ~ command += ["--target_file", data_training_dir + "feature.csv.labeled.mass_corrected.test.noshare"]
# ~ command += ["--predicted_file", data_training_dir + "feature.csv.labeled.mass_corrected.test.noshare.deepnovo_denovo"]
# ~ command = " ".join(command)
# ~ print(command)
# ~ os.system(command)
# ================================================================================
# The testing accuracy at the amino acid (AA) and peptide levels will be reported as follows:
# "precision_AA_mass_db = 0.8425"
# "precision_peptide_mass_db = 0.6430"
# ================================================================================
# Step 3: Perform personalized de novo sequencing with DeepNovo.
# ================================================================================
# This step 3 took about 5 hours on a server with a Titan X GPU and 32 GB memory.
# Run DeepNovo de novo sequencing on all features (labeled and unlabeled)
# ======================= UNCOMMENT and RUN ======================================
# ~ command = ["LD_PRELOAD=\"/usr/lib/libtcmalloc.so\" /usr/bin/time -v python deepnovo_main.py --search_denovo"]
# ~ command += ["--train_dir", model_dir]
# ~ command += ["--denovo_spectrum", data_training_dir + "spectrum.mgf"]
# ~ command += ["--denovo_feature", data_training_dir + "feature.csv.mass_corrected"]
# ~ command = " ".join(command)
# ~ print(command)
# ~ os.system(command)
# ================================================================================
# The de novo results will be written to the file "feature.csv.mass_corrected.deepnovo_denovo".
# The tool will also report the number of features that have been processed:
# "Total spectra: 694565"
# "read: 690354"
# "skipped: 4211"
# "by mass: 4211"
# ================================================================================
# Step 4: Quality control.
# ================================================================================
# ================================================================================
# Step 4.1: Post-process de novo results to improve their accuracy.
# ================================================================================
# Run select_top_score()
# This script selects a threshold on the de novo confidence scores and uses it to filter the de novo results.
# The threshold is chosen so that predictions scoring above it reach 95% accuracy on the test set from the end of Step 2 (see the sketch after the reported numbers below).
# ======================= UNCOMMENT and RUN ======================================
# ~ accuracy_cutoff = 0.95
# ~ accuracy_file = data_training_dir + "feature.csv.labeled.mass_corrected.test.noshare.deepnovo_denovo.accuracy"
# ~ score_cutoff = find_score_cutoff(accuracy_file, accuracy_cutoff)
# ~ input_file = data_training_dir + "feature.csv.mass_corrected.deepnovo_denovo"
# ~ output_file = input_file + ".top95"
# ~ select_top_score(input_file, output_file, score_cutoff)
# ================================================================================
# After this step we'll get the file "feature.csv.mass_corrected.deepnovo_denovo.top95".
# The score cutoff and the number of selected features will also be reported:
# "score_cutoff = -0.5"
# "total_feature = 690354"
# "select_feature = 233589"
# Run convert_I_to_L()
# This script converts I (Isoleucine) to L (Leucine) in all de novo peptides, because de novo sequencing is not able to distinguish them.
# ======================= UNCOMMENT and RUN ======================================
# ~ input_file = data_training_dir + "feature.csv.mass_corrected.deepnovo_denovo.top95"
# ~ output_file = input_file + ".I_to_L"
# ~ convert_I_to_L(input_file, output_file)
# ================================================================================
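# I (Isoleucine) and L (Leucine) are isomers with identical residue mass
# (113.08406 Da), so mass-based de novo sequencing cannot tell them apart;
# the conversion is a plain character replacement (minimal sketch):
# ======================= OPTIONAL SKETCH ========================================
# ~ print("PEPTIDE".replace("I", "L"))  # prints "PEPTLDE"
# ================================================================================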
# Run correct_by_consensus()
# This script corrects de novo sequencing errors by grouping predicted sequences of the same mass together and voting on a consensus sequence (a sketch of the voting idea follows the code block).
# ======================= UNCOMMENT and RUN ======================================
# ~ input_file = data_training_dir + "feature.csv.mass_corrected.deepnovo_denovo.top95.I_to_L"
# ~ output_file = input_file + ".consensus"
# ~ correct_by_consensus(input_file, output_file)
# ================================================================================
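# A minimal sketch of the voting idea (assumption: grouping by precursor mass
# rounded to 0.01 Da; the real correct_by_consensus is more careful):
# ======================= OPTIONAL SKETCH ========================================
# ~ from collections import Counter, defaultdict
# ~ def consensus_sketch(records):
# ~     """records: (precursor_mass, peptide) pairs; vote within mass groups."""
# ~     groups = defaultdict(list)
# ~     for mass, peptide in records:
# ~         groups[round(mass, 2)].append(peptide)
# ~     return dict((mass, Counter(peps).most_common(1)[0][0])
# ~                 for mass, peps in groups.items())
# ================================================================================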
# Run filter_by_minlen()
# This script filters out sequences of length less than 5 amino acids.
# ======================= UNCOMMENT and RUN ======================================
# ~ minlen = 5
# ~ input_file = data_training_dir + "feature.csv.mass_corrected.deepnovo_denovo.top95.I_to_L.consensus"
# ~ output_file = input_file + ".minlen" + str(minlen)
# ~ filter_by_minlen(input_file, output_file, minlen)
# ================================================================================
# The numbers of features will be reported as:
# "total_feature = 233589"
# "minlen_feature = 223507"
# "removed_feature = 10082"
# Up to this step, we get the following file:
# "feature.csv.mass_corrected.deepnovo_denovo.top95.I_to_L.consensus.minlen5"
# We test its accuracy against the test set:
# Run DeepNovo testing
# ======================= UNCOMMENT and RUN ======================================
# ~ command = ["LD_PRELOAD=\"/usr/lib/libtcmalloc.so\" /usr/bin/time -v python deepnovo_main.py --test"]
# ~ command += ["--target_file", data_training_dir + "feature.csv.labeled.mass_corrected.test.noshare"]
# ~ command += ["--predicted_file", data_training_dir + "feature.csv.mass_corrected.deepnovo_denovo.top95.I_to_L.consensus.minlen5"]
# ~ command = " ".join(command)
# ~ print(command)
# ~ os.system(command)
# ================================================================================
# We get these results:
# "precision_AA_mass_db = 0.9530"
# "precision_peptide_mass_db = 0.8441"
# Repeat the same testing but now against all labeled features:
# Run DeepNovo testing
# ======================= UNCOMMENT and RUN ======================================
# ~ command = ["LD_PRELOAD=\"/usr/lib/libtcmalloc.so\" /usr/bin/time -v python deepnovo_main.py --test"]
# ~ command += ["--target_file", data_training_dir + "feature.csv.labeled.mass_corrected"]
# ~ command += ["--predicted_file", data_training_dir + "feature.csv.mass_corrected.deepnovo_denovo.top95.I_to_L.consensus.minlen5"]
# ~ command = " ".join(command)
# ~ print(command)
# ~ os.system(command)
# ================================================================================
# We get these results:
# "precision_AA_mass_db = 0.9797"
# "precision_peptide_mass_db = 0.9371"
# Note that these accuracy results look better than those against the test set, because the labeled features include the training data the model has already seen, whereas the test set was held out from training.
# The number of de novo only features is also reported as
# "predicted_only: 68721"
# and they are written to the file
# "feature.csv.mass_corrected.deepnovo_denovo.top95.I_to_L.consensus.minlen5.denovo_only"
# ================================================================================
# Step 4.2: Run second round of PEAKS X DB search against the list of database and de novo peptides.
# ================================================================================
# Before running PEAKS, we need to combine database and de novo peptides into a list.
# This script will select unique de novo peptides, filter out those that belong to the human Swiss-Prot protein database, and combine the remaining de novo peptides and the database peptides identified from Step 1 into a fasta file (a sketch of the gist follows the reported numbers below).
# ======================= UNCOMMENT and RUN ======================================
# ~ aa_workflow_step_4_2.preprocess(
# ~ denovo_file=data_training_dir + "feature.csv.mass_corrected.deepnovo_denovo.top95.I_to_L.consensus.minlen5.denovo_only",
# ~ db_fasta_file=data_fasta_dir + "uniprot_sprot.human.plus_contaminants.fasta",
# ~ labeled_feature_file=data_training_dir + "feature.csv.labeled.mass_corrected",
# ~ peptide_list_fasta=data_training_dir + "aa_workflow.step_4.peptide_list.fasta")
# ================================================================================
# The numbers of de novo and database peptides are reported as follows:
# "Number of top-scoring denovo peptides: 17318"
# "num_db_peptides = 25274"
# "num_denovo_peptides = 6444" (not in database)
# Run the PEAKS X DB search as follows:
# Select the DENOVO node result from Step 1.1, and select PEAKS DB search;
# Select option "No digestion" for "Digest mode";
# Select the fasta file "aa_workflow.step_4.peptide_list.fasta" as the only database, no contaminant;
# Leave other settings the same as in Step 1.1.
# Set FDR 1.0%, export the "DB search psm.csv" file, and rename it to "aa_workflow.step_4.psm.csv".
# Extract de novo peptides from the PSMs of PEAKS X DB search round 2.
# ======================= UNCOMMENT and RUN ======================================
# ~ aa_workflow_step_4_2.postprocess(
# ~ psm_file = data_training_dir + "aa_workflow.step_4.psm.csv",
# ~ output_denovo_peptide_file = data_training_dir + "aa_workflow.step_4.output_peptide_list")
# ================================================================================
# The number of de novo peptides is reported as follows:
# "num_denovo_peptides = 1259"
# ================================================================================
# Step 5: Neoantigen selection.
# ================================================================================
# Run aa_workflow_step_5.step_5()
# ======================= UNCOMMENT and RUN ======================================
# ~ aa_workflow_step_5.step_5(
# ~ psm_file=data_training_dir + "aa_workflow.step_4.psm.csv",
# ~ netmhc_file=data_training_dir + "aa_workflow.step_5.netmhcpan.csv",
# ~ immunogenicity_file=data_training_dir + "aa_workflow.step_5.immunogenicity.csv",
# ~ db_fasta_file=data_fasta_dir + "uniprot_sprot.human.plus_contaminants.fasta",
# ~ labeled_feature_file=data_training_dir + "feature.csv.labeled",
# ~ snp_file=data_training_dir + "aa_workflow.step_5.supp_data5_snp.csv",
# ~ snp_enst_fasta=data_training_dir + "aa_workflow.step_5.supp_data5_snp_enst.fasta",
# ~ snp_sample_id=patient_id,
# ~ output_neoantigen_criteria=data_training_dir + "aa_workflow.step_5.output_neoantigen_criteria.csv",
# ~ output_protein_mutation=data_training_dir + "aa_workflow.step_5.protein_mutation.csv")
# ================================================================================