First-ever hybrid tool for peptide identification

StSchulze · Nov 29, 2017 · d5bf3b1 · d5bf3b1
1 parent d2b271f
commit d5bf3b1
Show file tree

Hide file tree

Showing 7 changed files with 1,175 additions and 265 deletions.
diff --git a/README.md b/README.md
@@ -20,16 +20,20 @@ More details are available in our publications:
 
 **If you want to use the models in our PNAS paper, please use the branch PNAS**.
 
-## Latest update
+## Latest updates
+
+- The first-ever hybrid tool for peptide identification that integrates de novo
+sequencing and database search into the same scoring and sequencing framework.
+DeepNovo now has three sequencing modes: `search_denovo()`, `search_db()`, and 
+`search_hybrid()`.
+
+- Added decoy database search to estimate False Discovery Rate (FDR). The FDR
+can be used to filter both database search and de novo sequencing results.
 
 - Replaced DecodingModel by ModelInference to make the code of building neural
 network models easy to understand and for further development.
 
-- Updated de novo sequencing function, `decode()`, and database search function,
-`search_db()`, to work with ModelInference.
-
-- What's next: clean up the code of TrainingModel, training and de novo sequencing
-functions.
+- What's next: clean up the code of TrainingModel and the training function.
 
 Those updates still work with the pre-trained model, training and testing data
 provided earlier (version 0.0.1 below).

diff --git a/deepnovo_config.py b/deepnovo_config.py
@@ -59,17 +59,33 @@
                             "Set to True for beam search.")
 
 tf.app.flags.DEFINE_integer("beam_size",
-                            1,
+                            5,
                             "Number of optimal paths to search during decoding.")
 
 tf.app.flags.DEFINE_boolean("search_db",
                             False,
-                            "Set to True to perform a database search.")
+                            "Set to True to do a database search.")
+
+tf.app.flags.DEFINE_boolean("search_denovo",
+                            False,
+                            "Set to True to do a denovo search.")
+
+tf.app.flags.DEFINE_boolean("search_hybrid",
+                            False,
+                            "Set to True to do a hybrid, db+denovo, search.")
 
 tf.app.flags.DEFINE_boolean("test",
                             False,
                             "Set to True to test the prediction accuracy.")
 
+tf.app.flags.DEFINE_boolean("header_seq",
+                            True,
+                            "Set to False if peptide sequence is not provided.")
+
+tf.app.flags.DEFINE_boolean("decoy",
+                            False,
+                            "Set to True to search decoy database.")
+
 FLAGS = tf.app.flags.FLAGS
 
 
@@ -373,21 +389,48 @@
 
 # YEAST-LOW-COON_2013-PEAKS-DB-DUP
 data_format = "mgf"
-db_fasta_file = "data/uniprot_sprot.yeast.fasta"
 cleavage_rule = "trypsin"
 num_missed_cleavage = 2
 fixed_mod_list = ['C']
 var_mod_list = ['N', 'Q', 'M']
-mass_tolerance = 0.01 # Da
-ppm = 10.0/1000000 # ppm (20 better) # instead of absolute 0.01 Da
-input_file_train = "data.training/yeast.low.coon_2013/peaks.db.mgf.train.dup"
-input_file_valid = "data.training/yeast.low.coon_2013/peaks.db.mgf.valid.dup"
-input_file_test = "data.training/yeast.low.coon_2013/peaks.db.mgf.test.dup"
-decode_test_file = "data.training/yeast.low.coon_2013/peaks.db.mgf.test.dup"
-input_file = "data.training/yeast.low.coon_2013/peaks.db.mgf.test.dup"
-output_file = FLAGS.train_dir + "/output.deepnovo_db.tab"
-target_file = input_file + ".target"
-predicted_file = output_file
+precursor_mass_tolerance = 0.01 # Da
+precursor_mass_ppm = 10.0/1000000 # ppm (20 better) # instead of absolute 0.01 Da
+knapsack_file = "knapsack.npy"
+# training/testing/decoding files
+#~ input_file_train = "data.training/yeast.low.coon_2013/peaks.db.mgf.train.dup"
+#~ input_file_valid = "data.training/yeast.low.coon_2013/peaks.db.mgf.valid.dup"
+#~ input_file_test = "data.training/yeast.low.coon_2013/peaks.db.mgf.test.dup"
+#~ decode_test_file = "data.training/yeast.low.coon_2013/peaks.db.mgf.test.dup"
+#~ input_file_train = "data.training/yeast.low.exclude_coon_2013/peaks.db.mgf.train.dup"
+#~ input_file_valid = "data.training/yeast.low.exclude_coon_2013/peaks.db.mgf.valid.dup"
+#~ input_file_test = "data.training/yeast.low.exclude_coon_2013/peaks.db.mgf.test.dup"
+#~ decode_test_file = "data.training/yeast.low.exclude_coon_2013/peaks.db.mgf.test.dup"
+#~ input_file_train = "data.training/yeast.low.exclude_heinemann_2015/peaks.db.mgf.train.dup"
+#~ input_file_valid = "data.training/yeast.low.exclude_heinemann_2015/peaks.db.mgf.valid.dup"
+#~ input_file_test = "data.training/yeast.low.exclude_heinemann_2015/peaks.db.mgf.test.dup"
+#~ decode_test_file = "data.training/yeast.low.exclude_heinemann_2015/peaks.db.mgf.test.dup"
+input_file_train = "data.training/dia.xchen.nov27/fraction_1.mgf.split.train.dup"
+input_file_valid = "data.training/dia.xchen.nov27/fraction_1.mgf.split.valid.dup"
+input_file_test = "data.training/dia.xchen.nov27/fraction_1.mgf.split.test.dup"
+decode_test_file = "data.training/dia.xchen.nov27/fraction_1.mgf.split.test.dup"
+# denovo files
+denovo_input_file = "data.training/dia.xchen.nov27/fraction_1.mgf.split.test.dup"
+denovo_output_file = denovo_input_file + ".deepnovo_denovo"
+# db files
+db_fasta_file = "data/uniprot_sprot.human.fasta"
+db_input_file = "data.training/dia.xchen.nov27/fraction_1.mgf.split.test.dup"
+db_output_file = db_input_file + ".deepnovo_db"
+if FLAGS.decoy:
+  db_output_file += ".decoy"
+# hybrid files
+hybrid_input_file = "data.training/yeast.low.heinemann_2015/peaks.db.mgf.test.dup"
+hybrid_denovo_file = hybrid_input_file + ".deepnovo_hybrid_denovo"
+hybrid_output_file = hybrid_input_file + ".deepnovo_hybrid"
+if FLAGS.decoy:
+  hybrid_output_file += ".decoy"
+# test accuracy
 predicted_format = "deepnovo"
-accuracy_file = "accuracy.deepnovo_db.tab"
+target_file = "data.training/dia.xchen.nov27/fraction_1.mgf.split.test.dup.target"
+predicted_file = denovo_output_file
+accuracy_file = predicted_file + ".accuracy"
 # ==============================================================================
diff --git a/deepnovo_main.py b/deepnovo_main.py
@@ -14,6 +14,7 @@
 import deepnovo_config
 import deepnovo_model
 import deepnovo_worker_db
+import deepnovo_worker_denovo
 import deepnovo_worker_io
 import deepnovo_worker_test
 import deepnovo_main_modules
@@ -32,22 +33,47 @@ def main(_):
     deepnovo_main_modules.test_true_feeding()
   elif deepnovo_config.FLAGS.decode:
     deepnovo_main_modules.decode()
+  elif deepnovo_config.FLAGS.search_denovo:
+    model = deepnovo_model.ModelInference()
+    model.build_model()
+    worker_io = deepnovo_worker_io.WorkerIO(
+        input_file=deepnovo_config.denovo_input_file,
+        output_file=deepnovo_config.denovo_output_file)
+    worker_denovo = deepnovo_worker_denovo.WorkerDenovo()
+    worker_denovo.search_denovo(model, worker_io)
   elif deepnovo_config.FLAGS.search_db:
-    #~ model = deepnovo_model.DecodingModel()
     model = deepnovo_model.ModelInference()
     model.build_model()
     worker_io = deepnovo_worker_io.WorkerIO(
-        input_file=deepnovo_config.input_file,
-        output_file=deepnovo_config.output_file)
+        input_file=deepnovo_config.db_input_file,
+        output_file=deepnovo_config.db_output_file)
     worker_db = deepnovo_worker_db.WorkerDB()
     worker_db.build_db()
     worker_db.search_db(model, worker_io)
-    # due to some mistakes in cleavage rules, we need worker_db.peptide_list to
-    #   check for consistency
-    worker_test = deepnovo_worker_test.WorkerTest()
-    worker_test.test_accuracy(worker_db.peptide_list)
+  elif deepnovo_config.FLAGS.search_hybrid:
+    model = deepnovo_model.ModelInference()
+    model.build_model()
+    # denovo search
+    worker_io = deepnovo_worker_io.WorkerIO(
+        input_file=deepnovo_config.hybrid_input_file,
+        output_file=deepnovo_config.hybrid_denovo_file)
+    worker_denovo = deepnovo_worker_denovo.WorkerDenovo()
+    predicted_denovo_list = worker_denovo.search_denovo(model, worker_io)
+    # db search with predicted_denovo_list
+    worker_io = deepnovo_worker_io.WorkerIO(
+        input_file=deepnovo_config.hybrid_input_file,
+        output_file=deepnovo_config.hybrid_output_file)
+    worker_db = deepnovo_worker_db.WorkerDB()
+    worker_db.build_db()
+    worker_db.search_db(model, worker_io, predicted_denovo_list)
   elif deepnovo_config.FLAGS.test:
-    pass
+    # test 1%FDR
+    #~ worker_db = deepnovo_worker_db.WorkerDB()
+    #~ worker_db.build_db()
+    #~ worker_test = deepnovo_worker_test.WorkerTest()
+    #~ worker_test.test_accuracy(worker_db.peptide_list)
+    worker_test = deepnovo_worker_test.WorkerTest()
+    worker_test.test_accuracy()
   else:
     print("ERROR: wrong option!")
     sys.exit()