Skip to content

Commit

Permalink
First-ever hybrid tool for peptide identification
Browse files Browse the repository at this point in the history
  • Loading branch information
nh2tran committed Nov 29, 2017
1 parent d2b271f commit d5bf3b1
Show file tree
Hide file tree
Showing 7 changed files with 1,175 additions and 265 deletions.
16 changes: 10 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,20 @@ More details are available in our publications:

**If you want to use the models in our PNAS paper, please use the branch PNAS**.

## Latest update
## Latest updates

- The first-ever hybrid tool for peptide identification that integrates de novo
sequencing and database search into the same scoring and sequencing framework.
DeepNovo now has three sequencing modes: `search_denovo()`, `search_db()`, and
`search_hybrid()`.

- Added a decoy database search to estimate the False Discovery Rate (FDR). The
FDR can be used to filter both database search and de novo sequencing results.

- Replaced DecodingModel with ModelInference to make the code for building neural
network models easier to understand and to develop further.

- Updated de novo sequencing function, `decode()`, and database search function,
`search_db()`, to work with ModelInference.

- What's next: clean up the code of TrainingModel, training and de novo sequencing
functions.
- What's next: clean up the code of TrainingModel and the training function.

These updates still work with the pre-trained model and the training and testing
data provided earlier (version 0.0.1 below).
Expand Down
71 changes: 57 additions & 14 deletions deepnovo_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,17 +59,33 @@
"Set to True for beam search.")

tf.app.flags.DEFINE_integer("beam_size",
1,
5,
"Number of optimal paths to search during decoding.")

tf.app.flags.DEFINE_boolean("search_db",
False,
"Set to True to perform a database search.")
"Set to True to do a database search.")

tf.app.flags.DEFINE_boolean("search_denovo",
False,
"Set to True to do a denovo search.")

tf.app.flags.DEFINE_boolean("search_hybrid",
False,
"Set to True to do a hybrid, db+denovo, search.")

tf.app.flags.DEFINE_boolean("test",
False,
"Set to True to test the prediction accuracy.")

tf.app.flags.DEFINE_boolean("header_seq",
True,
"Set to False if peptide sequence is not provided.")

tf.app.flags.DEFINE_boolean("decoy",
False,
"Set to True to search decoy database.")

FLAGS = tf.app.flags.FLAGS


Expand Down Expand Up @@ -373,21 +389,48 @@

# YEAST-LOW-COON_2013-PEAKS-DB-DUP
data_format = "mgf"
db_fasta_file = "data/uniprot_sprot.yeast.fasta"
cleavage_rule = "trypsin"
num_missed_cleavage = 2
fixed_mod_list = ['C']
var_mod_list = ['N', 'Q', 'M']
mass_tolerance = 0.01 # Da
ppm = 10.0/1000000 # ppm (20 better) # instead of absolute 0.01 Da
input_file_train = "data.training/yeast.low.coon_2013/peaks.db.mgf.train.dup"
input_file_valid = "data.training/yeast.low.coon_2013/peaks.db.mgf.valid.dup"
input_file_test = "data.training/yeast.low.coon_2013/peaks.db.mgf.test.dup"
decode_test_file = "data.training/yeast.low.coon_2013/peaks.db.mgf.test.dup"
input_file = "data.training/yeast.low.coon_2013/peaks.db.mgf.test.dup"
output_file = FLAGS.train_dir + "/output.deepnovo_db.tab"
target_file = input_file + ".target"
predicted_file = output_file
precursor_mass_tolerance = 0.01 # Da
precursor_mass_ppm = 10.0/1000000 # ppm (20 better) # instead of absolute 0.01 Da
knapsack_file = "knapsack.npy"
# training/testing/decoding files
#~ input_file_train = "data.training/yeast.low.coon_2013/peaks.db.mgf.train.dup"
#~ input_file_valid = "data.training/yeast.low.coon_2013/peaks.db.mgf.valid.dup"
#~ input_file_test = "data.training/yeast.low.coon_2013/peaks.db.mgf.test.dup"
#~ decode_test_file = "data.training/yeast.low.coon_2013/peaks.db.mgf.test.dup"
#~ input_file_train = "data.training/yeast.low.exclude_coon_2013/peaks.db.mgf.train.dup"
#~ input_file_valid = "data.training/yeast.low.exclude_coon_2013/peaks.db.mgf.valid.dup"
#~ input_file_test = "data.training/yeast.low.exclude_coon_2013/peaks.db.mgf.test.dup"
#~ decode_test_file = "data.training/yeast.low.exclude_coon_2013/peaks.db.mgf.test.dup"
#~ input_file_train = "data.training/yeast.low.exclude_heinemann_2015/peaks.db.mgf.train.dup"
#~ input_file_valid = "data.training/yeast.low.exclude_heinemann_2015/peaks.db.mgf.valid.dup"
#~ input_file_test = "data.training/yeast.low.exclude_heinemann_2015/peaks.db.mgf.test.dup"
#~ decode_test_file = "data.training/yeast.low.exclude_heinemann_2015/peaks.db.mgf.test.dup"
input_file_train = "data.training/dia.xchen.nov27/fraction_1.mgf.split.train.dup"
input_file_valid = "data.training/dia.xchen.nov27/fraction_1.mgf.split.valid.dup"
input_file_test = "data.training/dia.xchen.nov27/fraction_1.mgf.split.test.dup"
decode_test_file = "data.training/dia.xchen.nov27/fraction_1.mgf.split.test.dup"
# denovo files
denovo_input_file = "data.training/dia.xchen.nov27/fraction_1.mgf.split.test.dup"
denovo_output_file = denovo_input_file + ".deepnovo_denovo"
# db files
db_fasta_file = "data/uniprot_sprot.human.fasta"
db_input_file = "data.training/dia.xchen.nov27/fraction_1.mgf.split.test.dup"
db_output_file = db_input_file + ".deepnovo_db"
if FLAGS.decoy:
db_output_file += ".decoy"
# hybrid files
hybrid_input_file = "data.training/yeast.low.heinemann_2015/peaks.db.mgf.test.dup"
hybrid_denovo_file = hybrid_input_file + ".deepnovo_hybrid_denovo"
hybrid_output_file = hybrid_input_file + ".deepnovo_hybrid"
if FLAGS.decoy:
hybrid_output_file += ".decoy"
# test accuracy
predicted_format = "deepnovo"
accuracy_file = "accuracy.deepnovo_db.tab"
target_file = "data.training/dia.xchen.nov27/fraction_1.mgf.split.test.dup.target"
predicted_file = denovo_output_file
accuracy_file = predicted_file + ".accuracy"
# ==============================================================================
42 changes: 34 additions & 8 deletions deepnovo_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import deepnovo_config
import deepnovo_model
import deepnovo_worker_db
import deepnovo_worker_denovo
import deepnovo_worker_io
import deepnovo_worker_test
import deepnovo_main_modules
Expand All @@ -32,22 +33,47 @@ def main(_):
deepnovo_main_modules.test_true_feeding()
elif deepnovo_config.FLAGS.decode:
deepnovo_main_modules.decode()
elif deepnovo_config.FLAGS.search_denovo:
model = deepnovo_model.ModelInference()
model.build_model()
worker_io = deepnovo_worker_io.WorkerIO(
input_file=deepnovo_config.denovo_input_file,
output_file=deepnovo_config.denovo_output_file)
worker_denovo = deepnovo_worker_denovo.WorkerDenovo()
worker_denovo.search_denovo(model, worker_io)
elif deepnovo_config.FLAGS.search_db:
#~ model = deepnovo_model.DecodingModel()
model = deepnovo_model.ModelInference()
model.build_model()
worker_io = deepnovo_worker_io.WorkerIO(
input_file=deepnovo_config.input_file,
output_file=deepnovo_config.output_file)
input_file=deepnovo_config.db_input_file,
output_file=deepnovo_config.db_output_file)
worker_db = deepnovo_worker_db.WorkerDB()
worker_db.build_db()
worker_db.search_db(model, worker_io)
# due to some mistakes in cleavage rules, we need worker_db.peptide_list to
# check for consistency
worker_test = deepnovo_worker_test.WorkerTest()
worker_test.test_accuracy(worker_db.peptide_list)
elif deepnovo_config.FLAGS.search_hybrid:
model = deepnovo_model.ModelInference()
model.build_model()
# denovo search
worker_io = deepnovo_worker_io.WorkerIO(
input_file=deepnovo_config.hybrid_input_file,
output_file=deepnovo_config.hybrid_denovo_file)
worker_denovo = deepnovo_worker_denovo.WorkerDenovo()
predicted_denovo_list = worker_denovo.search_denovo(model, worker_io)
# db search with predicted_denovo_list
worker_io = deepnovo_worker_io.WorkerIO(
input_file=deepnovo_config.hybrid_input_file,
output_file=deepnovo_config.hybrid_output_file)
worker_db = deepnovo_worker_db.WorkerDB()
worker_db.build_db()
worker_db.search_db(model, worker_io, predicted_denovo_list)
elif deepnovo_config.FLAGS.test:
pass
# test 1%FDR
#~ worker_db = deepnovo_worker_db.WorkerDB()
#~ worker_db.build_db()
#~ worker_test = deepnovo_worker_test.WorkerTest()
#~ worker_test.test_accuracy(worker_db.peptide_list)
worker_test = deepnovo_worker_test.WorkerTest()
worker_test.test_accuracy()
else:
print("ERROR: wrong option!")
sys.exit()
Expand Down
Loading

0 comments on commit d5bf3b1

Please sign in to comment.