# Copyright 2017 Hieu Tran. All Rights Reserved.
#
# DeepNovo is publicly available for non-commercial uses.
# ==============================================================================

"""DeepNovo configuration: command-line flags, amino acid vocabulary,
theoretical mass constants, model hyper-parameters, and dataset paths."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow as tf


# ==============================================================================
# FLAGS (options) for this app
# ==============================================================================


tf.app.flags.DEFINE_string("train_dir", # flag_name
                           "train", # default_value
                           "Training directory.") # docstring

tf.app.flags.DEFINE_integer("direction",
                            2,
                            "Set to 0/1/2 for Forward/Backward/Bi-directional.")

tf.app.flags.DEFINE_boolean("use_intensity",
                            True,
                            "Set to True to use intensity-model.")

tf.app.flags.DEFINE_boolean("shared",
                            False,
                            "Set to True to use shared weights.")

tf.app.flags.DEFINE_boolean("use_lstm",
                            True,
                            "Set to True to use lstm-model.")

tf.app.flags.DEFINE_boolean("knapsack_build",
                            False,
                            "Set to True to build knapsack matrix.")

tf.app.flags.DEFINE_boolean("train",
                            False,
                            "Set to True for training.")

tf.app.flags.DEFINE_boolean("test_true_feeding",
                            False,
                            "Set to True for testing with true feeding.")

tf.app.flags.DEFINE_boolean("decode",
                            False,
                            "Set to True for decoding.")

tf.app.flags.DEFINE_boolean("beam_search",
                            False,
                            "Set to True for beam search.")

tf.app.flags.DEFINE_integer("beam_size",
                            1,
                            "Number of optimal paths to search during decoding.")

tf.app.flags.DEFINE_boolean("search_db",
                            False,
                            "Set to True to perform a database search.")

tf.app.flags.DEFINE_boolean("test",
                            False,
                            "Set to True to test the prediction accuracy.")

FLAGS = tf.app.flags.FLAGS
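
# Example invocation (hypothetical script name and flag values, for
# illustration only; the entry-point script is not part of this file):
#   python main.py --train --train_dir train.example --beam_search --beam_size 5
# After parsing, values are read as FLAGS.train, FLAGS.train_dir, FLAGS.beam_size, etc.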


# ==============================================================================
# GLOBAL VARIABLES for VOCABULARY
# ==============================================================================


# Special vocabulary symbols - we always put them at the start.
_PAD = "_PAD"
_GO = "_GO"
_EOS = "_EOS"
_START_VOCAB = [_PAD, _GO, _EOS]

PAD_ID = 0
GO_ID = 1
EOS_ID = 2

vocab_reverse = ['A',
                 'R',
                 'N',
                 'Nmod',
                 'D',
                 #~ 'C',
                 'Cmod',
                 'E',
                 'Q',
                 'Qmod',
                 'G',
                 'H',
                 'I',
                 'L',
                 'K',
                 'M',
                 'Mmod',
                 'F',
                 'P',
                 'S',
                 'T',
                 'W',
                 'Y',
                 'V',
                ]

vocab_reverse = _START_VOCAB + vocab_reverse
print("vocab_reverse ", vocab_reverse)

vocab = dict([(x, y) for (y, x) in enumerate(vocab_reverse)])
print("vocab ", vocab)

vocab_size = len(vocab_reverse)
print("vocab_size ", vocab_size)


# ==============================================================================
# GLOBAL VARIABLES for THEORETICAL MASS
# ==============================================================================


mass_H = 1.0078
mass_H2O = 18.0106
mass_NH3 = 17.0265
mass_N_terminus = 1.0078
mass_C_terminus = 17.0027
mass_CO = 27.9949

mass_AA = {'_PAD': 0.0,
           '_GO': mass_N_terminus - mass_H,
           '_EOS': mass_C_terminus + mass_H,
           'A': 71.03711, # 0
           'R': 156.10111, # 1
           'N': 114.04293, # 2
           'Nmod': 115.02695,
           'D': 115.02694, # 3
           #~ 'C': 103.00919, # 4
           'Cmod': 160.03065, # C(+57.02)
           #~ 'Cmod': 161.01919, # C(+58.01) # orbi
           'E': 129.04259, # 5
           'Q': 128.05858, # 6
           'Qmod': 129.0426,
           'G': 57.02146, # 7
           'H': 137.05891, # 8
           'I': 113.08406, # 9
           'L': 113.08406, # 10
           'K': 128.09496, # 11
           'M': 131.04049, # 12
           'Mmod': 147.0354,
           'F': 147.06841, # 13
           'P': 97.05276, # 14
           'S': 87.03203, # 15
           'T': 101.04768, # 16
           'W': 186.07931, # 17
           'Y': 163.06333, # 18
           'V': 99.06841, # 19
          }

# range() instead of Python 2's xrange() so the module also runs on Python 3.
mass_ID = [mass_AA[vocab_reverse[x]] for x in range(vocab_size)]
mass_ID_np = np.array(mass_ID, dtype=np.float32)

mass_AA_min = mass_AA["G"] # 57.02146
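
# Illustrative helper (an assumption, not part of the original code): the
# theoretical mass of a peptide under the usual convention
# mass(peptide) = sum(residue masses) + mass(H2O).
def compute_peptide_mass(peptide):
  """e.g. compute_peptide_mass(['P', 'E', 'P', 'T', 'I', 'D', 'E']) ~ 799.360 Da."""
  return sum(mass_AA[aa] for aa in peptide) + mass_H2O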


# ==============================================================================
# GLOBAL VARIABLES for PRECISION, RESOLUTION, temp-Limits of MASS & LEN
# ==============================================================================


# if changed, cython_speedup needs to be re-compiled
SPECTRUM_RESOLUTION = 10 # bins for 1.0 Da = precision 0.1 Da
#~ SPECTRUM_RESOLUTION = 20 # bins for 1.0 Da = precision 0.05 Da
#~ SPECTRUM_RESOLUTION = 40 # bins for 1.0 Da = precision 0.025 Da
#~ SPECTRUM_RESOLUTION = 50 # bins for 1.0 Da = precision 0.02 Da
#~ SPECTRUM_RESOLUTION = 80 # bins for 1.0 Da = precision 0.0125 Da
print("SPECTRUM_RESOLUTION ", SPECTRUM_RESOLUTION)

# if changed, cython_speedup needs to be re-compiled
WINDOW_SIZE = 10 # 10 bins
print("WINDOW_SIZE ", WINDOW_SIZE)

MZ_MAX = 3000.0
MZ_SIZE = int(MZ_MAX * SPECTRUM_RESOLUTION) # 30k bins
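
# Sketch only (assumption): with SPECTRUM_RESOLUTION bins per Da, an observed
# m/z value maps to a discrete bin index roughly as below; the production
# binning is implemented in the compiled cython_speedup module.
def mz_to_bin(mz):
  """e.g. mz_to_bin(500.25) == 5002 at SPECTRUM_RESOLUTION = 10."""
  return int(mz * SPECTRUM_RESOLUTION)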

KNAPSACK_AA_RESOLUTION = 10000 # 0.0001 Da
mass_AA_min_round = int(round(mass_AA_min * KNAPSACK_AA_RESOLUTION)) # 57.02146
KNAPSACK_MASS_PRECISION_TOLERANCE = 100 # 0.01 Da
num_position = 0
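
# Worked example (not original code): the knapsack table stores masses as
# integers in 0.0001 Da units, so glycine, the lightest residue, rounds to
# int(round(57.02146 * 10000)) == 570215, i.e. mass_AA_min_round above.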

PRECURSOR_MASS_PRECISION_TOLERANCE = 0.01

# ONLY for accuracy evaluation
#~ PRECURSOR_MASS_PRECISION_INPUT_FILTER = 0.01
PRECURSOR_MASS_PRECISION_INPUT_FILTER = 1000
AA_MATCH_PRECISION = 0.1

# skip spectra with mass > MZ_MAX or sequence length > MAX_LEN
MAX_LEN = 50 if FLAGS.decode else 30
print("MAX_LEN ", MAX_LEN)

# We use a number of buckets and pad to the closest one for efficiency.
_buckets = [12, 22, 32]
#~ _buckets = [12,22,32,42,52] # for decode
print("_buckets ", _buckets)


# ==============================================================================
# HYPER-PARAMETERS of the NEURAL NETWORKS
# ==============================================================================


num_ion = 8 # 2
print("num_ion ", num_ion)

l2_loss_weight = 0.0 # 0.0
print("l2_loss_weight ", l2_loss_weight)

#~ encoding_cnn_size = 4 * (RESOLUTION//10) # 4 # proportional to RESOLUTION
#~ encoding_cnn_filter = 4
#~ print("encoding_cnn_size ", encoding_cnn_size)
#~ print("encoding_cnn_filter ", encoding_cnn_filter)

embedding_size = 512
print("embedding_size ", embedding_size)

num_layers = 1
num_units = 512
print("num_layers ", num_layers)
print("num_units ", num_units)

# dropout keep probabilities
keep_conv = 0.75
keep_dense = 0.5
print("keep_conv ", keep_conv)
print("keep_dense ", keep_dense)

batch_size = 128
print("batch_size ", batch_size)

epoch_stop = 20 # 50
print("epoch_stop ", epoch_stop)

train_stack_size = 4500
valid_stack_size = 15000 # 10%
test_stack_size = 4000
buffer_size = 4000
print("train_stack_size ", train_stack_size)
print("valid_stack_size ", valid_stack_size)
print("test_stack_size ", test_stack_size)
print("buffer_size ", buffer_size)

steps_per_checkpoint = 100 # 20 # 100 # 2 # 4 # 200
random_test_batches = 10
print("steps_per_checkpoint ", steps_per_checkpoint)
print("random_test_batches ", random_test_batches)

max_gradient_norm = 5.0
print("max_gradient_norm ", max_gradient_norm)


# ==============================================================================
# DATASETS
# ==============================================================================


# ==============================================================================
# YEAST-LOW-EXCLUDE_HEINEMANN_2015-PEAKS-DB-DUP
#~ data_format = "mgf"
#~ input_file_train = "data.training/yeast.low.exclude_heinemann_2015/peaks.db.mgf.train.dup"
#~ input_file_valid = "data.training/yeast.low.exclude_heinemann_2015/peaks.db.mgf.valid.dup"
#~ input_file_test = "data.training/yeast.low.exclude_heinemann_2015/peaks.db.mgf.test.dup"
#~ decode_test_file = "data.training/yeast.low.exclude_heinemann_2015/peaks.db.mgf.test.dup"
# ==============================================================================


# ==============================================================================
# YEAST-LOW-EXCLUDE_COON_2013-PEAKS-DB-DUP
#~ data_format = "mgf"
#~ input_file_train = "data.training/yeast.low.exclude_coon_2013/peaks.db.mgf.train.dup"
#~ input_file_valid = "data.training/yeast.low.exclude_coon_2013/peaks.db.mgf.valid.dup"
#~ input_file_test = "data.training/yeast.low.exclude_coon_2013/peaks.db.mgf.test.dup"
#~ decode_test_file = "data.training/yeast.low.exclude_coon_2013/peaks.db.mgf.test.dup"
# ==============================================================================


# ==============================================================================
# YEAST-LOW-TAKEDA_2015-PEAKS-DB-DUP
#~ data_format = "mgf"
#~ input_file_train = "data.training/yeast.low.takeda_2015/peaks.db.mgf.train.dup"
#~ input_file_valid = "data.training/yeast.low.takeda_2015/peaks.db.mgf.valid.dup"
#~ input_file_test = "data.training/yeast.low.takeda_2015/peaks.db.mgf.test.dup"
#~ decode_test_file = "data.training/yeast.low.takeda_2015/peaks.db.mgf.test.dup"
# ==============================================================================


# ==============================================================================
# YEAST-LOW-PEREDO_2015-PEAKS-DB-DUP
#~ data_format = "mgf"
#~ input_file_train = "data.training/yeast.low.peredo_2015/peaks.db.mgf.train.dup"
#~ input_file_valid = "data.training/yeast.low.peredo_2015/peaks.db.mgf.valid.dup"
#~ input_file_test = "data.training/yeast.low.peredo_2015/peaks.db.mgf.test.dup"
#~ decode_test_file = "data.training/yeast.low.peredo_2015/peaks.db.mgf.test.dup"
# ==============================================================================


# ==============================================================================
# YEAST-LOW-HEINEMANN_2015-PEAKS-DB-REPEAT
#~ data_format = "mgf"
#~ input_file_train = "data.training/yeast.low.heinemann_2015/peaks.db.mgf.train.repeat"
#~ input_file_valid = "data.training/yeast.low.heinemann_2015/peaks.db.mgf.valid.repeat"
#~ input_file_test = "data.training/yeast.low.heinemann_2015/peaks.db.mgf.test.repeat"
#~ decode_test_file = "data.training/yeast.low.heinemann_2015/peaks.db.mgf.test.repeat"

# YEAST-LOW-HEINEMANN_2015-PEAKS-DB-DUP
#~ data_format = "mgf"
#~ input_file_train = "data.training/yeast.low.heinemann_2015/peaks.db.mgf.train.dup"
#~ input_file_valid = "data.training/yeast.low.heinemann_2015/peaks.db.mgf.valid.dup"
#~ input_file_test = "data.training/yeast.low.heinemann_2015/peaks.db.mgf.test.dup"
#~ decode_test_file = "data.training/yeast.low.heinemann_2015/peaks.db.mgf.test.dup"
# ==============================================================================


# ==============================================================================
# YEAST-LOW-MANN_2015-PEAKS-DB-REPEAT
#~ data_format = "mgf"
#~ input_file_train = "data.training/yeast.low.mann_2015/peaks.db.mgf.train.repeat"
#~ input_file_valid = "data.training/yeast.low.mann_2015/peaks.db.mgf.valid.repeat"
#~ input_file_test = "data.training/yeast.low.mann_2015/peaks.db.mgf.test.repeat"
#~ decode_test_file = "data.training/yeast.low.mann_2015/peaks.db.mgf.test.repeat"

# YEAST-LOW-MANN_2015-PEAKS-DB-DUP
#~ data_format = "mgf"
#~ input_file_train = "data.training/yeast.low.mann_2015/peaks.db.mgf.train.dup"
#~ input_file_valid = "data.training/yeast.low.mann_2015/peaks.db.mgf.valid.dup"
#~ input_file_test = "data.training/yeast.low.mann_2015/peaks.db.mgf.test.dup"
#~ decode_test_file = "data.training/yeast.low.mann_2015/peaks.db.mgf.test.dup"
# ==============================================================================


# ==============================================================================
# YEAST-LOW-GRANT_2015-PEAKS-DB-REPEAT
#~ data_format = "mgf"
#~ input_file_train = "data.training/yeast.low.grant_2015/peaks.db.mgf.train.repeat"
#~ input_file_valid = "data.training/yeast.low.grant_2015/peaks.db.mgf.valid.repeat"
#~ input_file_test = "data.training/yeast.low.grant_2015/peaks.db.mgf.test.repeat"
#~ decode_test_file = "data.training/yeast.low.grant_2015/peaks.db.mgf.test.repeat"

# YEAST-LOW-GRANT_2015-PEAKS-DB-DUP
#~ data_format = "mgf"
#~ input_file_train = "data.training/yeast.low.grant_2015/peaks.db.mgf.train.dup"
#~ input_file_valid = "data.training/yeast.low.grant_2015/peaks.db.mgf.valid.dup"
#~ input_file_test = "data.training/yeast.low.grant_2015/peaks.db.mgf.test.dup"
#~ decode_test_file = "data.training/yeast.low.grant_2015/peaks.db.mgf.test.dup"
# ==============================================================================


# ==============================================================================
# YEAST-LOW-COON_2013-PEAKS-DB-REPEAT
#~ data_format = "mgf"
#~ input_file_train = "data.training/yeast.low.coon_2013/peaks.db.mgf.train.repeat"
#~ input_file_valid = "data.training/yeast.low.coon_2013/peaks.db.mgf.valid.repeat"
#~ input_file_test = "data.training/yeast.low.coon_2013/peaks.db.mgf.test.repeat"
#~ decode_test_file = "data.training/yeast.low.coon_2013/peaks.db.mgf.test.repeat"

# YEAST-LOW-COON_2013-PEAKS-DB-DUP
data_format = "mgf" | ||
db_fasta_file = "data/uniprot_sprot.yeast.fasta" | ||
cleavage_rule = "trypsin" | ||
num_missed_cleavage = 2 | ||
fixed_mod_list = ['C'] | ||
var_mod_list = ['N', 'Q', 'M'] | ||
mass_tolerance = 0.01 # Da | ||
ppm = 10.0/1000000 # ppm (20 better) # instead of absolute 0.01 Da | ||
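# Worked example (for illustration): at 10 ppm, a precursor of 1500.0 Da gets
# a tolerance of 1500.0 * 10.0/1000000 = 0.015 Da, scaling with mass rather
# than staying fixed at 0.01 Da.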
input_file_train = "data.training/yeast.low.coon_2013/peaks.db.mgf.train.dup"
input_file_valid = "data.training/yeast.low.coon_2013/peaks.db.mgf.valid.dup"
input_file_test = "data.training/yeast.low.coon_2013/peaks.db.mgf.test.dup"
decode_test_file = "data.training/yeast.low.coon_2013/peaks.db.mgf.test.dup"
input_file = "data.training/yeast.low.coon_2013/peaks.db.mgf.test.dup"
output_file = FLAGS.train_dir + "/output.deepnovo_db.tab"
target_file = input_file + ".target"
predicted_file = output_file
predicted_format = "deepnovo"
accuracy_file = "accuracy.deepnovo_db.tab"
# ==============================================================================