Skip to content

Commit

Permalink
replaced xrange with range, import knapsack file path from config file
Browse files Browse the repository at this point in the history
  • Loading branch information
StSchulze committed May 22, 2019
1 parent 12918b2 commit 0ab36cb
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 73 deletions.
1 change: 1 addition & 0 deletions deepnovo_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,7 @@
input_file_valid = "data.training/dia.xchen.nov27/fraction_1.mgf.split.valid.dup"
input_file_test = "data.training/dia.xchen.nov27/fraction_1.mgf.split.test.dup"
decode_test_file = "data.training/dia.xchen.nov27/fraction_1.mgf.split.test.dup"
decode_output_file = decode_test_file + ".deepnovo_decode"
# denovo files
denovo_input_file = "data.training/dia.xchen.nov27/fraction_1.mgf.split.test.dup"
denovo_output_file = denovo_input_file + ".deepnovo_denovo"
Expand Down
98 changes: 50 additions & 48 deletions deepnovo_main_modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,11 @@
#~ cimport numpy as np
#~ ctypedef np.float32_t C_float32
#~ ctypedef np.int32_t C_int32
from six.moves import xrange # pylint: disable=redefined-builtin
# from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf

import deepnovo_config
from deepnovo_config import knapsack_file as knapsack_file
import deepnovo_model
from deepnovo_cython_modules import process_spectrum, get_candidate_intensity

Expand Down Expand Up @@ -266,7 +267,7 @@ def read_spectra(file_handle, data_format, spectra_locations):

candidate_intensity_list_forward = []
prefix_mass = 0.0
for index in xrange(decoder_size):
for index in range(decoder_size):

prefix_mass += deepnovo_config.mass_ID[peptide_ids_forward[index]]
candidate_intensity = get_candidate_intensity(
Expand All @@ -281,7 +282,7 @@ def read_spectra(file_handle, data_format, spectra_locations):

candidate_intensity_list_backward = []
suffix_mass = 0.0
for index in xrange(decoder_size):
for index in range(decoder_size):

suffix_mass += deepnovo_config.mass_ID[peptide_ids_backward[index]]
candidate_intensity = get_candidate_intensity(
Expand Down Expand Up @@ -385,19 +386,19 @@ def get_batch_01(index_list, data_set, bucket_id):
batch_decoder_inputs = []
batch_weights = []
decoder_size = deepnovo_config._buckets[bucket_id]
for length_idx in xrange(decoder_size):
for length_idx in range(decoder_size):

# batch_intensity_inputs and batch_decoder_inputs are just re-indexed.
batch_intensity_inputs.append(
np.array([candidate_intensity_lists[batch_idx][length_idx]
for batch_idx in xrange(batch_size)], dtype=np.float32))
for batch_idx in range(batch_size)], dtype=np.float32))
batch_decoder_inputs.append(
np.array([decoder_inputs[batch_idx][length_idx]
for batch_idx in xrange(batch_size)], dtype=np.int32))
for batch_idx in range(batch_size)], dtype=np.int32))

# Create target_weights to be 0 for targets that are padding.
batch_weight = np.ones(batch_size, dtype=np.float32)
for batch_idx in xrange(batch_size):
for batch_idx in range(batch_size):
# The corresponding target is decoder_input shifted by 1 forward.
if length_idx < decoder_size - 1:
target = decoder_inputs[batch_idx][length_idx + 1]
Expand Down Expand Up @@ -448,25 +449,25 @@ def get_batch_2(index_list, data_set, bucket_id):
batch_decoder_inputs_backward = []
batch_weights = []
decoder_size = deepnovo_config._buckets[bucket_id]
for length_idx in xrange(decoder_size):
for length_idx in range(decoder_size):

# batch_intensity_inputs and batch_decoder_inputs are re-indexed.
batch_intensity_inputs_forward.append(
np.array([candidate_intensity_lists_forward[batch_idx][length_idx]
for batch_idx in xrange(batch_size)], dtype=np.float32))
for batch_idx in range(batch_size)], dtype=np.float32))
batch_intensity_inputs_backward.append(
np.array([candidate_intensity_lists_backward[batch_idx][length_idx]
for batch_idx in xrange(batch_size)], dtype=np.float32))
for batch_idx in range(batch_size)], dtype=np.float32))
batch_decoder_inputs_forward.append(
np.array([decoder_inputs_forward[batch_idx][length_idx]
for batch_idx in xrange(batch_size)], dtype=np.int32))
for batch_idx in range(batch_size)], dtype=np.int32))
batch_decoder_inputs_backward.append(
np.array([decoder_inputs_backward[batch_idx][length_idx]
for batch_idx in xrange(batch_size)], dtype=np.int32))
for batch_idx in range(batch_size)], dtype=np.int32))

# Create target_weights to be 0 for targets that are padding.
batch_weight = np.ones(batch_size, dtype=np.float32)
for batch_idx in xrange(batch_size):
for batch_idx in range(batch_size):
# The corresponding target is decoder_input shifted by 1 forward.
if length_idx < decoder_size - 1:
target = decoder_inputs_forward[batch_idx][length_idx + 1]
Expand Down Expand Up @@ -664,9 +665,9 @@ def test_AA_decode_batch(scans,

#~ batch_size = len(decoder_inputs)

for index in xrange(len(scans)):
for index in range(len(scans)):
#~ # for testing
#~ for index in xrange(15,20):
#~ for index in range(15,20):

scan = scans[index]
decoder_input = decoder_inputs[index]
Expand Down Expand Up @@ -757,7 +758,7 @@ def test_logit_batch_01(decoder_inputs, output_logits):
num_exact_match = 0.0
num_len_match = 0.0
batch_size = len(decoder_inputs[0])
for batch in xrange(batch_size):
for batch in range(batch_size):

decoder_input = [x[batch] for x in decoder_inputs]
output_logit = [x[batch] for x in output_logits]
Expand Down Expand Up @@ -800,7 +801,7 @@ def test_logit_batch_2(decoder_inputs_forward,
num_exact_match = 0.0
num_len_match = 0.0
batch_size = len(decoder_inputs_forward[0])
for batch in xrange(batch_size):
for batch in range(batch_size):

decoder_input_forward = [x[batch] for x in decoder_inputs_forward]
decoder_input_backward = [x[batch] for x in decoder_inputs_backward]
Expand Down Expand Up @@ -841,10 +842,10 @@ def test_random_accuracy(sess, model, data_set, bucket_id):
data_set_len = len(data_set[bucket_id])
num_step = deepnovo_config.random_test_batches

for _ in xrange(num_step):
for _ in range(num_step):

start_time = time.time()
random_index_list = random.sample(xrange(data_set_len),
random_index_list = random.sample(range(data_set_len),
deepnovo_config.batch_size)

# get_batch_01/2
Expand Down Expand Up @@ -1044,8 +1045,8 @@ def knapsack_example():
print("mass_aa = ", mass_aa)
knapsack_matrix = np.zeros(shape=(4, 11), dtype=bool)

for aa_id in xrange(4):
for col in xrange(peptide_mass):
for aa_id in range(4):
for col in range(peptide_mass):

current_mass = col + 1

Expand Down Expand Up @@ -1089,13 +1090,13 @@ def knapsack_build():
peptide_mass_upperbound),
dtype=bool)

for aa_id in xrange(3, deepnovo_config.vocab_size): # excluding PAD, GO, EOS
for aa_id in range(3, deepnovo_config.vocab_size): # excluding PAD, GO, EOS

mass_aa_round = int(round(deepnovo_config.mass_ID[aa_id]
* deepnovo_config.KNAPSACK_AA_RESOLUTION))
print(deepnovo_config.vocab_reverse[aa_id], mass_aa_round)

for col in xrange(peptide_mass_upperbound):
for col in range(peptide_mass_upperbound):

# col 0 ~ mass 1
# col + 1 = mass
Expand All @@ -1118,7 +1119,7 @@ def knapsack_build():
else:
knapsack_matrix[aa_id, col] = False

np.save("knapsack.npy", knapsack_matrix)
np.save(knapsack_file, knapsack_matrix)


def knapsack_search(knapsack_matrix, peptide_mass, mass_precision_tolerance):
Expand Down Expand Up @@ -1280,14 +1281,14 @@ def decode_true_feeding_01(sess, model, direction, data_set):
#~ block_state0 = np.vstack(block_state0)

# MAIN decoding LOOP in STACKS
output_log_probs = [[] for x in xrange(len(data_set[0][2]))]
output_log_probs = [[] for x in range(len(data_set[0][2]))]

for stack_index, stack in enumerate(data_set_index_stack_list):

stack_c_state = block_c_state0[stack_index]
stack_h_state = block_h_state0[stack_index]

for index in xrange(len(data_set[0][2])):
for index in range(len(data_set[0][2])):

block_candidate_intensity = np.array([data_set[x][1][index]
for x in stack])
Expand Down Expand Up @@ -1401,7 +1402,7 @@ def decode_beam_select_01(output_top_paths, direction):
LAST_LABEL = deepnovo_config.GO_ID

outputs = []
for entry in xrange(len(output_top_paths)):
for entry in range(len(output_top_paths)):

top_paths = output_top_paths[entry]

Expand Down Expand Up @@ -1498,7 +1499,7 @@ def decode_beam_search_01(sess,
# peptide_mass # 3

# our TARGET
output_top_paths = [[] for x in xrange(data_set_len)]
output_top_paths = [[] for x in range(data_set_len)]

# how many spectra to process at 1 block-run
decode_block_size = deepnovo_config.batch_size
Expand Down Expand Up @@ -1542,7 +1543,7 @@ def decode_beam_search_01(sess,
active_search = []

# fill in the first entries of active_search
for spectrum_id in xrange(decode_block_size):
for spectrum_id in range(decode_block_size):

active_search.append([])
active_search[-1].append(spectrum_id)
Expand Down Expand Up @@ -1685,7 +1686,7 @@ def decode_beam_search_01(sess,

new_paths = []

for index in xrange(block_index,
for index in range(block_index,
block_index + entry_block_size[entry_index]):

for aa_id in block_mass_filter_candidate[index]:
Expand Down Expand Up @@ -1713,7 +1714,7 @@ def decode_beam_search_01(sess,
top_k_indices = np.argpartition(-new_path_scores, deepnovo_config.FLAGS.beam_size)[:deepnovo_config.FLAGS.beam_size] # pylint: disable=line-too-long
#~ top_k_indices = np.argpartition(-new_path_scores/new_path_lengths,deepnovo_config.FLAGS.beam_size)[:deepnovo_config.FLAGS.beam_size] # pylint: disable=line-too-long
entry[1] = [new_paths[top_k_indices[x]]
for x in xrange(deepnovo_config.FLAGS.beam_size)]
for x in range(deepnovo_config.FLAGS.beam_size)]
else:
entry[1] = new_paths[:]

Expand All @@ -1732,7 +1733,7 @@ def decode_beam_search_01(sess,
- active_search_len,
data_set_len)

for spectrum_id in xrange(spectrum_count, new_spectrum_count):
for spectrum_id in range(spectrum_count, new_spectrum_count):
active_search.append([])
active_search[-1].append(spectrum_id)
active_search[-1].append([[[FIRST_LABEL], # current_paths
Expand Down Expand Up @@ -1803,7 +1804,7 @@ def decode_beam_search_2(sess, model, data_set, knapsack_matrix):
argmax_mass_complement_list = []

# by choosing the location of max intensity from (0, peptide_mass_C_location)
for spectrum_id in xrange(data_set_len):
for spectrum_id in range(data_set_len):

peptide_mass = peptide_mass_list[spectrum_id]
peptide_mass_C = peptide_mass - mass_EOS
Expand All @@ -1829,7 +1830,7 @@ def decode_beam_search_2(sess, model, data_set, knapsack_matrix):
argmax_mass_complement_list.append(argmax_mass_complement)

# Add the mass and its complement to candidate_mass_list
for position in xrange(num_position):
for position in range(num_position):

prefix_mass_list = [x[position] for x in argmax_mass_list]
suffix_mass_list = [x[position] for x in argmax_mass_complement_list]
Expand All @@ -1846,7 +1847,7 @@ def decode_beam_search_2(sess, model, data_set, knapsack_matrix):
1000]) # knapsack_precision

# Start decoding for each candidate_mass
output_top_paths = [[] for x in xrange(data_set_len)]
output_top_paths = [[] for x in range(data_set_len)]
for candidate_mass in candidate_mass_list:

top_paths_forward = decode_beam_search_01(
Expand All @@ -1869,7 +1870,7 @@ def decode_beam_search_2(sess, model, data_set, knapsack_matrix):
candidate_mass[3], # knapsack_precision
data_set_backward)

for spectrum_id in xrange(data_set_len):
for spectrum_id in range(data_set_len):

if ((not top_paths_forward[spectrum_id])
or (not top_paths_backward[spectrum_id])): # any list is empty
Expand All @@ -1889,8 +1890,8 @@ def decode_beam_search_2(sess, model, data_set, knapsack_matrix):
#~ return output_top_paths

# Refinement using peptide_mass_list, especially for middle mass
output_top_paths_refined = [[] for x in xrange(data_set_len)]
for spectrum_id in xrange(data_set_len):
output_top_paths_refined = [[] for x in range(data_set_len)]
for spectrum_id in range(data_set_len):
top_paths = output_top_paths[spectrum_id]
for path in top_paths:
seq = path[0]
Expand Down Expand Up @@ -2005,8 +2006,8 @@ def decode(input_file=deepnovo_config.decode_test_file):
# DECODE with BEAM SEARCH
if deepnovo_config.FLAGS.beam_search:

print("Load knapsack_matrix from default: knapsack.npy")
knapsack_matrix = np.load("knapsack.npy")
print("Load knapsack_matrix from default:", knapsack_file)
knapsack_matrix = np.load(knapsack_file)

# READ & DECODE in stacks
print("READ & DECODE in stacks")
Expand All @@ -2024,7 +2025,8 @@ def decode(input_file=deepnovo_config.decode_test_file):
total_peptide_decode = 0.0

# print to output file
decode_output_file = deepnovo_config.FLAGS.train_dir + "/decode_output.tab"
# decode_output_file = deepnovo_config.FLAGS.train_dir + "/decode_output.tab"
decode_output_file = deepnovo_config.decode_output_file
with open(decode_output_file, 'w') as output_file_handle:
print("scan\ttarget_seq\toutput_seq\toutput_score\taccuracy_AA\tlen_AA"
"\texact_match\n",
Expand Down Expand Up @@ -2088,7 +2090,7 @@ def decode(input_file=deepnovo_config.decode_test_file):

# decode_true_feeding each bucket separately, like in training/validation
print("DECODE with TRUE FEEDING")
for bucket_id in xrange(len(deepnovo_config._buckets)):
for bucket_id in range(len(deepnovo_config._buckets)):

if not data_set[bucket_id]: # empty bucket
continue
Expand Down Expand Up @@ -2151,17 +2153,17 @@ def train_cycle(model,
# to select a bucket, length of [scale[i], scale[i+1]] is proportional to
# the size of the i-th training bucket, as used later.
train_bucket_sizes = [len(train_set[b])
for b in xrange(len(deepnovo_config._buckets))]
for b in range(len(deepnovo_config._buckets))]
train_total_size = float(sum(train_bucket_sizes))
print("train_bucket_sizes ", train_bucket_sizes)
print("train_total_size ", train_total_size)
train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
for i in xrange(len(train_bucket_sizes))]
for i in range(len(train_bucket_sizes))]
print("train_buckets_scale ", train_buckets_scale)

# to monitor the number of spectra in the current stack
# that have been used for training
train_current_spectra = [0 for b in xrange(len(deepnovo_config._buckets))]
train_current_spectra = [0 for b in range(len(deepnovo_config._buckets))]
# Get a batch and train
while True:

Expand All @@ -2172,7 +2174,7 @@ def train_cycle(model,
# Choose a bucket according to data distribution. We pick a random number
# in [0, 1) and use the corresponding interval in train_buckets_scale.
random_number_01 = np.random.random_sample()
bucket_id = min([i for i in xrange(len(train_buckets_scale))
bucket_id = min([i for i in range(len(train_buckets_scale))
if train_buckets_scale[i] > random_number_01])

# not enough spectra left in this bucket of the current stack
Expand All @@ -2186,7 +2188,7 @@ def train_cycle(model,
break

# Get a RANDOM batch from the current stack and make a step.
random_index_list = random.sample(xrange(train_bucket_sizes[bucket_id]),
random_index_list = random.sample(range(train_bucket_sizes[bucket_id]),
deepnovo_config.batch_size)

# for testing
Expand Down Expand Up @@ -2527,7 +2529,7 @@ def test_true_feeding():
print("Create model for testing")
model = create_model(sess, training_mode=False)

for bucket_id in xrange(len(deepnovo_config._buckets)):
for bucket_id in range(len(deepnovo_config._buckets)):

#~ if valid_set[bucket_id]: # bucket not empty
#~ print("valid_set - bucket {0}".format(bucket_id))
Expand Down
Loading

0 comments on commit 0ab36cb

Please sign in to comment.