-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsvmdataset.py
88 lines (76 loc) · 3.89 KB
/
svmdataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import sys
import numpy as np
import tensorflow as tf
import pickle
from functools import reduce
import os
from preprocessing.preprocess import answer_span_to_indices
# custom imports
from preprocessing.dataset import Dataset
from network.config import CONFIG
from network.build_model import get_batch
from evaluation_metrics import get_f1_from_tokens, get_exact_match_from_tokens
# Suppress tensorflow verboseness.
# NOTE(review): this runs after `import tensorflow`; confirm the setting still
# takes effect at this point, otherwise it has to move above that import.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# The Dataset object supplies both the embedding table that is later fed to the
# graph and the loader for the SQuAD question files.
D = Dataset(CONFIG.EMBEDDING_FILE)
index2embedding = D.index2embedding

# Disabled: SQuAD 1 loading, kept for reference.
#padded_data_squad1, (max_length_question, max_length_context) = D.load_questions('data/train.json')
#padded_data_validation = padded_data_squad1[(int) (CONFIG.TRAIN_PERCENTAGE*len(padded_data_squad1)):]
#untrained_contexts = [x["context"] for x in padded_data_validation]
#print("Loaded data from squad one")

# load_questions returns the padded examples plus a (question, context) pair of
# maximum lengths; unpack it in two steps.
squad2_examples, squad2_max_lengths = D.load_questions('data/train-v2.0.json')
padded_data_squad2 = squad2_examples
max_length_question_squad2, max_length_context_squad2 = squad2_max_lengths
print("padded_data_squad2.len = ",len(padded_data_squad2))
print("Max length from squad 2 q and c: ", max_length_question_squad2, max_length_context_squad2)
print("Loaded data from squad two")

# Disabled: restricting SQuAD 2 to contexts from the SQuAD 1 validation split.
'''
padded_data_untrained = [x for x in padded_data_squad2 if x["context"] in untrained_contexts]
unanswerable_data = [x for x in padded_data_untrained if x["answer_start"]==-1]
answerable_data = [x for x in padded_data_untrained if x["answer_start"]>=0]
print("Number of unanswerable questions: ",len(unanswerable_data))
print("Number of answerable questions: ", len(answerable_data))
padded_data = np.array(padded_data_untrained)
'''

# Keep only the tail of the dataset that starts at the TRAIN_PERCENTAGE mark
# (presumably the share that was not used for training -- confirm against the
# training script).
validation_start = int(CONFIG.TRAIN_PERCENTAGE * len(padded_data_squad2))
padded_data = np.array(padded_data_squad2)[validation_start:]
print(padded_data.shape)
# Checkpoint prefix to restore; the graph definition is read from the
# matching "<prefix>.meta" file.
latest_checkpoint_path = './modelv2/saved-4'
print("restoring from "+latest_checkpoint_path)
meta_graph_path = latest_checkpoint_path+'.meta'
saver = tf.train.import_meta_graph(meta_graph_path)

# Passing --noGPU on the command line requests a session with zero GPU devices;
# otherwise the default session configuration is used.
if '--noGPU' in sys.argv[1:]:
    print("Not using the GPU...")
    config = tf.ConfigProto(device_count = {'GPU': 0})
else:
    config = tf.ConfigProto()
# Run the restored model over `padded_data` in fixed-size batches and collect,
# per example, the raw alpha/beta outputs together with a +1/-1 label.
with tf.Session(config=config) as sess:
    # Load the checkpoint weights into the graph imported from the .meta file.
    saver.restore(sess, latest_checkpoint_path)
    graph = tf.get_default_graph()

    # Tensors are looked up by the names they carry in the saved graph.
    alphas_predict = graph.get_tensor_by_name("alphas:0")
    betas_predict = graph.get_tensor_by_name("betas:0")
    question_batch_placeholder = graph.get_tensor_by_name("question_batch_ph:0")
    context_batch_placeholder = graph.get_tensor_by_name("context_batch_ph:0")
    embedding = graph.get_tensor_by_name("embedding_ph:0")
    dropout_keep_rate = graph.get_tensor_by_name("dropout_keep_ph:0")

    alpha_raw, beta_raw, labels = [], [], []

    # Only full batches are fed to the graph; a trailing partial batch is
    # dropped. The "+ 1" on the stop bound keeps the final full batch when the
    # number of examples is an exact multiple of BATCH_SIZE (range() excludes
    # its stop value, so without it that batch was silently skipped).
    last_batch_stop = len(padded_data) - CONFIG.BATCH_SIZE + 1
    for iteration in range(0, last_batch_stop, CONFIG.BATCH_SIZE):
        batch = padded_data[iteration : (iteration + CONFIG.BATCH_SIZE)]
        question_batch, context_batch, answer_start_batch_actual, answer_end_batch_actual = get_batch(
            batch,
            CONFIG.BATCH_SIZE,
            max_length_question_squad2,
            max_length_context_squad2,
        )
        print("Iteration: ", iteration , "out of ", len(padded_data))

        # Keep probability 1.0 is fed for the dropout placeholder
        # (presumably disabling dropout for inference -- confirm in the model).
        alphas, betas = sess.run([alphas_predict, betas_predict], feed_dict={
            question_batch_placeholder: question_batch,
            context_batch_placeholder: context_batch,
            embedding: index2embedding,
            dropout_keep_rate: 1.0
        })

        # Only the last entry along the first axis of each output is kept
        # (presumably the final decoder iteration -- TODO confirm).
        final_alphas = alphas[-1]
        final_betas = betas[-1]
        for i in range(answer_start_batch_actual.shape[0]):
            alpha_raw.append(final_alphas[i][:])
            beta_raw.append(final_betas[i][:])
            # An answer start of -1 is labelled -1, everything else +1.
            labels.append(-1 if answer_start_batch_actual[i] == -1 else 1)
# Persist the collected model outputs and labels as three pickle files.
# The output directory is created first so that a missing folder cannot make
# the script fail after the whole inference pass has already completed.
os.makedirs('./SVMdata', exist_ok=True)
for pickle_path, payload in (
    ('./SVMdata/alphas_rawv2.pkl', alpha_raw),
    ('./SVMdata/betas_rawv2.pkl', beta_raw),
    ('./SVMdata/labelsv2.pkl', labels),
):
    with open(pickle_path, 'wb') as f:
        # Protocol 3 is pinned explicitly, as in the original dumps.
        pickle.dump(payload, f, protocol=3)