forked from pclubiitk/model-zoo
-
Notifications
You must be signed in to change notification settings - Fork 1
/
dataloader.py
129 lines (97 loc) · 4.81 KB
/
dataloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import tensorflow as tf
import numpy as np
from tensorflow import keras
from keras.utils.np_utils import to_categorical
import json
import h5py
import os
############################################
def right_align(seq,lengths):
    """Shift each row's valid prefix to the right edge, zero-padding on the left.

    seq     : 2-D array-like of token ids, padded on the right with zeros.
    lengths : per-row count of valid leading tokens.

    Returns a new float array (np.zeros default dtype) of the same shape
    with every row's tokens right-aligned.
    """
    rows, cols = np.shape(seq)
    aligned = np.zeros((rows, cols))
    for row, n_valid in zip(range(rows), lengths):
        # Copy the first n_valid tokens of this row into its last n_valid slots.
        aligned[row][cols - n_valid:cols] = seq[row][0:n_valid]
    return aligned
#############################################
def read_data(data_img, data_prepro, data_limit):
    """Load the first `data_limit` training examples.

    Parameters
    ----------
    data_img : path to the HDF5 file holding the 'images_train' features
        (one 4096-d vector per image — see the tile width below).
    data_prepro : path to the HDF5 file with preprocessed questions
        ('img_pos_train', 'ques_train', 'ques_length_train', 'answers').
    data_limit : number of training examples to read.

    Returns
    -------
    (train_X, train_y) where train_X = [image_features, questions] and
    train_y is the one-hot encoded answer matrix.
    """
    print("Reading Data...")
    # Use context managers so the HDF5 handles are closed even on error
    # (the original leaked both file handles).
    with h5py.File(data_img, 'r') as img_file, h5py.File(data_prepro, 'r') as ques_data:
        img_data = np.array(img_file['images_train'])
        img_pos_train = ques_data['img_pos_train'][:data_limit]
        # img_pos_* indices are 1-based in the prepro files, hence the -1.
        train_img_data = np.array([img_data[pos - 1, :] for pos in img_pos_train])
        # L2-normalize each image feature vector. Broadcasting replaces the
        # original tile((4096, N)) + transpose, which materialized a full
        # extra copy of the feature matrix for no numerical difference.
        norms = np.sqrt(np.sum(train_img_data * train_img_data, axis=1))
        train_img_data = train_img_data / norms[:, np.newaxis]
        # Shift question padding from the right side to the left side.
        ques_train = np.array(ques_data['ques_train'])[:data_limit, :]
        ques_length_train = np.array(ques_data['ques_length_train'])[:data_limit]
        ques_train = right_align(ques_train, ques_length_train)
        train_X = [train_img_data, ques_train]
        # All validation answers which are not in the training set have been
        # labelled as 1 (see get_val_data for the matching convention).
        train_y = to_categorical(ques_data['answers'])[:data_limit, :]
    return train_X, train_y
########################################
def get_val_data(val_annotations_path, data_img, data_prepro, data_prepro_meta):
    """Load the validation split together with its ground-truth answers.

    Parameters
    ----------
    val_annotations_path : JSON file of VQA validation annotations.
    data_img : HDF5 file holding the 'images_test' features.
    data_prepro : HDF5 file with preprocessed test questions.
    data_prepro_meta : JSON metadata file (see get_metadata).

    Returns
    -------
    (val_X, abs_val_y, multi_val_y) — inputs, one-hot single-answer labels,
    and the per-question list of acceptable answer indices.
    """
    metadata = get_metadata(data_prepro_meta)
    with open(val_annotations_path, 'r') as an_file:
        annotations = json.loads(an_file.read())
    # Context managers fix the original's leaked HDF5 handles.
    with h5py.File(data_img, 'r') as img_file, h5py.File(data_prepro, 'r') as ques_data:
        img_data = np.array(img_file['images_test'])
        img_pos_test = ques_data['img_pos_test']
        # img_pos_* indices are 1-based, hence the -1.
        val_img_data = np.array([img_data[pos - 1, :] for pos in img_pos_test])
        # L2-normalize each image feature vector; broadcasting replaces the
        # original tile/transpose copy.
        norms = np.sqrt(np.sum(val_img_data * val_img_data, axis=1))
        val_img_data = val_img_data / norms[:, np.newaxis]
        # Shift question padding from the right side to the left side.
        ques_val = np.array(ques_data['ques_test'])
        ques_length_val = np.array(ques_data['ques_length_test'])
        ques_val = right_align(ques_val, ques_length_val)
        # Convert token id 12602 to 0, as the embeddings were made that way
        # (single vectorized pass instead of the original per-row loop).
        ques_val[ques_val == 12602] = 0
        val_X = [val_img_data, ques_val]

        # Invert the metadata mapping: answer string -> answer index.
        ans_to_ix = {str(ans): int(i) for i, ans in metadata['ix_to_ans'].items()}
        # Index annotations by question id; answers not in the training
        # vocabulary (or at the overflow index 1000) are labelled 1.
        ques_annotations = {}
        for ann in annotations['annotations']:
            idx = ans_to_ix.get(ann['multiple_choice_answer'].lower())
            ann['multiple_choice_answer_idx'] = 1 if idx in [None, 1000] else idx
            ques_annotations[ann['question_id']] = ann

        # Read the question-id dataset once instead of iterating it twice.
        question_ids = list(ques_data['question_id_test'])
        abs_val_y = [ques_annotations[qid]['multiple_choice_answer_idx']
                     for qid in question_ids]
        abs_val_y = to_categorical(np.array(abs_val_y))

        # Every distinct human answer per question, mapped through the same
        # "unknown/overflow -> 1" convention as above.
        multi_val_y = []
        for qid in question_ids:
            distinct = list(set(ans_to_ix.get(a['answer'].lower())
                                for a in ques_annotations[qid]['answers']))
            multi_val_y.append([1 if ans in [None, 1000] else ans for ans in distinct])
    return val_X, abs_val_y, multi_val_y
###############################################
def get_metadata(data_prepro_meta):
    """Load the preprocessing metadata JSON.

    Normalizes 'ix_to_word' from the stored {index: word} mapping into a
    {word: index} dict with str keys and int values; all other keys are
    returned untouched.
    """
    # `with` closes the file handle — json.load(open(...)) leaked it.
    with open(data_prepro_meta, 'r') as meta_file:
        meta_data = json.load(meta_file)
    meta_data['ix_to_word'] = {str(word): int(i)
                               for i, word in meta_data['ix_to_word'].items()}
    return meta_data
###############################################
def prepare_embeddings(num_words, embedding_dim, metadata, glove_path, train_questions_path, embedding_matrix_filename):
    """Build (or load from cache) a GloVe embedding matrix for the vocabulary.

    Parameters
    ----------
    num_words : number of rows in the matrix (vocabulary size).
    embedding_dim : dimensionality of the GloVe vectors.
    metadata : dict from get_metadata; 'ix_to_word' maps word -> index.
    glove_path : path to a GloVe text file ("word v1 v2 ..." per line).
    train_questions_path : unused; kept for backward compatibility with
        existing callers (the original only loaded it into a dead local).
    embedding_matrix_filename : HDF5 cache path, read if present else written.

    Returns
    -------
    (num_words, embedding_dim) numpy array; rows for words without a GloVe
    vector stay all-zero.
    """
    if os.path.exists(embedding_matrix_filename):
        # Explicit mode: h5py.File() with no mode is an error in h5py >= 3.
        with h5py.File(embedding_matrix_filename, 'r') as f:
            return np.array(f['embedding_matrix'])
    print("Embedding Data...")
    # GloVe distributions are utf-8; be explicit so this also works on
    # platforms whose default locale encoding differs.
    embeddings_index = {}
    with open(glove_path, 'r', encoding='utf-8') as glove_file:
        for line in glove_file:
            values = line.split()
            word = values[0]
            embeddings_index[word] = np.asarray(values[1:], dtype='float32')
    embedding_matrix = np.zeros((num_words, embedding_dim))
    word_index = metadata['ix_to_word']
    for word, i in word_index.items():
        if i >= num_words:
            # Guard: a vocabulary index beyond the matrix would raise
            # IndexError in the original.
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    with h5py.File(embedding_matrix_filename, 'w') as f:
        f.create_dataset('embedding_matrix', data=embedding_matrix)
    return embedding_matrix