forked from soujanyaporia/contextual-multimodal-fusion
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_data.py
127 lines (89 loc) · 4.23 KB
/
create_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
## Credits: https://github.com/SenticNet/contextual-utterance-level-multimodal-sentiment-analysis ##
## Authors: Devamanyu Hazarika, Soujanya Poria ##
## Modified for Python 3.5 ##
import os
import pickle
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn import preprocessing
# Scaler kept at module scope; only referenced by the commented-out
# normalization lines inside main().
min_max_scaler = preprocessing.MinMaxScaler()

# Fixed seed so any randomized steps are reproducible across runs.
np.random.seed(17)

# transcripts.csv: one row per utterance; column 0 appears to hold a
# "<video>_<utteranceNo>" identifier (parsed that way in main()).
pre_data = np.asarray(pd.read_csv("./data/transcripts.csv", header=None))

# Train/test split files: column 0 lists the transcript row indices
# belonging to each split.
train = np.asarray(pd.read_csv("./data/text_train.csv", header=None))
test = np.asarray(pd.read_csv("./data/text_test.csv", header=None))
train_index = np.asarray(train[:, 0], dtype='int')
test_index = np.asarray(test[:, 0], dtype='int')
def _group_by_video(row_indices):
    """Group utterance row indices by the video they belong to.

    Identifiers in ``pre_data`` column 0 look like ``"<video>_<utteranceNo>"``;
    splitting on the LAST underscore recovers both parts (video ids may
    themselves contain underscores).

    Args:
        row_indices: iterable of global row indices into ``pre_data``.

    Returns:
        (mapping, order) where ``mapping[video]`` is the list of global row
        indices for that video and ``order[video]`` holds the matching
        utterance numbers (ints) used later for sorting.
    """
    mapping = defaultdict(list)
    order = defaultdict(list)
    for idx in row_indices:
        video, utt_no = pre_data[idx][0].rsplit("_", 1)
        mapping[video].append(idx)
        order[video].append(int(utt_no))
    return mapping, order


def _pad_split(mapping, order, positions, data, max_len, pad):
    """Build sorted, padded (X, Y) sequences and true lengths for one split.

    Args:
        mapping: video id -> list of global row indices (from _group_by_video).
        order: video id -> matching utterance numbers (sort keys).
        positions: global row index -> row position within ``data``.
        data: 2-D array, one row per utterance; last column is the label.
        max_len: length every video sequence is padded to.
        pad: all-zero feature vector used for padding positions.

    Returns:
        (split_X, split_Y, lengths): per-video feature sequences, per-video
        label sequences (0 for padded slots), and the true sequence lengths.
    """
    split_X, split_Y, lengths = [], [], []
    for video, rows in mapping.items():
        # Order this video's utterances by their utterance number.
        pairs = sorted(zip(order[video], rows))
        xs, ys = [], []
        for _, row_idx in pairs:
            row = data[positions[row_idx]]
            xs.append(row[:-1])   # feature columns
            ys.append(row[-1])    # label = last column
        lengths.append(len(pairs))
        for _ in range(len(pairs), max_len):
            xs.append(pad)
            ys.append(0)  # dummy label for padded positions
        split_X.append(xs)
        split_Y.append(ys)
    return split_X, split_Y, lengths


def main(name):
    """Build padded per-video utterance sequences for one modality.

    Reads ``./data/<name>/<name>_train0.csv`` and ``..._test0.csv`` (one
    feature row per utterance, label in the last column), groups rows by
    parent video, orders them by utterance number, pads every video to the
    longest sequence seen in EITHER split, and pickles the result.

    Args:
        name: modality folder/file prefix, e.g. "text", "audio" or "video".

    Side effects:
        Writes ``./input/<name>.pickle`` containing the tuple
        (train_X, train_Y, test_X, test_Y, max_len, train_lengths,
        test_lengths).
    """
    path = "./data/" + name + "/" + name
    print(path)

    data_train = np.asarray(pd.read_csv(path + "_train0.csv", header=None))
    data_test = np.asarray(pd.read_csv(path + "_test0.csv", header=None))

    # video id -> utterance row indices / utterance order numbers.
    train_map, train_order = _group_by_video(train_index)
    test_map, test_order = _group_by_video(test_index)

    # global transcript row index -> row position within each split matrix.
    train_pos = {c: i for i, c in enumerate(train_index)}
    test_pos = {c: i for i, c in enumerate(test_index)}

    # Pad every video to the longest sequence across both splits.
    max_len = max(
        max((len(v) for v in train_map.values()), default=0),
        max((len(v) for v in test_map.values()), default=0),
    )

    # All-zero feature vector used to pad short videos (int, matching the
    # original [0, 0, ...] list construction).
    pad = np.zeros(data_train[0][:-1].shape[0], dtype=int)

    print("Mapping train")
    train_data_X, train_data_Y, train_length = _pad_split(
        train_map, train_order, train_pos, data_train, max_len, pad)

    print("Mapping test")
    test_data_X, test_data_Y, test_length = _pad_split(
        test_map, test_order, test_pos, data_test, max_len, pad)

    train_data_X = np.asarray(train_data_X)
    test_data_X = np.asarray(test_data_X)
    print(train_data_X.shape, test_data_X.shape, len(train_length), len(test_length))

    print("Dumping data")
    # Robustness fix: the original crashed with FileNotFoundError when
    # ./input did not exist.
    os.makedirs("./input", exist_ok=True)
    with open('./input/' + name + '.pickle', 'wb') as handle:
        pickle.dump(
            (train_data_X, np.asarray(train_data_Y),
             test_data_X, np.asarray(test_data_Y),
             max_len, train_length, test_length),
            handle, protocol=pickle.HIGHEST_PROTOCOL)
if __name__ == "__main__":
    # Build one pickled dataset per modality.
    for modality in ('text', 'audio', 'video'):
        main(modality)