-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocessor.py
191 lines (161 loc) · 6.14 KB
/
preprocessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
'''
This file preprocesses the facial emotion data. The data set has 7 classes and 675 rows;
each image is represented by the first 5 PCA components of its LQP and PHOG descriptors.
The class label is stored in column 2 and has the following meaning:
1 Angry
2 Disgust
3 Fear
4 Happy
5 Neutral
6 Sad
7 Surprise
'''
# import libraries
import pandas as pd
import numpy as np
import torch.utils.data
# Hyper parameters
# training_p = 0.8
# validation_p = 0
# testing_p = 0.5
# name of column index
# col_index = ['overview', 'emotion_label', 'LQP_00', 'LQP_01', 'LQP_02', 'LQP_03', 'LQP_04', 'LQP_05', 'PHOG_01', 'PHOG_02', 'PHOG_03', 'PHOG_04', 'PHOG_05']
'''
This class is a preprocessor that loads the data in SFEW.xlsx (the PCA features of the images) and splits it.
There are three protocols:
SPI: Strictly Person Independent protocol
PPI: Partial Person Independent protocol
SPS: Strictly Person Specific protocol
to partition data into training dataset and testing dataset
'''
class Preprocessor:
    """Load the SFEW feature table and split it into train/test sets.

    Three partition protocols are offered (SPI, PPI, SPS).  Every partition
    method normalizes a *copy* of the frame it receives, so the same frame
    (e.g. ``self.data``) can safely be passed to several partition calls.
    """

    def __init__(self):
        # load all data from SFEW.xlsx
        self.data = pd.read_excel('data/SFEW.xlsx', sheet_name='SFEW.csv')
        # The number of data points
        self.data_num = self.data.shape[0]

    @staticmethod
    def _person_ids(names):
        """Return the integer person id encoded at the end of each file name.

        i.e. 'Airheads_000519240_00000005.mat' -> 5 (text after the last
        underscore, up to the first dot, parsed as an int).
        """
        return [int(name.split("_")[-1].split(".")[0]) for name in names]

    def _training_person_mask(self, names, training_p):
        """Build a boolean row mask selecting whole persons for training.

        Persons are added one at a time until the number of selected rows
        reaches ``int(training_p * self.data_num)``; every row of a selected
        person is marked True, every other row False.
        """
        person_ids = self._person_ids(names)

        # Count rows per person in a single pass.
        rows_per_person = {}
        for pid in person_ids:
            rows_per_person[pid] = rows_per_person.get(pid, 0) + 1

        # NOTE(review): the threshold is relative to the table loaded in
        # __init__ (self.data_num), not to len(names) -- confirm this is
        # intended when a subset of the data is passed in.
        select_threshold = int(training_p * self.data_num)

        selected_rows = 0
        selected_persons = set()
        # Persons are visited in set iteration order, exactly as before.
        # NOTE(review): the original comment says selection should start at
        # person no.1 (which contains all emotions); consider sorted() if a
        # guaranteed order is required.
        for pid in set(person_ids):
            if selected_rows >= select_threshold:
                break
            selected_rows += rows_per_person[pid]
            selected_persons.add(pid)

        return np.array([pid in selected_persons for pid in person_ids],
                        dtype=bool)

    # method to get the person information
    def GetPerson(self, names, training_p):
        """Return a boolean numpy mask over ``names`` (True = training row).

        :param names: iterable of file names ending in ``_<person id>.<ext>``
        :param training_p: fraction of ``self.data_num`` rows to aim for
        :return: numpy bool array, one entry per name
        """
        return self._training_person_mask(names, training_p)

    # Apply SPI (Strictly Person Independent) protocol
    def SPI_partition(self, data, training_p=0.8):
        '''
        Split so that no person appears in both the train and the test set.

        The value of the first column is transformed to a person id,
        i.e. 'Airheads_000519240_00000005.mat', the last information 00000005
        is the unique identifier of the person in the image, so the
        corresponding numerical value of it is 5.

        :param data: frame with a 'Unnamed: 0' file-name column first
        :param training_p: fraction of rows to aim for in the train set
        :return: (train_data, test_data), both normalized, name column dropped
        '''
        names = data['Unnamed: 0']
        data = self.Normalization(data)
        train_mask = self._training_person_mask(names, training_p)
        return data[train_mask], data[~train_mask]

    # Apply PPI (Partial Person Independent) protocol
    def PPI_partition(self, data, msk):
        """Split the normalized data with a caller-supplied boolean mask.

        :param data: frame whose first column is dropped by Normalization
        :param msk: boolean mask over the rows; True rows go to train
        :return: (train_data, test_data)
        """
        data = self.Normalization(data)
        return data[msk], data[~msk]

    # Apply SPS (strictly person specific) protocol
    def SPS_partition(self, data, training_p=0.8):
        """Randomly split the rows of every person into train and test.

        Each row goes to the train set with probability ``training_p``
        (drawn with ``np.random.rand``), person by person.

        :param data: frame with a 'Unnamed: 0' file-name column first
        :param training_p: per-row probability of landing in the train set
        :return: (train_data, test_data); as before, the frames carry integer
                 column labels and a per-person 0-based index
        """
        names = data['Unnamed: 0']
        data = self.Normalization(data)
        person_ids = self._person_ids(names)

        id_array = np.array(person_ids)
        # Work on the raw values; the width follows the data instead of the
        # previously hard-coded 11 columns.
        all_rows = data.values

        train_parts = []
        test_parts = []
        # Same person order as before so the random draws line up.
        for pid in set(person_ids):
            person_rows = pd.DataFrame(all_rows[id_array == pid])
            msk = np.random.rand(len(person_rows)) < training_p
            train_parts.append(person_rows[msk])
            test_parts.append(person_rows[~msk])

        train_data = pd.concat(train_parts)
        test_data = pd.concat(test_parts)
        return train_data, test_data

    # Normalize the data
    def Normalization(self, data):
        """Return a normalized copy of ``data`` without its first column.

        The first column is dropped, NaN values become 0 and every column
        except 'label' is standardized to (x - mean) / std.  The frame that
        was passed in is left untouched.

        NOTE(review): a column with zero standard deviation ends up as NaN
        because of the division -- confirm whether that can occur.

        :param data: input frame
        :return: new normalized frame
        """
        # drop the first column (non-mutating: the caller keeps its frame)
        data = data.drop(data.columns[0], axis=1)
        # convert na/nan value to 0
        data = data.fillna(0)
        # normalize the data except the target
        for column in data:
            if column != 'label':
                values = data[column]
                data[column] = (values - values.mean()) / values.std()
        return data
# define a customise torch dataset
class DataFrameDataset(torch.utils.data.Dataset):
    """Torch dataset backed by the values of a pandas DataFrame.

    Column 0 of every row is treated as the target, the remaining columns
    as the input features.
    """

    def __init__(self, df):
        # Keep the whole table as a single tensor.
        self.data_tensor = torch.Tensor(df.values)

    # a function to get items by index
    def __getitem__(self, index):
        """Return ``(features, target)`` for the row at ``index``."""
        row = self.data_tensor[index]
        # The target is the first column minus 1 (presumably shifting the
        # 1-based emotion labels to 0-based class ids -- confirm).
        return row[1:], row[0] - 1

    # a function to count samples
    def __len__(self):
        """Return the number of rows held by the dataset."""
        rows, _cols = self.data_tensor.shape
        return rows