preprocess.py
import os
import random
import time

import jieba


def preprocess(data_ratio=1.0,
               test_ratio=0.1,
               train_data_save='train_data_1.txt',
               test_data_save='test_data_1.txt',
               save_vocabulary=False):
    """
    Train/test split + vocabulary generation.

    Parameters
    ----------
    data_ratio: fraction of the dataset to use. Default: 1.0.
    test_ratio: fraction of the used data reserved for the test set. Default: 0.1.
    train_data_save: file name for the train split (written under data/).
    test_data_save: file name for the test split (written under data/).
    save_vocabulary: whether to build and save a vocabulary from the sampled data.
    """
    data_path = 'data/meta_data/weibo_senti_100k.csv'
    stopword_path = 'data/meta_data/hit_stopwords.txt'
    train_data_save = os.path.join('data', train_data_save)
    test_data_save = os.path.join('data', test_data_save)
    f_train = open(train_data_save, 'w', encoding='UTF-8')
    f_test = open(test_data_save, 'w', encoding='UTF-8')
    min_seq = 1    # drop words that occur min_seq times or fewer
    top_n = 1000   # keep only the top_n most frequent words
    UNK = '<UNK>'
    PAD = '<PAD>'
    with open(stopword_path, 'r', encoding='UTF-8') as f_stop:
        stop_words = [line.strip() for line in f_stop]
    stop_words.append(' ')  # jieba can emit bare spaces; treat them as stop words
    voc_dict = {}
    print('start processing ...')
    c1 = time.time()
    with open(data_path, 'r', encoding='UTF-8') as f:
        f.readline()  # skip the CSV header line
        total_data = f.readlines()
    random.shuffle(total_data)
    data_size = int(len(total_data) * data_ratio)
    test_size = int(data_size * test_ratio)
    train_size = data_size - test_size
    print('test ratio:', test_ratio)
    print(f'train size:{train_size} test size:{test_size}')
    for idx, line in enumerate(total_data):
        line = line.strip()
        # each row is 'label,review': a one-character label, a comma, the text
        label, content = line[0], line[2:]
        seg_list = jieba.cut(content, cut_all=False)
        seg_items = []
        for word in seg_list:
            if word in stop_words:
                continue
            seg_items.append(word)
            if not save_vocabulary:
                continue
            # count word frequencies for the vocabulary
            if word in voc_dict:
                voc_dict[word] += 1
            else:
                voc_dict[word] = 1
        if idx < test_size:  # the first test_size shuffled rows go to the test set
            f_test.write('{},{}\n'.format(label, ' '.join(seg_items)))
            continue
        f_train.write('{},{}\n'.format(label, ' '.join(seg_items)))
        if idx == data_size - 1:  # stop once data_ratio of the corpus is consumed
            break
    f_train.close()
    f_test.close()
    print('train test data split successfully')
    if save_vocabulary:
        # keep words occurring more than min_seq times, most frequent first,
        # truncated to the top_n entries
        voc_list = sorted([item for item in voc_dict.items() if item[1] > min_seq],
                          key=lambda x: x[1],
                          reverse=True)[:top_n]
        # map each kept word to an integer id; reserve the last two ids for UNK/PAD
        voc_dict = {voc[0]: idx for idx, voc in enumerate(voc_list)}
        voc_dict.update({UNK: len(voc_dict), PAD: len(voc_dict) + 1})
        print(voc_dict)
        # save the vocabulary dictionary as one 'word,id' pair per line
        dict_save = 'data/voc_dict_{}.txt'.format(data_ratio)
        with open(dict_save, 'w', encoding='UTF-8') as fw:
            for key, value in voc_dict.items():
                fw.write('{},{}\n'.format(key, value))
        print('save vocabulary dict at', dict_save)
    print('total time cost: %.1fs' % (time.time() - c1))
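

# --- Illustrative sketch, not part of the original script ---
# The vocabulary file above stores one 'word,id' pair per line, so it could be
# read back roughly like this. rsplit guards against a comma inside a word,
# which the naive writer above would otherwise make ambiguous.
def load_vocabulary(dict_path):
    voc_dict = {}
    with open(dict_path, 'r', encoding='UTF-8') as fr:
        for line in fr:
            word, idx = line.rstrip('\n').rsplit(',', 1)
            voc_dict[word] = int(idx)
    return voc_dict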


if __name__ == '__main__':
    data_ratio = 1
    train_data = 'train_data_{}.txt'.format(data_ratio)
    test_data = 'test_data_{}.txt'.format(data_ratio)
    preprocess(data_ratio=data_ratio,
               train_data_save=train_data,
               test_data_save=test_data,
               save_vocabulary=True)
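
# Example use of the load_vocabulary sketch above (hypothetical; assumes
# preprocess() has run with data_ratio=1 so data/voc_dict_1.txt exists):
#   voc = load_vocabulary('data/voc_dict_1.txt')
#   unk = voc['<UNK>']
#   ids = [voc.get(word, unk) for word in jieba.cut('今天天气真好')]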