forked from renjunxiang/oqmrc_2018
-
Notifications
You must be signed in to change notification settings - Fork 0
/
vec2weight.py
28 lines (22 loc) · 965 Bytes
/
vec2weight.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import pickle
import numpy as np
from competition.data_deal import gitvec2array
# Names of the embedding sets to convert; each one is read from
# ./vec_all/<name>.pkl.
vec_names = ['baidubaike', 'renmin', 'weibo', 'wiki', 'zhihu']

# One (cut level, tokenizer path template) pair per segmentation granularity.
# Every embedding set is processed against both, in this order.
tokenizer_templates = (
    ('word', './data_transform_0/%s/80000/train_tokenizer.pkl'),
    ('char', './data_transform_0/%s/8000/train_tokenizer.pkl'),
)

for vec_name in vec_names:
    path_vec = './vec_all/%s.pkl' % vec_name
    print('start ', vec_name)

    for cut_level, tokenizer_template in tokenizer_templates:
        path_tokenizer = tokenizer_template % cut_level
        path_weight = './vec_all/%s/weight_%s.npy' % (cut_level, vec_name)
        # gitvec2array is a project helper; presumably it builds the weight
        # array for this tokenizer from the pickled vectors -- verify there.
        vec_all_array = gitvec2array(path_vec, path_tokenizer)
        # The target directory ./vec_all/<cut_level>/ is not created here.
        np.save(path_weight, vec_all_array)

    print('finish ', vec_name)
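# Debug snippet (left disabled): load a saved tokenizer and check the
# lengths of its word_index / index_word mappings.
# with open(path_tokenizer, mode='rb') as f:
#     tokenizer = pickle.load(f)
# len(tokenizer.word_index)
# len(tokenizer.index_word)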