-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathload_data.py
115 lines (100 loc) · 3.84 KB
/
load_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
__author__ = '[email protected]'
"""
加载数据
"""
import sys
import codecs
import pickle
import numpy as np
from utils import map_item2id
def load_vocs(paths):
    """
    Load pickled vocabularies from disk.
    Args:
        paths: list of str, paths of the pickled voc files
    Returns:
        vocs: list of the unpickled objects, in the same order as `paths`
            (presumably dicts, one per voc -- confirm against the writer)
    """
    def _read_pickle(voc_path):
        # Pickle data is binary, so the file is opened in 'rb' mode.
        with open(voc_path, 'rb') as handle:
            return pickle.load(handle)

    return [_read_pickle(voc_path) for voc_path in paths]
def load_lookup_tables(paths):
    """
    Load pickled lookup tables from disk.
    Args:
        paths: list of str, paths of the pickled embedding files
    Returns:
        lookup_tables: list of the unpickled objects, in the same order
            as `paths`
    """
    lookup_tables = []
    for path in paths:
        # Pickle data is binary: open with 'rb' only. Passing `encoding`
        # together with a binary mode makes open() raise before anything
        # is read, so no encoding is given here.
        # NOTE: pickle.load can run arbitrary code; only load trusted files.
        with open(path, 'rb') as file_r:
            lookup_tables.append(pickle.load(file_r))
    return lookup_tables
def init_data(path, feature_names, vocs, max_len, model='train',
              use_char_feature=False, word_len=None, sep='\t'):
    """
    Load a whole dataset file into int32 id matrices.

    The file is read in one go (not streamed). Sentences are separated by
    a blank line; every non-empty line of a sentence is one token whose
    columns are split on `sep`.

    Args:
        path: str, path of the data file (read as UTF-8)
        feature_names: list of str, feature names; the i-th name is filled
            from the i-th column of each token line
        vocs: list of dict; after the optional char voc is removed,
            vocs[i] is used for the i-th feature and, in 'train' mode,
            vocs[-1] is used for the labels.
            NOTE: when use_char_feature is True the first entry (the char
            voc) is popped off this list, i.e. the caller's list is
            modified in place.
        max_len: int, maximum sentence length
        model: str, in ('train', 'test'); labels are only read for 'train'
        use_char_feature: bool, whether to build the char feature
        word_len: None or int, maximum word length (used as the last
            dimension of the char matrix when use_char_feature is True)
        sep: str, separator between columns, default is a tab
    Returns:
        data_dict: dict mapping every feature name (plus 'char' when
            use_char_feature is True and 'label' in 'train' mode) to a
            numpy int32 array
    """
    assert model in ('train', 'test')  # only a train set or a test set

    # The context manager closes the file even when reading/decoding fails.
    with codecs.open(path, 'r', encoding='utf-8') as file_r:
        # Split the corpus into a list of sentences.
        sentences = file_r.read().strip().split('\n\n')
    sentence_count = len(sentences)
    # A single pre-formatted argument prints the same text on Python 2 and 3.
    print('sentence number: %d' % sentence_count)
    feature_count = len(feature_names)

    # data_dict holds one matrix per feature, plus the label matrix.
    data_dict = dict()
    for feature_name in feature_names:
        data_dict[feature_name] = np.zeros(
            (sentence_count, max_len), dtype='int32')
    # char feature
    if use_char_feature:
        data_dict['char'] = np.zeros(
            (sentence_count, max_len, word_len), dtype='int32')
        char_voc = vocs.pop(0)  # mutates the caller's list (see docstring)
    if model == 'train':  # the training set carries labels
        data_dict['label'] = np.zeros(
            (sentence_count, max_len), dtype='int32')

    for index, sentence in enumerate(sentences):
        items = sentence.split('\n')  # token lines (features, label)
        # One list per feature column, plus a trailing list for the labels.
        one_instance_items = [[] for _ in range(feature_count + 1)]
        for item in items:
            if item == u"":
                continue
            # Split the line into its feature/label columns.
            feature_tokens = item.split(sep)
            for j in range(feature_count):
                one_instance_items[j].append(feature_tokens[j])
            if model == 'train':
                one_instance_items[-1].append(feature_tokens[-1])
        # Convert every feature column of this sentence to int ids.
        for i, feature_name in enumerate(feature_names):
            data_dict[feature_name][index, :] = map_item2id(
                one_instance_items[i], vocs[i], max_len)
        if use_char_feature:
            # Chars are taken from the first feature column.
            for i, word in enumerate(one_instance_items[0]):
                if i >= max_len:
                    break
                data_dict['char'][index][i, :] = map_item2id(
                    word, char_voc, word_len)
        if model == 'train':
            data_dict['label'][index, :] = map_item2id(
                one_instance_items[-1], vocs[-1], max_len)
        # '\r' keeps the progress counter on a single console line.
        sys.stdout.write('loading data: %d\r' % index)
    return data_dict