# mydatasets.py
import re
import os
import random
import tarfile
import urllib.request  # plain `import urllib` does not expose urllib.request
import pandas as pd
from torchtext import data
from torchtext.vocab import Vectors


class ResumeDataset(data.Dataset):

    @staticmethod
    def sort_key(ex):
        return len(ex.text)

    def __init__(self, text_field, label_field, excel_file, text_path, examples=None):
        fields = [('text', text_field), ('label', label_field)]
        if examples is None:
            examples = []
            df = pd.read_excel(excel_file)
            # Walk every row of the annotation sheet, reading the label and the file id.
            # '标签' ("label") and '文件id' ("file id") are the column names in the Excel file.
            for i in range(len(df)):
                label = df.loc[i]['标签']
                # Skip rows that have no label.
                if not isinstance(label, str):
                    continue
                label = label.split('-')[-1]
                text_file = text_path + df.loc[i]['文件id'] + '.txt'
                with open(text_file, 'r') as f:
                    text = f.read()
                examples += [data.Example.fromlist([text, label], fields)]
        super().__init__(examples, fields)

    @classmethod
    def splits(cls, excel_file, text_path, text_field, label_field, dev_ratio=.1, shuffle=True):
        # I believe examples is an attribute set by the parent class data.Dataset.
        examples = cls(text_field, label_field, excel_file, text_path).examples
        # print(len(examples))
        if shuffle:
            random.shuffle(examples)
        dev_index = -1 * int(dev_ratio * len(examples))
        # Passing the slices back in effectively calls this class's __init__() again.
        return (cls(text_field, label_field, excel_file, text_path, examples=examples[:dev_index]),
                cls(text_field, label_field, excel_file, text_path, examples=examples[dev_index:]))


def resume(excel_file, text_path, text_field, label_field, batch_size, wv_model=None, use_wv=False, **kargs):
    train_data, dev_data = ResumeDataset.splits(excel_file, text_path, text_field, label_field)
    # print(train_data, dir(train_data))
    # print(dir(train_data.examples[0]))
    # print(train_data.examples[0].text)
    # os._exit(1)
    # label_field.build_vocab(train_data, dev_data)
    # A doubly nested list is needed to load words; a single-level list loads characters:
    # text_field.build_vocab([['i'], ['and'], ['you']])
    if use_wv:
        # Initialize the vocabulary from a pre-trained word2vec model.
        model_name = wv_model.split('/')[-1]
        path = '/'.join(wv_model.split('/')[:-1])
        vectors = Vectors(name=model_name, cache=path)
        # text_field.build_vocab([wv_model.wv.vocab.keys()])
        # Load the pre-trained word2vec vectors into the vocabulary.
        text_field.build_vocab([vectors.itos], vectors=vectors)
        # text_field.vocab.load_vectors(vectors)
    else:
        # Build the vocabulary from the dataset itself.
        text_field.build_vocab(train_data, dev_data)
    label_field.build_vocab(train_data, dev_data)
    train_iter, dev_iter = data.Iterator.splits(
        (train_data, dev_data),
        batch_sizes=(batch_size, len(dev_data)),
        **kargs)
    return train_iter, dev_iter
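

# A minimal usage sketch for resume() (my own illustration, not part of the
# original project; the Excel file, text directory, and word2vec path below
# are hypothetical). Note text_path needs a trailing slash, since it is
# concatenated directly with the file id:
#
#   text_field = data.Field(lower=True)
#   label_field = data.Field(sequential=False)
#   train_iter, dev_iter = resume('annotations.xlsx', 'texts/', text_field,
#                                 label_field, batch_size=64,
#                                 wv_model='models/word2vec.txt', use_wv=True,
#                                 device='cpu', repeat=False)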


class TarDataset(data.Dataset):
    """Defines a Dataset loaded from a downloadable tar archive.

    Attributes:
        url: URL where the tar archive can be downloaded.
        filename: Filename of the downloaded tar archive.
        dirname: Name of the top-level directory within the tar archive that
            contains the data files.
    """

    @classmethod
    def download_or_unzip(cls, root):
        path = os.path.join(root, cls.dirname)
        if not os.path.isdir(path):
            tpath = os.path.join(root, cls.filename)
            if not os.path.isfile(tpath):
                print('downloading')
                urllib.request.urlretrieve(cls.url, tpath)
            with tarfile.open(tpath, 'r') as tfile:
                print('extracting')
                tfile.extractall(root)
        return os.path.join(path, '')
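

# TarDataset is meant to be subclassed: the class attributes url, filename,
# and dirname drive download_or_unzip(). MR below is the real example in this
# file; a hypothetical subclass would look like:
#
#   class MyCorpus(TarDataset):
#       url = 'https://example.com/my-corpus.tar.gz'  # made-up URL
#       filename = 'my-corpus.tar.gz'
#       dirname = 'my-corpus'
#
#   path = MyCorpus.download_or_unzip('.')  # returns './my-corpus/'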


class MR(TarDataset):

    url = 'https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz'
    filename = 'rt-polaritydata.tar'
    dirname = 'rt-polaritydata'

    # If this function is commented out, the code no longer runs properly.
    @staticmethod
    def sort_key(ex):
        return len(ex.text)

    # Note: **kwargs was removed from the signature; the code runs fine without it.
    def __init__(self, text_field, label_field, path=None, examples=None):
        """Create an MR dataset instance given a path and fields.

        Arguments:
            text_field: The field that will be used for text data.
            label_field: The field that will be used for label data.
            path: Path to the data file.
            examples: The examples that contain all the data.
        """
        def clean_str(string):
            """
            Tokenization/string cleaning for all datasets except for SST.
            Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
            """
            string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
            string = re.sub(r"\'s", " \'s", string)
            string = re.sub(r"\'ve", " \'ve", string)
            string = re.sub(r"n\'t", " n\'t", string)
            string = re.sub(r"\'re", " \'re", string)
            string = re.sub(r"\'d", " \'d", string)
            string = re.sub(r"\'ll", " \'ll", string)
            string = re.sub(r",", " , ", string)
            string = re.sub(r"!", " ! ", string)
            string = re.sub(r"\(", " \\( ", string)
            string = re.sub(r"\)", " \\) ", string)
            string = re.sub(r"\?", " \\? ", string)
            string = re.sub(r"\s{2,}", " ", string)
            return string.strip()
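
        # For reference, two sample transformations (my own examples, not from
        # the original file; note the escaped punctuation tokens it produces):
        #   clean_str("Isn't it great?")  -> "Is n't it great \?"
        #   clean_str("good, (not bad)!") -> "good , \( not bad \) !"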

        text_field.preprocessing = data.Pipeline(clean_str)
        fields = [('text', text_field), ('label', label_field)]
        if examples is None:
            path = self.dirname if path is None else path
            examples = []
            with open(os.path.join(path, 'rt-polarity.neg'), errors='ignore') as f:
                examples += [
                    data.Example.fromlist([line, 'negative'], fields) for line in f]
            with open(os.path.join(path, 'rt-polarity.pos'), errors='ignore') as f:
                examples += [
                    data.Example.fromlist([line, 'positive'], fields) for line in f]
        # super(MR, self).__init__(examples, fields, **kwargs)  # Python 2 style
        # In Python 3 the arguments to super() can be omitted; super() calls a
        # method on the parent class, here data.Dataset.__init__().
        super().__init__(examples, fields)

    @classmethod
    def splits(cls, text_field, label_field, dev_ratio=.1, shuffle=True, root='.', **kwargs):
        """Create dataset objects for splits of the MR dataset.

        Arguments:
            text_field: The field that will be used for the sentence.
            label_field: The field that will be used for label data.
            dev_ratio: The fraction of the data split off as the validation set.
            shuffle: Whether to shuffle the data before splitting.
            root: The root directory into which the dataset's tar archive is
                expanded, and therefore the tree under which the data files
                are stored.
            Remaining keyword arguments: Passed to the splits method of
                Dataset.
        """
        path = cls.download_or_unzip(root)
        # I believe examples is an attribute set by the parent class data.Dataset.
        examples = cls(text_field, label_field, path=path).examples
        # print(len(examples))
        if shuffle:
            random.shuffle(examples)
        dev_index = -1 * int(dev_ratio * len(examples))
        # Passing the slices back in effectively calls MR's __init__() again.
        return (cls(text_field, label_field, examples=examples[:dev_index]),
                cls(text_field, label_field, examples=examples[dev_index:]))


def mr(text_field, label_field, batch_size, **kargs):
    train_data, dev_data = MR.splits(text_field, label_field)
    # print(train_data, dir(train_data))
    # print(dir(train_data.examples[0]))
    # print(train_data.examples[0].text)
    # os._exit(1)
    text_field.build_vocab(train_data, dev_data)
    # label_field.build_vocab(train_data, dev_data)
    # A doubly nested list is needed to load words; a single-level list loads characters:
    # text_field.build_vocab([['i'], ['and'], ['you']])
    # Use this line to load an existing vocab:
    # text_field.build_vocab([word2vec_model.vocab])
    label_field.build_vocab(train_data, dev_data)
    train_iter, dev_iter = data.Iterator.splits(
        (train_data, dev_data),
        batch_sizes=(batch_size, len(dev_data)),
        **kargs)
    return train_iter, dev_iter
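

if __name__ == '__main__':
    # Smoke test (my own addition, assuming the legacy torchtext 0.x
    # Field/Iterator API used throughout this file; on very old versions,
    # pass device=-1 instead of 'cpu'). The MR archive is downloaded into
    # the current directory on first use.
    text_field = data.Field(lower=True)
    label_field = data.Field(sequential=False)
    train_iter, dev_iter = mr(text_field, label_field, batch_size=64,
                              device='cpu', repeat=False)
    batch = next(iter(train_iter))
    print(batch.text.shape, batch.label.shape)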