# coding: UTF-8
import os
import pickle as pkl

UNK, PAD = '<UNK>', '<PAD>'  # unknown-character and padding tokens


class Tokenizer:
    def __init__(self, datasets=None, vocab_path=None):
        # Load an existing vocab if a path is given; otherwise build one
        # from the datasets and cache it to disk.
        if vocab_path is not None:
            with open(vocab_path, 'rb') as f:
                vocab = pkl.load(f)
        elif datasets is not None:
            vocab = self.build_vocab(datasets)
            with open('new_vocab.pkl', 'wb') as f:
                pkl.dump(vocab, f)
        else:
            raise ValueError('Either datasets or vocab_path must be provided.')
        self.vocab = vocab
        print(f"Vocab size: {len(vocab)}")

    def build_vocab(self, datasets):
        """Build a character-level vocab from dataset files of 'text<TAB>label' lines."""
        word_to_id = {}
        for dataset in datasets:
            with open(dataset, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue  # skip blank lines
                    content, _ = line.split('\t')
                    # Every character of the text becomes a vocab entry.
                    for word in content:
                        if word not in word_to_id:
                            word_to_id[word] = len(word_to_id)
        # Reserve the two special tokens at the end of the vocab.
        word_to_id.update({UNK: len(word_to_id), PAD: len(word_to_id) + 1})
        return word_to_id

    def encode(self, sentence, max_length):
        # Map each character to its id, falling back to <UNK> for unseen ones.
        tokens = [self.vocab.get(token, self.vocab[UNK]) for token in sentence]
        if len(tokens) > max_length:
            tokens = tokens[:max_length]
        # Pad to max_length; the attention mask marks real tokens with 1.
        input_ids = tokens + [self.vocab[PAD]] * (max_length - len(tokens))
        attention_mask = [1] * len(tokens) + [0] * (max_length - len(tokens))
        token_type_ids = [0] * max_length
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids
        }

    def __call__(self, sentence, max_length=2048):
        return self.encode(sentence, max_length)
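
# A minimal usage sketch (hypothetical file names; assumes each dataset line
# has the form 'text<TAB>label'):
#
#   tokenizer = Tokenizer(datasets=['train.txt', 'dev.txt'])
#   batch = tokenizer('一个例子', max_length=8)
#   # batch['input_ids'] holds character ids padded with <PAD> to length 8;
#   # batch['attention_mask'] marks the real (non-padding) positions with 1.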


class Analysis:
    def __init__(self):
        self.train_record = {}
        self.eval_record = {}
        self.model_record = {}

    def append_train_record(self, train_record_item):
        """Append one training record (a dict of metric name -> value)."""
        for key in train_record_item:
            if key not in self.train_record:
                self.train_record[key] = []
            self.train_record[key].append(train_record_item[key])

    def append_eval_record(self, eval_record_item):
        """Append one evaluation record (a dict of metric name -> value)."""
        for key in eval_record_item:
            if key not in self.eval_record:
                self.eval_record[key] = []
            self.eval_record[key].append(eval_record_item[key])

    def append_model_record(self, uid):
        """Record the uid of a saved model."""
        key = "model_uid"
        if key not in self.model_record:
            self.model_record[key] = []
        self.model_record[key].append(uid)

    def save_all_records(self, uid):
        self.save_record('train_record', uid)
        self.save_record('eval_record', uid)
        self.save_record('model_record', uid)

    def save_record(self, record_name, uid):
        """Write one record dict to ./data_record/<uid>/<record_name>.csv (tab-separated)."""
        record_dict = getattr(self, record_name)
        path = f'./data_record/{uid}'
        os.makedirs(path, exist_ok=True)
        head = list(record_dict.keys())
        if not head:
            return uid  # nothing was recorded under this name
        # One header line, then one tab-separated row per appended record.
        rows = ['\t'.join(str(record_dict[key][idx]) for key in head)
                for idx in range(len(record_dict[head[0]]))]
        result = '\t'.join(head) + '\n' + '\n'.join(rows) + '\n'
        with open(f'{path}/{record_name}.csv', encoding='utf-8', mode='w') as f:
            f.write(result)
        return uid
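

if __name__ == '__main__':
    # Smoke-test sketch (illustrative values only): append a few records and
    # write them under ./data_record/demo/ as tab-separated files.
    analysis = Analysis()
    for step, loss in enumerate([0.9, 0.7, 0.5]):
        analysis.append_train_record({'step': step, 'loss': loss})
    analysis.append_eval_record({'acc': 0.81, 'f1': 0.79})
    analysis.append_model_record('demo-model-0')
    analysis.save_all_records('demo')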