bert.py
import torch
from transformers import AutoTokenizer, BertForSequenceClassification, AutoModel, BertTokenizerFast
from transformers import Trainer, TrainingArguments
from feature_builder import FeatureBuilder
import numpy as np
import random
import sys
from tensorflow.keras import backend as K
class AcademicDataset(torch.utils.data.Dataset):
    """Wraps tokenizer encodings and float grade labels for the Hugging Face Trainer."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Encodings were built with return_tensors="pt", so slice the existing tensors
        # rather than re-wrapping them with torch.tensor (which triggers a copy warning).
        item = {k: v[idx].clone().detach() for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)
def load_data(data_name, dimension, data_type, seed=0, num_samples=1000000):
    base_dir = '/home/skuzi2/{}_dataset/'.format(data_name)
    ids_dir = base_dir + 'data_splits/dim.all.mod.neu.para.1.{}.ids'.format(data_type)
    grades_dir = base_dir + 'annotations/annotation_aggregated.tsv'
    labels = FeatureBuilder.build_labels(ids_dir, grades_dir)
    with open(ids_dir.replace('ids', 'text'), 'r') as text_file:
        lines = text_file.readlines()
    x, y = FeatureBuilder.modify_data_to_dimension(lines, labels, dimension, num_samples=num_samples, seed=seed)
    y = [float(i) for i in y]
    return x, y
def fine_tune_bert(data_name, dimension, max_length, seed=0, num_samples=1000):
    model_name = 'allenai/scibert_scivocab_uncased'
    print('Initializing Tokenizer')
    tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)
    x_data, y_data = load_data(data_name, dimension, 'train', num_samples=num_samples, seed=seed)
    x_test, y_test = load_data(data_name, dimension, 'test.val')
    print('train size: {}, {}. test size: {}, {}'.format(len(x_data), len(y_data), len(x_test), len(y_test)))
    print('Tokenizing')
    train_encodings = tokenizer(x_data, truncation=True, padding=True, max_length=max_length, return_tensors="pt")
    test_encodings = tokenizer(x_test, truncation=True, padding=True, max_length=max_length, return_tensors="pt")
    train_dataset = AcademicDataset(train_encodings, y_data)
    print('Loading BERT')
    # num_labels=1 turns the classification head into a single regression output (MSE loss).
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=1)
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,              # total number of training epochs
        per_device_train_batch_size=16,  # batch size per device during training
        # load_best_model_at_end is not set: no eval_dataset is passed to the Trainer,
        # so there is no validation metric to select a "best" checkpoint by.
    )
    trainer = Trainer(
        model=model,                  # the instantiated Transformers model to be trained
        args=training_args,           # training arguments, defined above
        train_dataset=train_dataset,  # training dataset
    )
    print('Fine-tuning')
    trainer.train()
    print('Finished')
    model_path = '../{}_dataset/bert_models_{}/dim.{}.samples.{}'.format(data_name, seed, dimension, num_samples)
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(model_path)
    print('Done')
    # The test encodings are returned alongside the model for potential downstream use.
    return model, tokenizer, train_encodings, test_encodings
def infer_embeddings(model, tokenizer, lines, output_dir, max_length):
    model.eval()  # inference only: disable dropout
    output_file = open(output_dir, 'w')
    for i in range(0, len(lines), 16):
        print('infer step: {}'.format(i))
        end = min(i + 16, len(lines))
        encodings = tokenizer(lines[i:end], truncation=True, padding=True, max_length=max_length, return_tensors="pt")
        # Keep the batch on the same device as the (possibly GPU-resident) fine-tuned model.
        encodings = {k: v.to(model.device) for k, v in encodings.items()}
        with torch.no_grad():
            outputs = model(**encodings, output_hidden_states=True)
        hidden_states = outputs[1][-1].cpu().numpy()  # last encoder layer: (batch, seq, hidden)
        embeddings = np.mean(hidden_states, axis=1)   # mean-pool over tokens: (batch, hidden)
        for l in range(embeddings.shape[0]):
            line = ['({}, {})'.format(j, embeddings[l, j]) for j in range(embeddings.shape[1])]
            output_file.write('[' + ', '.join(line) + ']\n')
    output_file.close()
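
# A minimal sketch (not part of the original script) of how the "(index, value)"
# lines written by infer_embeddings could be read back into a dense vector; the
# helper name parse_embedding_line is hypothetical and only for illustration.
def parse_embedding_line(line):
    pairs = line.strip().strip('[]').split('), (')
    return np.array([float(p.strip('()').split(', ')[1]) for p in pairs])
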
def main():
    data_name = 'iclr17'  # hard-coded here; could also be read from the command line
    grade_dims = {'education': [0, 1, 2, 3, 4, 5, 6], 'iclr17': [1, 2, 3, 5, 6]}[data_name]
    max_length = 512
    seed = int(sys.argv[1])
    samples_option = int(sys.argv[2])
    K.clear_session()
    samples = {1: [250, 300, 350], 2: [300, 350]}[samples_option]
    for num_samples in samples:
        K.clear_session()
        for dim in grade_dims:
            print('{}: {}, {}'.format(data_name, dim, num_samples))
            model, tokenizer, _, _ = fine_tune_bert(data_name, dim, max_length, num_samples=num_samples, seed=seed)
            print('inferring')
            for data_type in ['train', 'test.val']:
                output_dir = '../{}_dataset/bert_embeddings_{}/dim.{}.samples.{}.{}'.format(
                    data_name, seed, dim, num_samples, data_type)
                data_dir = '/home/skuzi2/{}_dataset/data_splits/dim.all.mod.neu.para.1.{}.text'.format(
                    data_name, data_type)
                with open(data_dir, 'r') as text_file:
                    lines = text_file.readlines()
                infer_embeddings(model, tokenizer, lines, output_dir, max_length)
if __name__ == '__main__':
    main()
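
# Example invocation (a sketch; assumes the /home/skuzi2/<data_name>_dataset input
# directories and the ../<data_name>_dataset/bert_embeddings_<seed>/ output
# directory already exist):
#   python bert.py 0 1
# where argv[1] is the random seed and argv[2] selects the num_samples list
# ({1: [250, 300, 350], 2: [300, 350]}).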