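# speech_bart.py
# Fine-tunes facebook/bart-base to translate speech transcriptions from the
# NatSGD dataset into their paired LTL strings, tracks Jaccard and TF-IDF
# cosine-similarity scores per epoch, and writes predictions, scores, and the
# trained model to the results/ directory.
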
import os

import numpy as np
import pandas as pd
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from torch.optim import AdamW  # transformers.AdamW is deprecated; use the PyTorch implementation
from torch.utils.data import DataLoader, TensorDataset
from transformers import BartForConditionalGeneration, BartTokenizer
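
# Load the pretrained BART tokenizer and model and move the model to the GPU if available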
model_name = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.empty_cache()
model.to(device)


# Preprocess a DataFrame of (Speech, LTL) pairs into fixed-length token tensors
def preprocess_csv_data(data, max_seq_length):
    # Initialize BART tokenizer
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

    # Lists to store tokenized input and target sequences
    input_ids = []
    attention_masks = []
    target_ids = []
    references = []
    source_texts = []
    target_texts = []

    # Process each row in the DataFrame
    for index, row in data.iterrows():
        # Tokenize source (input) text
        source_text = row["Speech"]
        source_texts.append(source_text)
        encoded_input = tokenizer.encode_plus(source_text, max_length=max_seq_length,
                                              padding="max_length", truncation=True,
                                              return_tensors="pt")
        input_ids.append(encoded_input["input_ids"])
        attention_masks.append(encoded_input["attention_mask"])

        # Tokenize target (translation) text
        target_text = row["LTL"]
        target_texts.append(target_text)
        encoded_target = tokenizer.encode_plus(target_text, max_length=max_seq_length,
                                               padding="max_length", truncation=True,
                                               return_tensors="pt")
        target_ids.append(encoded_target["input_ids"])
        references.append([target_text])

    # Convert lists to PyTorch tensors
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    target_ids = torch.cat(target_ids, dim=0)
    return input_ids, attention_masks, target_ids, source_texts, target_texts
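

# Load the NatSGD dataset and build a DataFrame of speech transcriptions
# paired with their LTL annotations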
dataset = np.load('data/NatSGD_v1.0.npz', allow_pickle=True)
data = dataset['data']
cols = dataset['fields']
data = pd.DataFrame(data[3,6], columns=["Speech", "LTL"])
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)  # Split data into training and test sets
batch_size = 8

# shuffle=False keeps batches aligned with train_source/train_target so that
# predictions can later be written out alongside their source rows
input_ids, attention_masks, target_ids, train_source, train_target = preprocess_csv_data(train_data, max_seq_length=512)
train_data_tensor = TensorDataset(input_ids, attention_masks, target_ids)
train_loader = DataLoader(train_data_tensor, batch_size=batch_size, shuffle=False)

test_input_ids, test_attention_masks, test_target_ids, test_source, test_target = preprocess_csv_data(test_data, max_seq_length=512)
test_data_tensor = TensorDataset(test_input_ids, test_attention_masks, test_target_ids)
test_loader = DataLoader(test_data_tensor, batch_size=batch_size, shuffle=False)

num_epochs = 100
learning_rate = 1e-5
optimizer = AdamW(model.parameters(), lr=learning_rate)
criterion = torch.nn.CrossEntropyLoss()  # not used below: BART computes the loss internally when labels are passed
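

# Evaluation metrics: token-set Jaccard overlap and TF-IDF cosine similarity
# between predicted and reference LTL strings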
def jaccard_similarity(set1, set2):
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union if union != 0 else 0.0


def calculate_cosine_similarity(predictions, references):
    # Ensure predictions and references have the same length
    if len(predictions) != len(references):
        raise ValueError("Input lists must have the same length")

    # Predictions are already plain strings; each reference is a single-element
    # list, so join it into one string
    references = [" ".join(reference) for reference in references]

    # Initialize a TF-IDF vectorizer with text preprocessing
    tfidf_vectorizer = TfidfVectorizer(lowercase=True, tokenizer=str.split)

    # Fit on the predictions and transform both sets into TF-IDF vectors
    prediction_vectors = tfidf_vectorizer.fit_transform(predictions)
    reference_vectors = tfidf_vectorizer.transform(references)

    # Calculate cosine similarity (similarity_scores[i, j] compares prediction i with reference j)
    similarity_scores = cosine_similarity(prediction_vectors, reference_vectors)
    return similarity_scores


# Score file paths: create the output directories and write the CSV-style headers
modality = 'speech'
model_ver = 'bart'
train_scores_dir = os.getcwd() + '/results/scores/speechBart/train/'
test_scores_dir = os.getcwd() + '/results/scores/speechBart/test/'
os.makedirs(train_scores_dir, exist_ok=True)
os.makedirs(test_scores_dir, exist_ok=True)
train_scores_file = train_scores_dir + 'train_score_' + modality + '-' + model_ver + '.txt'
test_scores_file = test_scores_dir + 'test_score_' + modality + '-' + model_ver + '.txt'
with open(train_scores_file, 'w') as f:
    f.write("train_jaccard_similarity_score, train_cosine_similarity\n")
with open(test_scores_file, 'w') as f:
    f.write("test_jaccard_similarity_score, test_cosine_similarity\n")

# Training loop
train_loss_lis = []
test_loss_lis = []
train_data_by_epoch = []
test_data_by_epoch = []
train_score_lis = []
test_score_lis = []

for epoch in range(num_epochs):
    train_data = []
    test_data = []
    total_loss = 0
    train_predictions = []
    train_references = []

    model.train()
    for batch in train_loader:
        input_ids_batch, attn_mask_batch, target_ids_batch = batch
        input_ids_batch, attn_mask_batch, target_ids_batch = \
            input_ids_batch.to(device), attn_mask_batch.to(device), target_ids_batch.to(device)

        outputs = model(input_ids=input_ids_batch, attention_mask=attn_mask_batch, labels=target_ids_batch)
        loss = outputs.loss
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Convert the predicted token IDs to text and add them to the predictions list
        # (greedy argmax over the teacher-forced logits, not free-running generation)
        generated_ids = torch.argmax(outputs.logits, dim=-1)
        batch_predictions = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        train_predictions.extend(batch_predictions)

        # Convert the ground-truth token IDs to text and add them to the references list,
        # wrapping each target in a list so multi-reference metrics can be applied
        target_texts = tokenizer.batch_decode(target_ids_batch, skip_special_tokens=True)
        train_references.extend([[text] for text in target_texts])

    average_loss = total_loss / len(train_loader)
    train_loss_lis.append(average_loss)

    # Token-set Jaccard and TF-IDF cosine similarity over the training predictions
    train_jac_scores = [jaccard_similarity(set(pred.split()), set(" ".join(ref).split()))
                        for pred, ref in zip(train_predictions, train_references)]
    train_mean_jaccard_similarity_score = np.mean(train_jac_scores)
    train_mean_cosine_similarity = calculate_cosine_similarity(train_predictions, train_references).mean()
    train_score_lis.append((train_mean_jaccard_similarity_score, train_mean_cosine_similarity))

    # Keep the (speech, target, prediction, reference) tuples for this epoch
    for i in range(len(train_source)):
        train_data.append((train_source[i], train_target[i], train_predictions[i], train_references[i]))
    train_data_by_epoch.append(train_data)
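
    # Evaluate on the held-out test split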
    test_loss = 0
    test_predictions = []
    test_references = []
    model.eval()
    with torch.no_grad():
        for test_batch in test_loader:
            test_input_ids_batch, test_attn_mask_batch, test_target_ids_batch = test_batch
            test_input_ids_batch, test_attn_mask_batch, test_target_ids_batch = \
                test_input_ids_batch.to(device), test_attn_mask_batch.to(device), test_target_ids_batch.to(device)

            test_outputs = model(input_ids=test_input_ids_batch, attention_mask=test_attn_mask_batch, labels=test_target_ids_batch)
            test_loss += test_outputs.loss.item()

            generated_ids_test = torch.argmax(test_outputs.logits, dim=-1)
            test_batch_predictions = tokenizer.batch_decode(generated_ids_test, skip_special_tokens=True)
            test_predictions.extend(test_batch_predictions)

            # Wrap each reference in a list, matching the training-reference format
            target_texts_test = tokenizer.batch_decode(test_target_ids_batch, skip_special_tokens=True)
            test_references.extend([[text] for text in target_texts_test])

    average_test_loss = test_loss / len(test_loader)
    test_loss_lis.append(average_test_loss)

    test_jac_scores = [
        jaccard_similarity(set(pred.split()), set(" ".join(ref).split()))
        for pred, ref in zip(test_predictions, test_references)
    ]
    test_mean_jaccard_similarity_score = np.mean(test_jac_scores)
    test_mean_cosine_similarity = calculate_cosine_similarity(test_predictions, test_references).mean()
    test_score_lis.append((test_mean_jaccard_similarity_score, test_mean_cosine_similarity))

    for i in range(len(test_source)):
        test_data.append((test_source[i], test_target[i], test_predictions[i], test_references[i]))
    test_data_by_epoch.append(test_data)

    print(f'Epoch {epoch+1} Train Loss: {average_loss} Test Loss {average_test_loss} '
          f'Test Jaccard Score {test_mean_jaccard_similarity_score} '
          f'Test Cosine Similarity: {test_mean_cosine_similarity}')
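
# Write the per-epoch (speech, LTL, prediction, reference) records to text files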
train_data_directory = os.getcwd() + '/results/predictions/speechBart/train/'
test_data_directory = os.getcwd() + '/results/predictions/speechBart/test/'
os.makedirs(train_data_directory, exist_ok=True)
os.makedirs(test_data_directory, exist_ok=True)
train_data_paths = [f'{train_data_directory}train_data_epoch_{epoch + 1}.txt' for epoch in range(num_epochs)]
test_data_paths = [f'{test_data_directory}test_data_epoch_{epoch + 1}.txt' for epoch in range(num_epochs)]

for epoch, epoch_train_data in enumerate(train_data_by_epoch):
    with open(train_data_paths[epoch], 'w') as train_file:
        for data_point in epoch_train_data:
            train_file.write(f'Speech: {data_point[0]}\n')
            train_file.write(f'LTL: {data_point[1]}\n')
            train_file.write(f'Predictions: {data_point[2]}\n')
            train_file.write(f'References: {data_point[3]}\n\n')

for epoch, epoch_test_data in enumerate(test_data_by_epoch):
    with open(test_data_paths[epoch], 'w') as test_file:
        for data_point in epoch_test_data:
            test_file.write(f'Speech: {data_point[0]}\n')
            test_file.write(f'LTL: {data_point[1]}\n')
            test_file.write(f'Predictions: {data_point[2]}\n')
            test_file.write(f'References: {data_point[3]}\n\n')

# Append the per-epoch scores beneath the headers written before training
with open(train_scores_file, 'a') as file:
    for item in train_score_lis:
        file.write(f'{item[0]}, {item[1]}\n')

with open(test_scores_file, 'a') as file:
    for item in test_score_lis:
        file.write(f'{item[0]}, {item[1]}\n')
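
# Save the fine-tuned model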
model_file = os.getcwd() + '/results/speech_bart.pth'
torch.save(model, model_file)