-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTestDiversity.py
75 lines (60 loc) · 3.19 KB
/
TestDiversity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
from metrics.Diversity import diversity
import nltk
from matplotlib import pyplot as plt
from tqdm import tqdm
# Relative paths of the text files this script works with.
# DATA_FILE is not referenced by any code visible in this file.
DATA_FILE = 'data/emnlp_news.txt'
# TEST_FILE is only referenced from a commented-out line in the __main__ block.
TEST_FILE = 'data/test_emnlp.txt'
# GENERATED_FILE is the file the __main__ block reads its sentences from.
GENERATED_FILE = 'data/generated_text2.txt'
def get_sentences(filename):
    """Read a text file and split its contents into sentences.

    The file is decoded as ISO-8859-1, read in one go, and handed to
    ``nltk.sent_tokenize``; whatever that call produces is returned.
    """
    with open(filename, 'r', encoding="ISO-8859-1") as text_file:
        raw_text = text_file.read()
    return nltk.sent_tokenize(raw_text)
def find_plot_diversities(test_sentences, corpus_sentences, diversity_file, metric):
    """Score every test sentence against a corpus and save the scores.

    For each sentence in ``test_sentences`` this calls
    ``diversity(sentence, corpus_sentences, metric)``, prints the returned
    pair, and collects its first element (the score). The lowest score and
    the test sentence that produced it are printed, and all scores are
    written to ``diversity_file``.

    Args:
        test_sentences: Sequence of sentences to score. May be empty, in
            which case nothing is scored and an empty list is written.
        corpus_sentences: Sequence of sentences handed to ``diversity`` as
            the reference collection.
        diversity_file: Path of the text file to write. Missing parent
            directories are created.
        metric: Passed through unchanged to ``diversity`` (the script in
            this file uses ``'jaccard'``).

    Returns:
        None. Results are printed and written to ``diversity_file``.
    """
    num_of_tests = len(test_sentences)

    # The example lines are purely informational, so skip them instead of
    # raising IndexError when a collection is empty.
    if len(corpus_sentences) > 0:
        print("Example corpus sentence: ", corpus_sentences[0])

    if num_of_tests == 0:
        # min() of an empty list raises ValueError; there is nothing to
        # score, so record an empty result and stop here.
        print("No test sentences given; writing an empty diversity list.")
        _write_diversities(diversity_file, [])
        return

    print("Example test sentence: ", test_sentences[0])

    diversities = []
    for sentence in tqdm(test_sentences, desc="Generated sentences"):
        div, max_sim_sentence = diversity(sentence, corpus_sentences, metric)
        print(div, max_sim_sentence)
        diversities.append(div)

    # Minimum diversity can be used to then find the sentence and potentially
    # discover reasons causing diversity to decrease
    min_diversity = min(diversities)
    min_diversity_idx = diversities.index(min_diversity)
    print("Min diversity: {}".format(min_diversity))
    print("Sentence with min diversity: {}".format(test_sentences[min_diversity_idx]))
    # print("Novelties for {} sentences: \n {}".format(num_of_tests, diversities))

    _write_diversities(diversity_file, diversities)

    # Plotting is currently disabled; kept for reference.
    # # plot diversities against sentence
    # plt.plot(range(len(diversities)), diversities)
    # plt.xlabel('Sentence')
    # plt.ylabel('Novelty')
    # plt.show()


def _write_diversities(diversity_file, diversities):
    """Write the scores to ``diversity_file`` as a bracketed, one-per-line list."""
    import os  # local import: the file's import block does not provide ``os``

    # Create the target folder first so a missing directory does not throw
    # away scores that may have taken minutes to compute.
    parent_dir = os.path.dirname(diversity_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(diversity_file, mode='w', encoding='utf-8') as f:
        f.write('all_diversities = \n')
        f.write('[\n')
        # The joined text is a single string, so write() is the right call;
        # writelines() would iterate over it character by character.
        f.write(',\n'.join(str(div) for div in diversities))
        f.write('\n]')
if __name__ == '__main__':
    # Sentences and their diversities are saved so they need not be recomputed.
    # test_sentences = get_sentences(TEST_FILE)  # 10785 sentences
    generated_sentences = get_sentences(GENERATED_FILE)  # 11055 sentences

    # Score the first 50 generated sentences against the full generated set.
    sample_sentences = generated_sentences[:50]
    output_path = 'sent/jac_diversities_fake_sent.txt'
    find_plot_diversities(
        sample_sentences,
        generated_sentences,
        diversity_file=output_path,
        metric='jaccard',
    )
# python Test_Diversity.py
# Example corpus sentence: My sources have suggested that so far the company sees no reason to change its tax structures , which are perfectly legal .
# Example test sentence: a bathroom with a glass shower , sink and white .
# Min diversity: 0.7407407407407407
# Sentence with min diversity: a group of motorcycles parked on the sidewalks in a field .
# (tf-gpu) C:\Users\deb.chk\Documents\GitHub\NLP-tools>python Test_Diversity.py
# Example corpus sentence: a bathroom with a glass shower , sink and white .
# Example test sentence: a bathroom with a glass shower , sink and white .
# Generated sentences: 100%|███████████████████████████████████| 9003/9003 [04:33<00:00, 32.86it/s]
# Min diversity: 0.3125
# Sentence with min diversity: a bathroom with a mirrors reflection on far in the toilet .