baselineSingleDocument.py
from utils import STOPWORDS, LexRank
from path import Path  # path.py, not pathlib
from myRouge import rouge_1
import nltk.data
texts = []
text_dir = Path('data/single-document/BBC News Summary/text/politics')
summaries_dir = Path('data/single-document/BBC News Summary/summaries/politics')
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Build the background corpus used for LexRank's IDF statistics.
for file_path in text_dir.files('*.txt'):
    with file_path.open(mode='rt', encoding='utf-8') as fp:
        texts.append(fp.readlines())

lxr = LexRank(texts, stopwords=STOPWORDS['en'])
scores = {}
# Evaluate on the first nine articles (001.txt .. 009.txt).
for i in range(1, 10):
    text_file_path = text_dir / f'{i:03d}.txt'
    with text_file_path.open(mode='rt', encoding='utf-8') as fp:
        sentences = tokenizer.tokenize(fp.read())

    # Summarize with continuous LexRank (threshold=None); the extra keyword
    # arguments follow the custom LexRank implementation in utils.
    summary_sentences = lxr.get_summary(
        sentences,
        summary_size=8,
        threshold=None,
        include_keyphrase_similarity=False,
        redunduncy_penalty=True,
    )

    # Load the reference (gold) summary for the same article.
    summary_file_path = summaries_dir / f'{i:03d}.txt'
    with summary_file_path.open(mode='rt', encoding='utf-8') as fp:
        model_summary_sentences = tokenizer.tokenize(fp.read())

    produced_summary = ' '.join(summary_sentences)
    model_summary = ' '.join(model_summary_sentences)

    # ROUGE-1 between the produced summary and the reference summary.
    score = rouge_1(produced_summary, model_summary)
    scores[i] = score

print(scores)
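
# A minimal follow-up sketch (not in the original script): collapse the
# per-document results into one corpus-level number. This assumes rouge_1
# returns a plain float per document pair; adjust if it returns a dict.
if scores:
    average_score = sum(scores.values()) / len(scores)
    print(f'average ROUGE-1 over {len(scores)} documents: {average_score:.4f}')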