-
Notifications
You must be signed in to change notification settings - Fork 2
/
sentiment_word_stat.py
107 lines (89 loc) · 4.26 KB
/
sentiment_word_stat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import os
import json
from os.path import join
import re
import argparse
def _count_data(path):
    """Count the data files stored directly inside ``path``.

    A data file is a directory entry whose *entire* name consists of one or
    more ASCII digits followed by ``.json`` (for example ``0.json`` or
    ``17.json``).

    Args:
        path: Directory to scan (not recursive).

    Returns:
        The number of entries in ``path`` whose name matches the pattern.

    Raises:
        OSError: If ``path`` cannot be listed (propagated from ``os.listdir``).
    """
    matcher = re.compile(r'[0-9]+\.json')
    # fullmatch, not match: match() only anchors at the start of the name, so
    # stray entries such as "3.json.bak" or "3.jsonl" would inflate the count.
    return sum(1 for name in os.listdir(path) if matcher.fullmatch(name))
def read_word_list_from_file(filename):
    """Read a text file and return its lines with surrounding whitespace removed.

    Every line of the file yields exactly one entry, in file order; a blank
    line therefore yields an empty string.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of stripped lines.
    """
    with open(filename) as handle:
        stripped_lines = list(map(str.strip, handle))
    return stripped_lines
def main(data_dir, split, positive_words_file, negative_words_file):
    """Print sentiment-word and length statistics for one data split.

    The examples are read from ``<data_dir>/<split>/<i>.json`` for every ``i``
    in ``range(n)``, where ``n`` is the value returned by ``_count_data`` for
    that directory. Each JSON object must provide ``'reviewText'`` and
    ``'summary'``; both are joined with single spaces and then split on a
    single space to obtain tokens (so an empty text still yields one empty
    token).

    Args:
        data_dir: Directory that contains the split sub-directories.
        split: Name of the sub-directory of ``data_dir`` to analyse.
        positive_words_file: Path of a file with one positive word per line.
        negative_words_file: Path of a file with one negative word per line.

    Returns:
        None. The statistics are written to standard output.
    """
    split_dir = join(data_dir, split)
    n_data = _count_data(split_dir)

    # A set gives O(1) membership tests; the word lists are only ever used
    # for "is this token a sentiment word" checks.
    sentiment_words = set(read_word_list_from_file(positive_words_file))
    sentiment_words.update(read_word_list_from_file(negative_words_file))

    if n_data == 0:
        # Every average below divides by the number of examples, so an empty
        # split would otherwise end in a ZeroDivisionError.
        print("No data files found in {}".format(split_dir))
        return

    total_num_summ_with_sent_word = 0
    total_num_sent_words_in_summ = 0
    total_num_review = 0
    total_num_review_sents = 0
    total_num_review_tokens = 0
    total_num_summary_tokens = 0
    max_review_tokens = 0
    max_summary_tokens = 0
    total_num_present_summary_tokens = 0
    total_num_short_review = 0
    total_num_short_summary = 0

    for i in range(n_data):
        total_num_review += 1
        # Close each file as soon as it is parsed instead of leaking one
        # handle per example.
        with open(join(split_dir, '{}.json'.format(i))) as f:
            js = json.load(f)

        review_sent_list = js['reviewText']
        review_word_list = ' '.join(review_sent_list).split(' ')
        summary_word_list = ' '.join(js['summary']).split(' ')

        num_review_sents = len(review_sent_list)
        num_review_tokens = len(review_word_list)
        num_summary_tokens = len(summary_word_list)

        if num_summary_tokens < 2:
            total_num_short_summary += 1
        if num_review_tokens < 8:
            total_num_short_review += 1

        # Built once per example so the per-token lookup below is O(1).
        review_vocab = set(review_word_list)
        num_matched_sent_words = 0
        for w in summary_word_list:
            if w in sentiment_words:
                num_matched_sent_words += 1
            if w in review_vocab:
                # Summary token that also occurs in the review ("present").
                total_num_present_summary_tokens += 1

        if num_matched_sent_words > 0:
            total_num_summ_with_sent_word += 1
        total_num_sent_words_in_summ += num_matched_sent_words

        total_num_review_sents += num_review_sents
        total_num_review_tokens += num_review_tokens
        total_num_summary_tokens += num_summary_tokens

        max_review_tokens = max(max_review_tokens, num_review_tokens)
        max_summary_tokens = max(max_summary_tokens, num_summary_tokens)

    print("% of summary contains sentiment words:\t{:.2f}".format(total_num_summ_with_sent_word/total_num_review * 100))
    print("avg # of sentiment word per summary:\t{:.3f}".format(total_num_sent_words_in_summ/total_num_review))
    print("avg # tokens in summary:\t{:.3f}".format(total_num_summary_tokens/total_num_review))
    print("avg # tokens in review:\t{:.3f}".format(total_num_review_tokens/total_num_review))
    print("avg # sentences in review:\t{:.3f}".format(total_num_review_sents/total_num_review))
    print("max # tokens in summary:\t{}".format(max_summary_tokens))
    print("max # tokens in review:\t{}".format(max_review_tokens))
    # total_num_summary_tokens is at least 1 per example because str.split(' ')
    # never returns an empty list, so this division is safe once n_data > 0.
    print("% of present tokens in summary:\t{:.2f}".format(total_num_present_summary_tokens/total_num_summary_tokens * 100))
    print("# short reviews:\t{}".format(total_num_short_review))
    print("# short summaries:\t{}".format(total_num_short_summary))
if __name__ == "__main__":
    # Command-line entry point: collect the four paths/names and hand them to
    # main(). All four options are mandatory; without required=True a missing
    # option would be passed on as None and fail later with an opaque
    # TypeError inside os.path.join / open.
    parser = argparse.ArgumentParser(
        description=('Preprocess review data')
    )
    parser.add_argument('-data_dir', type=str, action='store', required=True,
                        help='The directory of the data.')
    parser.add_argument('-split', type=str, action='store', required=True,
                        help='train or val or test.')
    parser.add_argument('-positive_words_file', type=str, action='store',
                        required=True,
                        help='Path the file of positive sentiment words.')
    parser.add_argument('-negative_words_file', type=str, action='store',
                        required=True,
                        help='Path the file of negative sentiment words.')
    args = parser.parse_args()

    main(args.data_dir, args.split, args.positive_words_file, args.negative_words_file)