-
Notifications
You must be signed in to change notification settings - Fork 2
/
find_intro_case.py
78 lines (62 loc) · 2.62 KB
/
find_intro_case.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import os
import json
from os.path import join
import re
import argparse
def _count_data(path):
    """Count the entries of *path* that are named ``<digits>.json``.

    The whole name has to consist of one or more decimal digits followed by
    ``.json``; names that merely start that way (``3.json.bak``, ``4.jsonl``)
    are not counted.

    Args:
        path: Directory whose immediate entries are inspected.

    Returns:
        The number of entries in ``path`` whose name matches.

    Raises:
        OSError: Propagated from ``os.listdir`` if ``path`` cannot be listed.
    """
    data_name = re.compile(r'[0-9]+\.json')
    # fullmatch anchors the pattern at both ends; match() would only anchor
    # the start and accept any trailing characters after ".json".
    return sum(1 for name in os.listdir(path) if data_name.fullmatch(name))
def read_word_list_from_file(filename):
    """Return the lines of *filename* with surrounding whitespace stripped.

    One list entry is produced per line, in file order. Blank lines are kept
    and show up as empty strings.
    """
    with open(filename) as handle:
        # Iterating the handle yields the same lines as readlines().
        stripped_lines = list(map(str.strip, handle))
    return stripped_lines
def main(data_dir, split, positive_words_file, negative_words_file):
    """Print the samples of one split whose summary contains a sentiment word.

    The files ``0.json`` .. ``<n-1>.json`` inside ``join(data_dir, split)``
    are loaded in order, where ``n`` is the value returned by
    ``_count_data``. A sample is printed (file name, rating, summary text,
    review text, blank line) when its summary has more than five
    space-separated tokens and at least one of those tokens occurs in the
    positive or the negative word list.

    Args:
        data_dir: Directory that contains the split sub-directories.
        split: Name of the sub-directory of ``data_dir`` to scan.
        positive_words_file: Path handed to ``read_word_list_from_file`` for
            the positive words.
        negative_words_file: Path handed to ``read_word_list_from_file`` for
            the negative words.

    Raises:
        OSError: If the split directory or one of the files cannot be read.
        KeyError: If a sample lacks ``overall``, ``reviewText`` or ``summary``.
    """
    split_dir = join(data_dir, split)
    n_data = _count_data(split_dir)

    # A set gives the same membership answers as the concatenated list but
    # without a linear scan for every summary token.
    sentiment_words = set(read_word_list_from_file(positive_words_file))
    sentiment_words.update(read_word_list_from_file(negative_words_file))

    for i in range(n_data):
        # Close each sample file right after parsing instead of leaking the
        # handle until garbage collection.
        with open(join(split_dir, '{}.json'.format(i))) as f:
            js = json.load(f)

        rating = js['overall']
        # 'reviewText' and 'summary' are joined with single spaces, so they
        # are presumably lists of sentence strings - confirm against the data.
        review_text = ' '.join(js['reviewText'])
        summary_text = ' '.join(js['summary'])

        # split(' ') is kept on purpose: consecutive spaces produce empty
        # tokens, which count towards the length threshold as before.
        summary_word_list = summary_text.split(' ')
        has_sentiment_word = any(w in sentiment_words for w in summary_word_list)

        if has_sentiment_word and len(summary_word_list) > 5:
            print("{}.json".format(i))
            print("Rating: {}".format(rating))
            print("Summary: ")
            print(summary_text)
            print("Review: ")
            print(review_text)
            print()
if __name__ == "__main__":
    # Command-line entry point: parse the four options and hand them to main().
    parser = argparse.ArgumentParser(
        description=('Preprocess review data')
    )
    # Every option is required: main() joins and opens these values directly,
    # so a missing one would otherwise surface as a TypeError on None instead
    # of a usage message.
    parser.add_argument('-data_dir', type=str, action='store', required=True,
                        help='The directory of the data.')
    parser.add_argument('-split', type=str, action='store', required=True,
                        help='train or val or test.')
    parser.add_argument('-positive_words_file', type=str, action='store',
                        required=True,
                        help='Path the file of positive sentiment words.')
    parser.add_argument('-negative_words_file', type=str, action='store',
                        required=True,
                        help='Path the file of negative sentiment words.')
    args = parser.parse_args()

    main(args.data_dir, args.split,
         args.positive_words_file, args.negative_words_file)