forked from vnmssnhv/NeuTralRewriter
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreatetestset.py
124 lines (104 loc) · 4.21 KB
/
createtestset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# Retrieve sentences containing he, she, him, his, her, hers, himself or herself
import codecs
import argparse
def maketestset(infile, outfile, total_counts):
    """Build a pronoun test set from ``infile`` and write it to ``outfile``.

    Reads every line of ``infile``, keeps the ones picked by ``find_sents``
    for the given ``total_counts`` budget, prints how many lines were kept,
    and writes the kept lines to ``outfile``.
    """
    selected = find_sents(read_file(infile), total_counts)
    print(len(selected))
    write_output(outfile, selected)
def read_file(input):
    """Return the lines of the text file at path ``input``.

    Each returned line keeps its trailing newline, as produced by
    ``readlines()``.
    """
    # The built-in open() replaces codecs.open(): without an ``encoding``
    # argument codecs.open() merely forwards to open(), and it is deprecated
    # in recent Python versions.
    # NOTE: the file is decoded with the platform default encoding, exactly
    # as before.
    with open(input, 'r') as inF:
        return inF.readlines()
def write_output(outputfile, output):
    """Write the strings in ``output`` to ``outputfile``, back to back.

    The strings are concatenated without any separator, so each one is
    expected to carry its own line ending. An existing file at
    ``outputfile`` is overwritten.
    """
    # The built-in open() replaces codecs.open(): without an ``encoding``
    # argument codecs.open() merely forwards to open(), and it is deprecated
    # in recent Python versions.
    # NOTE: the file is encoded with the platform default encoding, exactly
    # as before.
    with open(outputfile, 'w') as outF:
        outF.write(''.join(output))
def find_sents(sentences, total_counts):
    """Select sentences containing gendered pronouns, up to a per-pronoun quota.

    Each of the eight pronouns gets a quota of ``total_counts // 8``. A
    sentence is selected when it contains at least one pronoun whose quota
    is not yet filled; every occurrence of such a pronoun in the sentence is
    then added to that pronoun's counter. A counter can therefore end above
    the quota, and pronouns whose quota is already filled are not counted
    even when they occur in a selected sentence.

    A pronoun is detected only when it is surrounded by single spaces in the
    lower-cased sentence, so a pronoun that is the first token of the line,
    or the last token directly before the newline, does not trigger
    selection. Occurrences are counted on tokens obtained by splitting the
    lower-cased sentence on single spaces.

    A summary of all counters is printed before returning.

    Args:
        sentences: iterable of sentence strings.
        total_counts: overall pronoun budget, divided evenly over the
            eight pronouns.

    Returns:
        The selected sentences, unmodified and in input order.
    """
    # The tuple order is the order used in the printed summary.
    pronouns = ('he', 'she', 'her', 'him', 'hers', 'his', 'herself', 'himself')
    maxcount = total_counts // len(pronouns)
    counts = dict.fromkeys(pronouns, 0)
    testset = []

    for sent in sentences:
        # Once every quota is filled no further sentence can be selected,
        # so the rest of the input does not need to be scanned.
        if all(seen >= maxcount for seen in counts.values()):
            break

        # Lower-case and tokenise once per sentence instead of per pronoun.
        lowered = sent.lower()
        tokens = lowered.split(" ")

        add_sent = False
        for pronoun in pronouns:
            if counts[pronoun] >= maxcount:
                continue
            if ' ' + pronoun + ' ' in lowered:
                add_sent = True
                counts[pronoun] += tokens.count(pronoun)

        if add_sent:
            testset.append(sent)

    summary = ["%scounter: %s" % (pronoun, counts[pronoun]) for pronoun in pronouns]
    summary.append("totalcounter: %s" % sum(counts.values()))
    print("\n".join(summary))
    return testset
def countOccurences(str, word):
    """Count how often ``word`` occurs as a whole token in ``str``.

    ``str`` is lower-cased and split on single space characters; a token
    counts only when it equals ``word`` exactly. ``word`` itself is not
    lower-cased, so it must already be lower-case to match anything.
    Because only the space character separates tokens, a token with
    attached punctuation or a trailing newline (e.g. ``"he\\n"``) does not
    match.

    Returns:
        The number of matching tokens.
    """
    return str.lower().split(" ").count(word)
if __name__ == '__main__':
    # USAGE: python createtestset.py -i inputF -o outputF [-t totalCount]
    parser = argparse.ArgumentParser(
        description='select sentences containing gendered pronouns to build a test set')
    parser.add_argument("-i", "--input_file", required=True)
    parser.add_argument("-o", "--output_file", required=True)
    # -t is optional so that the default of 2000 can actually apply (it was
    # previously marked required, which made the default unreachable).
    # type=int lets argparse reject a non-numeric value with a usage error.
    parser.add_argument("-t", "--total_count", type=int, default=2000)
    args = parser.parse_args()
    maketestset(args.input_file, args.output_file, args.total_count)