forked from anlausch/XWEAT
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprintAraWEATLists.py
267 lines (225 loc) · 13.5 KB
/
printAraWEATLists.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
import pickle
import codecs
# Target-language code; it is not referenced anywhere else in the visible code.
lang = "ar"
# Destination for the generated word lists, opened at import time for UTF-8
# writing. The main loop at the bottom of the file writes to it and closes it.
output_file = codecs.open("WEAT_Arabic_list" ,"w", "utf-8")
def weat_1():
    """Return the word lists for WEAT 1: flowers vs. insects, pleasant vs. unpleasant.

    :return: tuple ``(targets_1, targets_2, attributes_1, attributes_2)`` of four
        lists with 25 English terms each -- flowers, insects, pleasant terms and
        unpleasant terms, in that order
    """
    # Every list is written as one whitespace-separated string and split into
    # its individual terms; the order of the terms is significant.
    flowers = (
        "aster clover hyacinth marigold poppy azalea crocus iris orchid rose "
        "blue-bell daffodil lilac pansy tulip buttercup daisy lily peony violet "
        "carnation gladiola magnolia petunia zinnia"
    ).split()
    insects = (
        "ant caterpillar flea locust spider bedbug centipede fly maggot tarantula "
        "bee cockroach gnat mosquito termite beetle cricket hornet moth wasp "
        "blackfly dragonfly horsefly roach weevil"
    ).split()
    pleasant = (
        "caress freedom health love peace cheer friend heaven loyal pleasure "
        "diamond gentle honest lucky rainbow diploma gift honor miracle sunrise "
        "family happy laughter paradise vacation"
    ).split()
    unpleasant = (
        "abuse crash filth murder sickness accident death grief poison stink "
        "assault disaster hatred pollute tragedy divorce jail poverty ugly cancer "
        "kill rotten vomit agony prison"
    ).split()
    return flowers, insects, pleasant, unpleasant
# WEAT 2: Target 1=Musical instruments, Target 2= Weapons, Attributes 1: pleasant, Attributes 2=unpleasant
def weat_2():
    """Return the word lists for WEAT 2: musical instruments vs. weapons.

    The attribute lists are the same pleasant / unpleasant terms used by WEAT 1.

    :return: tuple ``(targets_1, targets_2, attributes_1, attributes_2)`` of four
        lists with 25 English terms each -- instruments, weapons, pleasant terms
        and unpleasant terms, in that order
    """
    instruments = (
        "bagpipe cello guitar lute trombone banjo clarinet harmonica mandolin "
        "trumpet bassoon drum harp oboe tuba bell fiddle harpsichord piano "
        "viola bongo flute horn saxophone violin"
    ).split()
    weapons = (
        "arrow club gun missile spear axe dagger harpoon pistol sword blade "
        "dynamite hatchet rifle tank bomb firearm knife shotgun teargas cannon "
        "grenade mace slingshot whip"
    ).split()
    pleasant = (
        "caress freedom health love peace cheer friend heaven loyal pleasure "
        "diamond gentle honest lucky rainbow diploma gift honor miracle sunrise "
        "family happy laughter paradise vacation"
    ).split()
    unpleasant = (
        "abuse crash filth murder sickness accident death grief poison stink "
        "assault disaster hatred pollute tragedy divorce jail poverty ugly cancer "
        "kill rotten vomit agony prison"
    ).split()
    return instruments, weapons, pleasant, unpleasant
# Here they deleted the infrequent African American names, and the same number of randomly chosen European American names
def weat_3():
    """Return the word lists for WEAT 3: two groups of first names, pleasant vs. unpleasant.

    Each name list has 50 entries. The names that the original comments list as
    excluded in the original paper are still part of the returned lists; they
    form the tail of each list.

    :return: tuple ``(targets_1, targets_2, attributes_1, attributes_2)`` --
        two lists of 50 names and two lists of 25 attribute terms
    """
    names_a = (
        "Adam Harry Josh Roger Alan Frank Justin Ryan Andrew Jack Matthew Stephen "
        "Brad Greg Paul Hank Jonathan Peter Amanda Courtney Heather Melanie "
        "Katie Betsy Kristin Nancy Stephanie Ellen Lauren Colleen Emily Megan Rachel"
    ).split()
    # Listed as "excluded in the original paper" by the original comment, but
    # returned nonetheless.
    names_a += (
        "Chip Ian Fred Jed Todd Brandon Wilbur Sara Amber Crystal Meredith Shannon "
        "Donna Bobbie-Sue Peggy Sue-Ellen Wendy"
    ).split()

    names_b = (
        "Alonzo Jamel Theo Alphonse Jerome Leroy Torrance Darnell Lamar Lionel "
        "Tyree Deion Lamont Malik Terrence Tyrone Lavon Marcellus Wardell Nichelle "
        "Shereen Ebony Latisha Shaniqua Jasmine Tanisha Tia Lakisha Latoya Yolanda "
        "Malika Yvette"
    ).split()
    # Listed as "excluded" by the original comment, but returned nonetheless.
    names_b += (
        "Lerone Percell Rasaan Rashaun Everol Terryl Aiesha Lashelle "
        "Temeka Tameisha Teretha Latonya Shanise Sharise Tashika Lashandra "
        "Shavonn Tawanda"
    ).split()

    pleasant = (
        "caress freedom health love peace cheer friend heaven loyal pleasure "
        "diamond gentle honest lucky rainbow diploma gift honor miracle sunrise "
        "family happy laughter paradise vacation"
    ).split()
    # Same unpleasant terms as WEAT 1/2, except that the last two entries are
    # "bomb" and "evil" instead of "agony" and "prison".
    unpleasant = (
        "abuse crash filth murder sickness accident death grief poison stink "
        "assault disaster hatred pollute tragedy divorce jail poverty ugly cancer "
        "kill rotten vomit bomb evil"
    ).split()
    return names_a, names_b, pleasant, unpleasant
# again: african american names vs. european american names and pleasant vs unpleasant but with different names
def weat_4():
    """Return the word lists for WEAT 4: two groups of first names, pleasant vs. unpleasant.

    Uses different (shorter) name lists than WEAT 3, with the same attribute terms.

    :return: tuple ``(targets_1, targets_2, attributes_1, attributes_2)`` --
        two lists of 18 names and two lists of 25 attribute terms
    """
    # The original comments note that "Jay" and "Kristen" (first list) and
    # "Tremayne" and "Latonya" (second list) were excluded in the GloVe
    # experiments; all four are kept here as the last entries of their lists.
    names_a = (
        "Brad Brendan Geoffrey Greg Brett Matthew Neil Todd Allison Anne "
        "Carrie Emily Jill Laurie Meredith Sarah Jay Kristen"
    ).split()
    names_b = (
        "Darnell Hakim Jermaine Kareem Jamal Leroy Rasheed Tyrone Aisha Ebony "
        "Keisha Kenya Lakisha Latoya Tamika Tanisha Tremayne Latonya"
    ).split()
    pleasant = (
        "caress freedom health love peace cheer friend heaven loyal pleasure "
        "diamond gentle honest lucky rainbow diploma gift honor miracle sunrise "
        "family happy laughter paradise vacation"
    ).split()
    # Ends with "bomb" and "evil" rather than "agony" and "prison".
    unpleasant = (
        "abuse crash filth murder sickness accident death grief poison stink "
        "assault disaster hatred pollute tragedy divorce jail poverty ugly cancer "
        "kill rotten vomit bomb evil"
    ).split()
    return names_a, names_b, pleasant, unpleasant
# again african american vs european american names, but with different attributes
def weat_5():
    """Return the word lists for WEAT 5: the WEAT 4 name lists with shorter attribute lists.

    :return: tuple ``(targets_1, targets_2, attributes_1, attributes_2)`` --
        two lists of 18 names and two lists of 8 attribute terms
    """
    # As in WEAT 4, the names the original comments mark as excluded in the
    # GloVe experiments (Jay, Kristen / Tremayne, Latonya) are kept at the end.
    names_a = (
        "Brad Brendan Geoffrey Greg Brett Matthew Neil Todd Allison Anne "
        "Carrie Emily Jill Laurie Meredith Sarah Jay Kristen"
    ).split()
    names_b = (
        "Darnell Hakim Jermaine Kareem Jamal Leroy Rasheed Tyrone Aisha Ebony "
        "Keisha Kenya Lakisha Latoya Tamika Tanisha Tremayne Latonya"
    ).split()
    pleasant = "joy love peace wonderful pleasure friend laughter happy".split()
    unpleasant = "agony terrible horrible nasty evil war awful failure".split()
    return names_a, names_b, pleasant, unpleasant
# sex- career
def weat_6():
    """Return the word lists for WEAT 6: two groups of first names, career vs. family terms.

    :return: tuple ``(targets_1, targets_2, attributes_1, attributes_2)`` of four
        lists with 8 English terms each
    """
    names_a = "John Paul Mike Kevin Steve Greg Jeff Bill".split()
    names_b = "Amy Joan Lisa Sarah Diana Kate Ann Donna".split()
    career_terms = (
        "executive management professional corporation salary office business career"
    ).split()
    family_terms = (
        "home parents children family cousins marriage wedding relatives"
    ).split()
    return names_a, names_b, career_terms, family_terms
# sex - math
def weat_7():
    """Return the word lists for WEAT 7: math vs. arts terms, male vs. female terms.

    :return: tuple ``(targets_1, targets_2, attributes_1, attributes_2)`` of four
        lists with 8 English terms each
    """
    math_terms = (
        "math algebra geometry calculus equations computation numbers addition"
    ).split()
    arts_terms = (
        "poetry art dance literature novel symphony drama sculpture"
    ).split()
    male_terms = "male man boy brother he him his son".split()
    female_terms = "female woman girl sister she her hers daughter".split()
    return math_terms, arts_terms, male_terms, female_terms
# sex - science
def weat_8():
    """Return the word lists for WEAT 8: science vs. arts terms, male vs. female terms.

    :return: tuple ``(targets_1, targets_2, attributes_1, attributes_2)`` of four
        lists with 8 English terms each
    """
    science_terms = (
        "science technology physics chemistry Einstein NASA experiment astronomy"
    ).split()
    arts_terms = (
        "poetry art Shakespeare dance literature novel symphony drama"
    ).split()
    male_terms = "brother father uncle grandfather son he his him".split()
    female_terms = "sister mother aunt grandmother daughter she hers her".split()
    return science_terms, arts_terms, male_terms, female_terms
# disease - uncontrollability
def weat_9():
    """Return the word lists for WEAT 9: mood terms vs. illness terms, temporary vs. permanent.

    :return: tuple ``(targets_1, targets_2, attributes_1, attributes_2)`` --
        two lists of 6 target terms and two lists of 7 attribute terms
    """
    mood_terms = "sad hopeless gloomy tearful miserable depressed".split()
    illness_terms = "sick illness influenza disease virus cancer".split()
    # The original note says the w2v experiments replaced "short-term" with
    # "short"; this list keeps "short-term".
    temporary_terms = (
        "impermanent unstable variable fleeting short-term brief occasional"
    ).split()
    permanent_terms = (
        "stable always constant persistent chronic prolonged forever"
    ).split()
    return mood_terms, illness_terms, temporary_terms, permanent_terms
# old - pleasant
def weat_10():
    """Return the word lists for WEAT 10: two groups of first names, pleasant vs. unpleasant.

    The attribute lists are the same short pleasant / unpleasant lists used by WEAT 5.

    :return: tuple ``(targets_1, targets_2, attributes_1, attributes_2)`` of four
        lists with 8 English terms each
    """
    names_a = "Tiffany Michelle Cindy Kristy Brad Eric Joey Billy".split()
    names_b = "Ethel Bernice Gertrude Agnes Cecil Wilbert Mortimer Edgar".split()
    pleasant = "joy love peace wonderful pleasure friend laughter happy".split()
    unpleasant = "agony terrible horrible nasty evil war awful failure".split()
    return names_a, names_b, pleasant, unpleasant
def load_vocab_goran(path):  # load pickle files
    """Unpickle and return the object stored in the file at ``path``.

    :param path: path of the pickle file to read
    :return: whatever object the pickle file contains
    :raises OSError: if the file cannot be opened
    """
    # SECURITY: unpickling can execute arbitrary code, so only call this with
    # files from a trusted source.
    # The context manager closes the file handle; the previous version left it
    # open until garbage collection.
    with open(path, "rb") as pickle_file:
        return pickle.load(pickle_file)
def translate(translation_dict, terms):
    """Translate ``terms`` with ``translation_dict`` and return the de-duplicated result.

    Each term is looked up first by its lower-cased spelling and then by its
    exact spelling. A dictionary entry is unpacked as a ``(male, female)`` pair:
    the male form is always emitted, the female form only when it is neither
    ``None`` nor the empty string. Terms without an entry are passed through
    untranslated.

    :param translation_dict: mapping from a source term to a ``(male, female)`` pair
    :param terms: iterable of source-language strings
    :return: list of translations without duplicates, in order of first occurrence
    """
    translation = []
    for term in terms:
        # The lower-cased key takes precedence over the exact spelling.
        key = term.lower()
        if key not in translation_dict:
            key = term
        if key not in translation_dict:
            # No translation available: keep the original term.
            translation.append(term)
            continue
        male, female = translation_dict[key]
        translation.append(male)
        # Compare by value; ``is ''`` only worked by accident of string interning.
        if female is not None and female != '':
            translation.append(female)
    # dict.fromkeys removes duplicates like set() did, but keeps first-seen
    # order, so the output is the same on every run.
    return list(dict.fromkeys(translation))
def ommit(opened_file, targets_1, targets_2, attributes_1, attributes_2, test_number):
    """Write the four word lists of one WEAT test to ``opened_file`` as tab-separated rows.

    The output is a ``Test Number: <n>`` header line followed by four rows
    labelled ``TI``, ``T2``, ``A1`` and ``A2``. The label and every term are each
    followed by a tab, so every row ends with a trailing tab before its newline.

    :param opened_file: writable text file object
    :param targets_1: strings written on the ``TI`` row
    :param targets_2: strings written on the ``T2`` row
    :param attributes_1: strings written on the ``A1`` row
    :param attributes_2: strings written on the ``A2`` row
    :param test_number: number printed in the header line
    :return: None
    """
    opened_file.write("Test Number: " + str(test_number) + "\n")
    # NOTE(review): the first label is "TI" (capital letter I), unlike
    # "T2"/"A1"/"A2" -- possibly a typo for "T1". It is kept unchanged because
    # it is part of the emitted file format.
    labelled_rows = (
        ("TI", targets_1),
        ("T2", targets_2),
        ("A1", attributes_1),
        ("A2", attributes_2),
    )
    for label, row_terms in labelled_rows:
        opened_file.write(label + "\t")
        for term in row_terms:
            opened_file.write(term + "\t")
        opened_file.write("\n")
# Main script: translate the word lists of WEAT tests 1 to 10 and write them,
# one test after another, to the module-level ``output_file``.
# The list builders are indexed by position, so test N is ``weat_tests[N - 1]``.
weat_tests = (weat_1, weat_2, weat_3, weat_4, weat_5,
              weat_6, weat_7, weat_8, weat_9, weat_10)
try:
    # The translation dictionary is the same for every test, so it is unpickled
    # once here instead of once per loop iteration.
    translation_dict = load_vocab_goran("data/vocab_dict_en_arNoClean.p")
    for test_number, weat_test in enumerate(weat_tests, start=1):
        targets_1, targets_2, attributes_1, attributes_2 = weat_test()
        targets_1 = translate(translation_dict, targets_1)
        targets_2 = translate(translation_dict, targets_2)
        attributes_1 = translate(translation_dict, attributes_1)
        attributes_2 = translate(translation_dict, attributes_2)
        ommit(output_file, targets_1, targets_2, attributes_1, attributes_2, test_number)
finally:
    # Close (and thereby flush) the output file even if a step above fails.
    output_file.close()