-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathrewrite_neutral.py
365 lines (333 loc) · 12.9 KB
/
rewrite_neutral.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
import argparse
import re
import language_tool_python
import stanza
# reproducibility bit ----------------
from random import seed
from numpy.random import seed as np_seed
import os
# Seed the stdlib and NumPy global random number generators with a fixed value.
seed(42)
np_seed(42)
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting
# it here presumably only affects child processes, not this one -- confirm.
os.environ['PYTHONHASHSEED'] = str(42)
# -----------------------------------
class NeutralRewriter(object):
    """Rule-based rewriter that turns gendered English into neutral variants.

    Third-person singular pronouns are mapped onto singular 'they' forms
    using the part-of-speech / dependency annotations of a stanza parse,
    optionally followed by a dictionary-based replacement of gendered nouns
    and a grammar clean-up pass through LanguageTool.
    """

    # Gendered term -> neutral term, used by ``_genderneutral``.
    # Matching is plain, case-sensitive substring matching (see
    # ``dict_replace``); keys with a leading space rely on that space to avoid
    # firing inside longer words.
    _GENDER_LANG = {
        #################################
        # 1. CHANGE INTO GENDER NEUTRAL #
        #################################
        # chairman/woman
        'chairman': 'chairperson',
        'chairmen': 'chairpeople',
        'chairwoman': 'chairperson',
        'chairwomen': 'chairpeople',
        # anchorman/woman
        'anchorman': 'anchor',
        'anchormen': 'anchors',
        'anchorwoman': 'anchor',
        'anchorwomen': 'anchors',
        # congresswoman/congressman
        'congressman': 'member of congress',
        'congressmen': 'members of congress',
        'congresswoman': 'member of congress',
        'congresswomen': 'members of congress',
        # policeman/woman
        'policeman': 'police officer',
        'policemen': 'police officers',
        'policewoman': 'police officer',
        'policewomen': 'police officers',
        # spokesman/woman
        'spokesman': 'spokesperson',
        'spokesmen': 'spokespersons',
        'spokeswoman': 'spokesperson',
        'spokeswomen': 'spokespersons',
        # steward/stewardess
        'steward': 'flight attendant',
        'stewards': 'flight attendants',
        'stewardess': 'flight attendant',
        'stewardesses': 'flight attendants',
        # headmaster/mistress
        'headmaster': 'principal',
        'headmasters': 'principals',
        'headmistress': 'principal',
        'headmistresses': 'principals',
        # business man/woman
        'businessman': 'business person',
        'businessmen': 'business people',
        'businesswoman': 'business person',
        # same plural as 'businessmen' so the output does not leak gender
        'businesswomen': 'business people',
        # postman/postwoman
        'postman': 'mail carrier',
        'postmen': 'mail carriers',
        'postwoman': 'mail carrier',
        'postwomen': 'mail carriers',
        # mailman/mailwoman
        'mailman': 'mail carrier',
        'mailmen': 'mail carriers',
        'mailwoman': 'mail carrier',
        'mailwomen': 'mail carriers',
        # salesman/saleswoman
        'salesman': 'salesperson',
        'salesmen': 'salespersons',
        'saleswoman': 'salesperson',
        'saleswomen': 'salespersons',
        # fireman/firewoman
        'fireman': 'firefighter',
        'firemen': 'firefighters',
        'firewoman': 'firefighter',
        'firewomen': 'firefighters',
        # barman/barwoman
        'barman': 'bartender',
        'barmen': 'bartenders',
        'barwoman': 'bartender',
        'barwomen': 'bartenders',
        # cleaning man/lady
        'cleaning man': 'cleaner',
        'cleaning lady': 'cleaner',
        'cleaning men': 'cleaners',
        'cleaning ladies': 'cleaners',
        # foreman/woman
        'foreman': 'supervisor',
        'foremen': 'supervisors',
        'forewoman': 'supervisor',
        'forewomen': 'supervisors',
        #######################################
        # 2. AVOID UNNECESSARY FEMININE FORMS #
        #######################################
        # actor/actress
        'actress': 'actor',
        'actresses': 'actors',
        # hero/heroine
        'heroine': 'hero',
        'heroines': 'heroes',
        # comedian/comedienne
        'comedienne': 'comedian',
        'comediennes': 'comedians',
        # executrix/executor
        'executrix': 'executor',
        'executrices': 'executors',
        'executrixes': 'executors',
        # poetess/poet
        'poetess': 'poet',
        'poetesses': 'poets',
        # usherette/usher
        'usherette': 'usher',
        'usherettes': 'ushers',
        # authoress/author
        'authoress': 'author',
        'authoresses': 'authors',
        # boss lady
        'boss lady': 'boss',
        'boss ladies': 'bosses',
        # waiter/waitress
        'waitress': 'waiter',
        'waitresses': 'waiters',
        #################################
        # 3. AVOIDANCE OF GENERIC 'MAN' #
        #################################
        # average man
        'average man': 'average person',
        'average men': 'average people',
        # best man for the job
        'best man for the job': 'best person for the job',
        'best men for the job': 'best people for the job',
        # layman
        'layman': 'layperson',
        'laymen': 'laypeople',
        # man and wife
        # left space (otherwise e.g. woman and wife => wohusband and wife)
        ' man and wife': ' husband and wife',
        # mankind
        # left space (otherwise e.g. humankind => huhumankind)
        ' mankind': ' humankind',
        # man-made
        # left space (otherwise e.g. human-made => huhuman-made)
        ' man-made': ' human-made',
        # manpower
        # 'manpower': 'staff', Depends on context
        # workmanlike
        'workmanlike': 'skillful',
        # freshman
        'freshman': 'first-year student',
    }

    # Agreement fixes applied BEFORE the LanguageTool pass: singular verb
    # forms left behind after he/she was rewritten to they.
    _PRE_GRAMMAR_FIXES = {
        "’": "'",
        'they is ': 'they are ',
        'They is ': 'They are ',
        'They was ': 'They were ',
        'they was ': 'they were ',
        'They wasn ': 'They weren ',
        'they wasn ': 'they weren ',
        "they 's ": "they are ",
        "they ' s ": "they are ",
        "They ' s ": "They are ",
        "They 's ": "They are ",
        "They does ": "They do ",
        "they does ": "they do ",
    }

    # Fixes applied AFTER the LanguageTool pass: duplicated "'t" fragments and
    # negated singular verb forms.
    _POST_GRAMMAR_FIXES = {
        "'t 't": " 't",
        "'t ' t": " 't",
        "' t ' t": " 't",
        "they doesn": "they don",
        "They doesn": "They don",
        'they isn ': 'they aren ',
        'They isn ': 'They aren ',
        "they hasn": "they haven",
        "They hasn": "They haven",
    }

    def __init__(self, language='en', parse=False, advanced=False):
        """Applies rule-based neutral rewrite operations.

        Parameters
        ----------
        language : str, optional
            language identifier for stanza, by default 'en'
        parse : bool, optional
            if doc requires parsing using stanza, by default False
        advanced : bool, optional
            invoke advanced gender neutral word replacements, by default False
        """
        self.stanza = self._stanza_init(language, parse)
        # The grammar checker is always the US English one, regardless of
        # the ``language`` handed to stanza.
        self.tool = language_tool_python.LanguageTool('en-US')
        self.parse = parse
        self.advanced = advanced

    def _stanza_init(self, language, parse):
        """Initializes the stanza pipeline.

        When ``parse`` is False the input is treated as pre-tokenized text;
        when True, stanza does the tokenization itself.
        """
        # Pass a real boolean rather than the strings 'true'/'false': a
        # non-empty string such as 'false' is truthy in Python, so this does
        # not depend on the library converting the string for us.
        return stanza.Pipeline(
            lang=language, processors='tokenize,mwt,pos,lemma,depparse',
            tokenize_pretokenized=not parse)

    @staticmethod
    def dict_replace(sent, d):
        """Replaces strings in sent based on key value pairs in d.

        Keys are matched literally (regex metacharacters are escaped) and
        case-sensitively, as substrings. All replacements happen in a single
        left-to-right pass, so replaced text is never rewritten again.

        Parameters
        ----------
        sent : str
            a sentence from a document
        d : dict
            key is query, value is replacement string

        Returns
        -------
        str
            a sentence with word keys replaced
        """
        # Longest keys first: regex alternation takes the first alternative
        # that matches, so 'stewardesses' must be tried before its prefix
        # 'steward' (otherwise the result is 'flight attendantesses').
        keys = sorted((key for key in d if key), key=len, reverse=True)
        if not keys:
            # An empty pattern would match at every position.
            return sent
        pattern = re.compile('|'.join(re.escape(key) for key in keys))
        return pattern.sub(lambda hit: d[hit.group(0)], sent)

    def _genderneutral(self, sent):
        """Rewrite specific words to their gender neutral variants."""
        return self.dict_replace(sent, self._GENDER_LANG)

    def _correctgram(self, sent):
        """Manual correct word grams and automatic correct grammar."""
        sent = self.dict_replace(sent, self._PRE_GRAMMAR_FIXES)
        # correct only grammar issues, leave style/typography suggestions be
        grammar_matches = [match for match in self.tool.check(sent)
                           if match.category == 'GRAMMAR']
        sent = language_tool_python.utils.correct(sent, grammar_matches)
        return self.dict_replace(sent, self._POST_GRAMMAR_FIXES)

    @staticmethod
    def match_case(word, query):
        """Case query based on casing of word."""
        return query.capitalize() if word.text[0].isupper() else query

    def _neutral_token(self, word):
        """Return the neutral replacement for a single parsed word.

        Parameters
        ----------
        word : object
            a parsed word exposing ``text``, ``xpos``, ``feats`` and
            ``deprel`` attributes (as read below)

        Returns
        -------
        str
            the neutral pronoun (cased like ``word``), or ``word.text``
            unchanged when the word is not a gendered pronoun
        """
        # All comparisons use the lowercased form so that capitalised
        # pronouns ("Her", "Himself", ...) are handled like lowercase ones.
        lowered = word.text.lower()
        if lowered in ('he', 'she'):
            return self.match_case(word, 'they')
        if lowered == 'his':
            neutral = self.match_case(word, 'their')
            # non-possessive-modifier use ("the book is his") => theirs
            return neutral if word.deprel == 'nmod:poss' else neutral + 's'
        if lowered in ('her', 'her.'):
            feats = word.feats or ''  # feats may be None
            is_possessive = (
                (word.xpos == 'PRP$' and lowered == 'her')
                or (word.xpos == 'PRP' and 'Poss=Yes' in feats))
            neutral = self.match_case(
                word, 'their' if is_possessive else 'them')
            # keep the period of an unsplit "her." token
            return neutral + '.' if lowered.endswith('.') else neutral
        if lowered in ('himself', 'herself'):
            return self.match_case(word, 'themselves')
        if lowered == 'hers':
            return self.match_case(word, 'theirs')
        if lowered == 'him':
            return self.match_case(word, 'them')
        return word.text

    def process_sentence(self, sent, parse=False):
        """Process sentence (runs full rule-based neutralizer).

        Parameters
        ----------
        sent : str or parsed sentence
            single sentence to be converted into gender neutral variants;
            a raw string when ``parse`` is True, otherwise an already parsed
            sentence exposing ``words``
        parse : bool, optional
            if sentence still requires stanza parsing, by default False

        Returns
        -------
        str
            gender-neutral sentence (empty string if parsing yields no
            sentence, e.g. for a blank line)
        """
        if parse:
            parsed = self.stanza(sent).sentences
            if not parsed:
                # Nothing to rewrite; avoid an IndexError on sentences[0].
                return ''
            sent = parsed[0]
        new_sent = " ".join(self._neutral_token(word) for word in sent.words)
        if self.advanced:
            new_sent = self._genderneutral(new_sent)
        return self._correctgram(new_sent)

    def process_document(self, document):
        """Splits document in sentences and rewrites those to gender neutral.

        Parameters
        ----------
        document : str
            a document with multiple sentences, to be parsed

        Yields
        -------
        str
            a sentence that has been made gender neutral
        """
        for sent in self.stanza(document).sentences:
            yield self.process_sentence(sent)

    def process_file(self, file_in):
        """Process file_in line by line to rewrite gender-neutral.

        Parameters
        ----------
        file_in : str
            full path to file to be rewritten

        Yields
        -------
        str
            a sentence that has been made gender neutral
        """
        with open(file_in, 'r') as fi:
            # Stream the file instead of loading every line up front.
            for line in fi:
                if self.parse:
                    # document mode: one line may hold several sentences
                    for sent in self.process_document(line):
                        yield sent
                else:
                    # sentence mode: one sentence per line
                    yield self.process_sentence(line, parse=True)

    def save(self, output, output_file):
        """Save output to output_file, one sentence per line.

        Parameters
        ----------
        output : iterable
            an iterable containing sentences
        output_file : str
            full file path to file to be saved
        """
        with open(output_file, 'w') as fo:
            for sent in output:
                fo.write(sent + '\n')
if __name__ == '__main__':
    # USAGE: python rewrite_neutral.py -i inputF -l en -o outputF
    # Reads the input file, rewrites every sentence to a gender-neutral
    # variant and writes the result, one sentence per line, to the output file.
    parser = argparse.ArgumentParser(
        description='parse sentences using stanzaNLP')
    parser.add_argument("-i", "--input_file", required=True,
                        help="Path to the file to be rewritten.")
    parser.add_argument("-d", "--documents", required=False, default=False,
                        action='store_true',
                        help="Inputs are documents rather than sentences.")
    parser.add_argument("-l", "--language", required=False,  # EN only v
                        default='en',
                        help="Specify language code, e.g. en, es, fr...")
    parser.add_argument("-a", "--advanced", required=False, default=False,
                        action='store_true',
                        help="Invokes the more advanced rewriting")
    # The output path is handed straight to open(); without it the run would
    # fail with a TypeError on None, so let argparse reject the call up front.
    parser.add_argument("-o", "--output_file", required=True,
                        help="Path of the file the rewritten text is saved to.")
    args = parser.parse_args()
    nr = NeutralRewriter(language=args.language, parse=args.documents,
                         advanced=args.advanced)
    nr.save(nr.process_file(args.input_file), args.output_file)