-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathspacy_parser.py
81 lines (73 loc) · 3.59 KB
/
spacy_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import os
from spacy.en import English
from datetime import datetime
# Vocabularies used to recognise a sentence that reports a killing.
_VIOLENCE_WORDS = frozenset((
    'killed', 'killing', 'massacred', 'massacring', 'executed', 'executing',
))
_TARGET_WORDS = frozenset(('men', 'women', 'civilians', 'villagers', 'people'))
_MONTHS = frozenset((
    'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August',
    'September', 'October', 'November', 'December',
))
_NUMBER_WORDS = frozenset((
    'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine',
    'ten',
))
# Words that disqualify a sentence when they occur before its first
# violence word.
_CATCH_WORDS = frozenset((
    'anniversary', 'domestic', 'sentenced', 'condemned', 'trial', 'partner',
    'partners', 'questioned', 'court', 'sentencing', 'sentences', 'sentence',
    'charges', 'charged', 'charge', 'stalking',
))
# A number in this range that differs from the record year also disqualifies
# a sentence (same position rule as the catch words).
_YEARS = range(1950, 2015)


def _collect_txt_files(directory):
    """Return the paths of every ``.txt`` file found beneath *directory*."""
    return [
        os.path.join(path, name)
        for path, _subdirs, files in os.walk(directory)
        for name in files
        if name.endswith('.txt')
    ]


def _split_sentences(nlp, text):
    """Return the stripped text of each sentence spaCy detects in *text*."""
    doc = nlp(text)
    return [
        ''.join(doc[i].string for i in range(span.start, span.end)).strip()
        for span in doc.sents
    ]


def _passes_filter(words, record_year):
    """Return True if a violence word occurs before any disqualifying word.

    NOTE(review): a catch word or foreign year that appears *after* the first
    violence word does not disqualify the sentence. This mirrors the original
    behaviour; confirm that the order dependence is intended.
    """
    for word in words:
        # isdecimal() rather than isdigit(): isdigit() also accepts characters
        # such as superscript digits, on which int() raises ValueError.
        if word.isdecimal():
            number = int(word)
            if number in _YEARS and number != record_year:
                return False
        if word in _CATCH_WORDS:
            return False
        if word in _VIOLENCE_WORDS:
            return True
    return False


def _extract_event(tokens, record_year):
    """Return ``{'month': ..., 'fatalities': ...}`` for one sentence, or None.

    A sentence yields an event only when all of the following hold:
    it passes ``_passes_filter``; it contains a month name (the first one is
    used); it contains a target word (only the first one is considered); and
    the syntactic subtree of that target word contains a digit string or a
    spelled-out number from one to ten (the first one is used, as a string).
    """
    words = [token.orth_ for token in tokens]
    if not _passes_filter(words, record_year):
        return None

    month = next((word for word in words if word in _MONTHS), None)
    if month is None:
        return None

    target = next(
        (token for token in tokens if token.orth_ in _TARGET_WORDS), None)
    if target is None:
        return None

    for item in target.subtree:
        word = item.orth_
        if word.isdigit() or word in _NUMBER_WORDS:
            return {'month': month, 'fatalities': word}
    return None


def atrocities_parser(directory):
    """Extract reported killings from the ``.txt`` files under *directory*.

    Every file name is expected to carry a four-digit year at characters 3-6
    and a country name from character 8 up to the ``.txt`` extension. Each
    file is split into sentences with spaCy, and every sentence is re-parsed
    on its own and checked with ``_extract_event``.

    Progress and the total elapsed time are printed to stdout.

    Args:
        directory: Root directory that is walked recursively for ``.txt``
            files.

    Returns:
        A list with one dict per matching sentence, holding the keys
        ``'sentence'``, ``'country'``, ``'year'`` (the filename year minus
        one -- presumably because a report covers the preceding year; verify
        against the data), ``'month'`` and ``'fatalities'`` (a string).

    Raises:
        ValueError: If a file name has no integer at the year position.
    """
    file_list = _collect_txt_files(directory)
    # Loading the language model is expensive; skip it when there is nothing
    # to parse.
    nlp = English() if file_list else None
    starttime = datetime.now()

    list_holder = []
    for filename in file_list:
        # basename() instead of splitting on '/' so that paths using another
        # separator (or containing none at all) are handled as well.
        basename = os.path.basename(filename)
        year = basename[3:7]
        country = basename[8:-4]
        print('parsing %s %s...' % (country, year))
        record_year = int(year) - 1

        # Newlines are flattened to spaces before parsing.
        with open(filename, encoding='utf-8') as f:
            text = f.read().replace('\n', ' ')

        for sentence in _split_sentences(nlp, text):
            event = _extract_event(nlp(sentence), record_year)
            if event is None:
                continue
            record = {
                'sentence': sentence,
                'country': country,
                'year': record_year,
            }
            record.update(event)
            list_holder.append(record)

    print(datetime.now() - starttime)
    return list_holder