-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcorpus_info.py
121 lines (86 loc) · 2.62 KB
/
corpus_info.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# coding: utf8
import tabulate
import collections
from pathlib import Path
def summarize(corpus):
    """Collect size and annotation statistics for one corpus directory.

    Files are searched recursively: ``input_*`` files below ``<corpus>/input``
    and ``output_B*`` / ``output_C*`` files below ``<corpus>/gold``. Within
    each group files are de-duplicated by bare file name, so a name that
    occurs in several sub-directories is processed once (the last match
    found wins).

    Blank lines are ignored in every file. All files are read as UTF-8 and
    closed as soon as they have been consumed.

    Args:
        corpus: Root directory of the corpus (``str`` or ``Path``).

    Returns:
        A dict with the keys

        * ``'files'``: number of distinct input files,
        * ``'sentences'``: number of non-blank lines over all input files,
        * ``'annotations'``: ``entities['total'] + relations['total']``,
        * ``'entities'``: ``Counter`` of the last whitespace-separated token
          of every ``output_B*`` line, plus a ``'total'`` entry,
        * ``'relations'``: ``Counter`` of the first whitespace-separated
          token of every ``output_C*`` line, plus ``'total'``,
          ``'relations'`` (is-a, part-of, property-of, same-as) and
          ``'roles'`` (subject, target) aggregate entries.
    """
    root = Path(corpus)
    gold = root / 'gold'
    inpt = root / 'input'

    def unique_by_name(directory, pattern):
        # Keyed by bare file name: duplicates across sub-directories collapse
        # into a single entry instead of being counted twice.
        return {fp.name: fp for fp in directory.glob(pattern)}

    def non_blank_lines(fp):
        # The context manager closes the handle deterministically; blank
        # lines are dropped so that indexing the split tokens cannot fail.
        with fp.open(encoding='utf-8') as handle:
            return [line for line in handle if line.strip()]

    input_files = unique_by_name(inpt, '**/input_*')
    b_files = unique_by_name(gold, '**/output_B*')
    c_files = unique_by_name(gold, '**/output_C*')

    files = len(input_files)
    sentences = sum(len(non_blank_lines(fp)) for fp in input_files.values())

    entities = collections.Counter()
    for fp in b_files.values():
        entities.update(line.split()[-1] for line in non_blank_lines(fp))
    entities['total'] = sum(entities.values())

    relations = collections.Counter()
    for fp in c_files.values():
        relations.update(line.split()[0] for line in non_blank_lines(fp))
    # 'total' must be computed before the aggregate entries below are added,
    # otherwise those aggregates would be counted a second time.
    relations['total'] = sum(relations.values())
    relations['relations'] = sum(
        relations[k] for k in 'part-of property-of is-a same-as'.split()
    )
    relations['roles'] = sum(relations[k] for k in 'subject target'.split())

    return {
        'files': files,
        'sentences': sentences,
        'annotations': entities['total'] + relations['total'],
        'entities': entities,
        'relations': relations,
    }
def _add(d1, d2):
    """Return the key-wise sum of two (possibly nested) dicts.

    Values that are dicts (including ``Counter``) are merged recursively;
    all other values are combined with ``+``. A key that is present in only
    one of the two inputs is carried over unchanged, so the operation is
    symmetric and never drops data or fails on a missing key.

    Args:
        d1: First mapping.
        d2: Second mapping.

    Returns:
        A new plain ``dict``; nested mappings in the result are plain dicts
        as well. Neither input is modified or aliased by the result.
    """
    result = {}
    # Keep d1's key order, then append the keys that only d2 knows about.
    keys = list(d1) + [k for k in d2 if k not in d1]
    for k in keys:
        if k in d1 and k in d2:
            v1, v2 = d1[k], d2[k]
            result[k] = _add(v1, v2) if isinstance(v1, dict) else v1 + v2
        else:
            v = d1[k] if k in d1 else d2[k]
            # Merging with an empty dict yields a deep copy of the nested
            # mapping, so the result shares no mutable state with the input.
            result[k] = _add(v, {}) if isinstance(v, dict) else v
    return result
def _add_many(*dicts):
    """Sum any number of (possibly nested) dicts with ``_add``.

    The fold starts from the last argument and works towards the first,
    i.e. ``_add(_add(last, second_to_last), ...)``.

    Args:
        *dicts: The mappings to combine.

    Returns:
        ``{}`` when called without arguments, the single argument itself
        when called with exactly one, otherwise the accumulated ``_add``
        result.
    """
    if not dicts:
        # Nothing to sum: an empty mapping instead of an IndexError.
        return {}
    *earlier, result = dicts
    for other in reversed(earlier):
        result = _add(result, other)
    return result
def _get_key(key, d):
    """Look up a dotted path such as ``"relations.is-a"`` in nested mappings.

    Each dot-separated component of ``key`` is used, in order, to index one
    level deeper into ``d``; the value reached by the last component is
    returned.
    """
    remaining, value = key, d
    while True:
        head, dot, remaining = remaining.partition(".")
        value = value[head]
        if not dot:
            # No further separator: `head` was the final path component.
            return value
def table():
    """Render the corpus statistics as a LaTeX table.

    Summarizes the ``trial``, ``training``, ``develop`` and ``test``
    directories (paths relative to the current working directory), adds them
    up, and returns the ``tabulate`` output in ``latex`` format. Each row
    holds one statistic; the columns are, in order: totals, trial, training,
    develop, test. Rows carry no label column.
    """
    splits = [
        summarize(name) for name in ('trial', 'training', 'develop', 'test')
    ]
    totals = _add_many(*splits)
    columns = [totals, *splits]

    # One dotted path per table row, in display order.
    row_keys = (
        "files",
        "sentences",
        "annotations",
        "entities.total",
        "entities.Concept",
        "entities.Action",
        "relations.roles",
        "relations.subject",
        "relations.target",
        "relations.relations",
        "relations.is-a",
        "relations.part-of",
        "relations.property-of",
        "relations.same-as",
    )

    rows = [
        [_get_key(path, column) for column in columns]
        for path in row_keys
    ]
    return tabulate.tabulate(rows, tablefmt='latex')
if __name__ == '__main__':
    # Script entry point: write the LaTeX statistics table to stdout.
    latex_table = table()
    print(latex_table)