forked from spyysalo/standoff2conll
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathconll2standoff.py
executable file
·89 lines (66 loc) · 1.92 KB
/
conll2standoff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/env python
# coding: utf8
"""
Convert CoNLL to BioNLP standoff.
Accept IO, IOB, and IOBES tags.
Be graceful about tag-sequence errors.
Input needs at least 4 tab-separated columns:
token, start offset, end offset, tag [, more...]
"""
from __future__ import unicode_literals
import sys
import codecs
from document import Document
def main():
    """
    Run as script: read CoNLL from STDIN, write standoff to STDOUT.

    Both streams are handled as UTF-8.
    """
    # The codecs reader/writer must wrap *byte* streams.  On Python 3 the
    # standard streams are text streams that expose the underlying byte
    # stream as ``.buffer``; Python 2 streams have no such attribute and
    # are byte streams themselves, so they are used directly.
    stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    stdout = getattr(sys.stdout, 'buffer', sys.stdout)
    conll2standoff(codecs.getreader('utf8')(stdin),
                   codecs.getwriter('utf8')(stdout))
def conll2standoff(src, tgt):
    """Convert CoNLL with character offsets to BioNLP standoff.

    src: iterable of text lines in CoNLL format (see the module docstring
        for the expected columns).
    tgt: object with a ``write()`` method accepting text; it receives the
        complete standoff output in a single call.
    """
    rows = parse_conll(src)
    rows = reformat(rows)
    # Serialise the sanitised rows back to tab-separated text.  Empty rows
    # become empty lines; the text always ends with a newline.
    text = '\n'.join('\t'.join(r) for r in rows) + '\n'
    doc = Document.from_nersuite(text)
    tgt.write(doc.to_standoff())
def parse_conll(lines):
    """Parse CoNLL TSV.

    lines: iterable of text lines.

    Yields one list of column strings per input line.  Lines that are
    empty after stripping trailing whitespace yield an empty list.  Lines
    starting with "# doc_id" (case-insensitive) are skipped entirely.
    """
    for line in lines:
        if line.lower().startswith('# doc_id'):
            # Document-ID header line: carries no token data.
            continue
        # Removes the line terminator along with any other trailing
        # whitespace.
        line = line.rstrip()
        yield line.split('\t') if line else []
def reformat(rows):
    """Reformat CoNLL input to well-formed NERSuite format.

    rows: iterable of rows, each a sequence of column strings with the
        token in column 0 and the tag in column 3; an empty row is passed
        through unchanged.

    Yields the rows with token and tag swapped (tag in column 0, token in
    column 3) and the tag sanitised by fix_tag().  An empty row resets the
    tag context, so the first tagged token after it always starts a new
    entity.

    Rows are copied before being rearranged, so the caller's data is never
    modified.  Iterating raises IndexError for a non-empty row with fewer
    than four columns.
    """
    last = OUTSIDE[0]
    for row in rows:
        if not row:
            last = OUTSIDE[0]
            yield row
            continue
        # Work on a copy: leaves the input untouched and accepts tuples.
        row = list(row)
        # Flip token/tag and sanitise the tag sequence.
        last = fix_tag(row[3], last)
        row[0], row[3] = last, row[0]
        yield row
# Tag vocabulary.  OUTSIDE lists complete tags; INSIDE and BEGIN list
# one-character tag prefixes.  The first member of each tuple is the
# canonical form used in the output.
OUTSIDE = ('O', 'O-NIL')
INSIDE = ('I', 'E')
BEGIN = ('B', 'S')


def fix_tag(tag, last):
    """Ensure a valid IOB sequence.

    tag: the raw input tag of the current token.
    last: the (already fixed) tag of the preceding token.

    Returns 'O' for an outside tag, otherwise the tag's label prefixed
    with 'B' or 'I'.  'B' is used when the input prefix is a begin marker
    ('B' or 'S'), when the preceding tag is outside, or when the preceding
    tag has a different label; 'I' is used in all remaining cases.

    An empty tag is treated as outside.
    """
    if not tag or tag in OUTSIDE:
        # The emptiness check avoids an IndexError on tag[0] below.
        return OUTSIDE[0]
    # Definitely use B or I, including something like "O-chemical".
    prefix, label = tag[0], tag[1:]
    if prefix in BEGIN or last in OUTSIDE or last[1:] != label:
        return BEGIN[0] + label
    return INSIDE[0] + label
# Run the conversion only when executed as a script, not on import.
if __name__ == '__main__':
    main()