tok2conll.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Script to serialize automatic parses for .tok files
"""
import io, os, sys, re
from glob import glob
from argparse import ArgumentParser
from EnsembleSentencer import EnsembleSentencer
from lib.exec import exec_via_temp
script_dir = os.path.dirname(os.path.realpath(__file__))
lib = os.path.abspath(script_dir + os.sep + "lib")

class Tok2Conll:

	def __init__(self, lang="eng", model="eng.rst.gum"):
		self.lang = lang
		lang_map = {"deu": "german", "eng": "english", "spa": "spanish", "fra": "french", "nld": "dutch", "rus": "russian", "eus": "basque", "por": "portuguese", "zho": "chinese", "tur": "turkish"}
		self.long_lang = lang_map[lang] if lang in lang_map else lang
		self.genre_pat = "^(..)"
		try:
			self.udpipe_model = glob(os.path.abspath(os.path.join(lib, "udpipe", self.long_lang + "*.udpipe")))[0]
		except IndexError:
			sys.stderr.write("! No UDPipe model matching " + self.long_lang + "*.udpipe found in " + os.path.abspath(os.path.join(lib, "udpipe")) + "\n")
			sys.exit(1)
		self.udpipe_path = os.path.abspath(os.path.join(lib, "udpipe")) + os.sep

	def run_udpipe(self, text):
		# Tag and parse the sentence-split CoNLL-U text with the pre-trained UDPipe model,
		# running via a temporary file (the "tempfilename" placeholder is handled by exec_via_temp)
		cmd = [self.udpipe_path + "udpipe", "--tag", "--parse", self.udpipe_model, "tempfilename"]
		parsed = exec_via_temp(text, cmd, workdir=self.udpipe_path)
		return parsed
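
# Minimal standalone usage sketch (assumes a matching UDPipe model, e.g. spanish*.udpipe, is present under lib/udpipe/):
#   t2c = Tok2Conll(lang="spa", model="spa.rst.sctb")
#   parsed = t2c.run_udpipe(conllu_text)  # conllu_text: sentence-split CoNLL-U string, as built in __main__ below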

if __name__ == "__main__":
	p = ArgumentParser()
	p.add_argument("-c", "--corpus", default="spa.rst.sctb", help="Corpus name to parse or 'all' for all corpora")
	p.add_argument("-d", "--data_dir", default=os.path.normpath("../data"), help="Path to shared task data folder")
	opts = p.parse_args()

	specific_corpus = opts.corpus
	data_dir = opts.data_dir

	corpora = os.listdir(data_dir)
	if specific_corpus == "all":
		corpora = [c for c in corpora if os.path.isdir(os.path.join(data_dir, c))]
	else:
		corpora = [c for c in corpora if os.path.isdir(os.path.join(data_dir, c)) and c == specific_corpus]

	for corpus in corpora:
		lang = corpus.split(".")[0]

		# Make ensemble sentencer
		e = EnsembleSentencer(model=corpus, lang=lang)

		# Special genre patterns
		if "gum" in corpus:
			e.genre_pat = "GUM_(.+)_.*"

		# Loop through all *.tok files in the corpus directory
		for f in glob(os.path.join(data_dir, corpus, corpus + "*.tok")):
			# Predict sentence boundaries (1 = token starts a new sentence)
			predicted = e.predict(f, eval_gold=False, as_text=False).tolist()

			# Read the *.tok file and separate tokens by predicted sentence breaks (re-indexing from 1 for each sentence)
			conllu = io.open(f, encoding="utf8").readlines()
			i_pred = 0
			tok_id = 1
			splitted = ''
			for l in conllu:
				if '\t' in l:
					if predicted[i_pred] == 1:  # sentence break before this token
						tok_id = 1
						new_l = '\t'.join([str(tok_id)] + [l.split('\t')[1]] + ['_'] * 7 + [l.split('\t')[9]])
						splitted += '\n' + new_l
					else:
						new_l = '\t'.join([str(tok_id)] + [l.split('\t')[1]] + ['_'] * 7 + [l.split('\t')[9]])
						splitted += new_l
					tok_id += 1
					i_pred += 1
				else:
					splitted += l
					tok_id = 1

			# Remove the blank line that gets inserted directly after comment lines
			splitted = re.sub(r'(#[^\n\t]+\n)\n', r'\1', splitted)

			# Run the UDPipe tagger & parser on the sentence-split CoNLL-U text
			t2c = Tok2Conll(lang=lang, model=corpus)
			output = t2c.run_udpipe(splitted)

			# Assert that the parsed output has the same number of tokens as the original *.tok file
			assert len([x for x in conllu if '\t' in x]) == len([x for x in output.split('\n') if '\t' in x])

			# Save parsed files to corresponding subdirectories in data_parsed/
			data_parsed_dir = script_dir + os.sep + ".." + os.sep + "data_parsed" + os.sep + corpus
			if not os.path.exists(data_parsed_dir):
				os.makedirs(data_parsed_dir)
			with io.open(data_parsed_dir + os.sep + os.path.basename(f).replace(".tok", ".conll"), 'w', encoding='utf8') as fout:
				fout.write(output)
			sys.stderr.write("o Saved parsed " + os.path.basename(f).replace(".tok", ".conll") + " to data_parsed/ directory\n")
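
# For illustration (file names are hypothetical): with data_dir ../data and corpus spa.rst.sctb, an input file
# ../data/spa.rst.sctb/spa.rst.sctb_train.tok would be written to ../data_parsed/spa.rst.sctb/spa.rst.sctb_train.conll.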