-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathxml2conll.sh
59 lines (57 loc) · 1.42 KB
/
xml2conll.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/bin/bash
# pdftohtml.xml to txt for MLG Anselm
# (this is domain and language specific, hence not in cmd)
cat $* | \
iconv -f utf-8 -t utf-8 | \
grep '<text[^>]*font="1"' | \
perl -pe 's/([^ ])(<\/text>)/$1-$2/;' | \
sed -e s/'<[^>]*>'//g \
-e s/'<'//g \
-e s/'>'//g \
-e s/'[()*]'//g \
-e s/'\[\([^0-9]*\)\]'/'\1'/g \
-e s/'\[.*\]'//g | \
perl -pe 's/\s+/ /gs;' |\
sed -e s/'[=-] '//g \
-e s/' *'/'\n'/g | \
# -e s/' *\([\/\\.;?!:]\) *'/' \1 '/g \
# add first column with normalized orthography
perl -e '
while(<>) {
s/\n/\t/g;
print $_; # original string, incl. punctuation, diacritics, capital case, etc.
s/(.)/\l$1/g; # lowercase, ascii, remove duplicate graphemes, remove punctuation
s/ā/a/g; # (not language-specific)
s/ö/o/g;
s/ü/u/g;
s/ü/u/g;
s/[ëė]/e/g;
s/ſ/s/g;
s/ʒ/z/g;
s/[ýẏÿẏẏÿẏÿẏẏẏÿȳ]/y/g;
s/[^\ta-z]+//g;
while(m/.*(.)\1.*/) {
s/(.)\1/\1/g; # remove duplicates
}
print; # out: "regular" normalized MLG
# simplified orthography (for matching only, language-specific)
s/cz/tz/g;
s/oe/o/g;
s/ae/a/g;
s/ao/a/g;
s/ue/u/g;
s/[jy]/i/g;
s/c?k/c/g;
s/uw/u/g;
s/[fwv]/u/g;
s/sch/sc/g;
s/th/t/g;
s/[cg]h/g/g;
s/^ih/i/g;
s/z/s/g;
s/dt/d/g;
s/d\t/t\t/g; # auslautverhärtung, often written dt => d, e.g., godt
print;
print "\n";
}' | \
sed -e s/'^\([^\t]*\)\t\([^\t]*\)\t\([^\t]*\)\t$'/'\3\t\1\t\2'/g