-
Notifications
You must be signed in to change notification settings - Fork 5
/
build-morfologik-lt.sh
executable file
·77 lines (59 loc) · 3.44 KB
/
build-morfologik-lt.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/bin/bash
# Builds the Catalan (ca-ES, ca-ES-valencia) tagger, synthesis and spelling
# dictionaries with the LanguageTool command-line tools and copies them into
# the LanguageTool resource directory.
#
# This first part sets up the paths and prepares the plain-text source word
# lists in /tmp that the build loop further down consumes.

# Every relative path below is resolved from morfologik-lt. Abort if the
# directory is missing so that the rm below cannot act on an unintended path.
cd morfologik-lt || {
  printf '%s\n' "error: cannot cd to morfologik-lt" >&2
  exit 1
}

# LanguageTool jar
jarfile=~/target-lt/languagetool.jar
target_dir=../resultats/java-lt/src/main/resources/org/languagetool/resource/ca

# Empty the target directory. ${target_dir:?} aborts the script instead of
# expanding to "/*" should the variable ever be empty or unset.
rm -- "${target_dir:?}"/*

# source dictionaries
# catalan
cp ../resultats/lt/diccionari.txt /tmp/ca-ES.txt
#sed -i '/ VMIP1S0S/d' /tmp/ca-ES.txt

# catalan including DNV
cat ../resultats/lt/diccionari.txt ../resultats/lt/diccionari-dnv.txt > /tmp/ca-ES-valencia.txt
#sed -i '/ VMIP1S0S/d' /tmp/ca-ES-valencia.txt
# De-duplicate byte-wise: under a natural-language locale "sort -u" may treat
# lines that merely collate equal as duplicates. LC_ALL=C is set for this one
# command only and is not exported here.
LC_ALL=C sort -u /tmp/ca-ES-valencia.txt -o /tmp/ca-ES-valencia.txt
# Build the tagger, synthesis and spelling dictionaries for each variant and
# copy the resulting artifacts into $target_dir.
# Reads:  /tmp/<variant>.txt, *.masterinfo, ca_wordlist.xml,
#         ../extra-spelling/extra-spelling.txt
# Uses:   $jarfile and $target_dir, which are set earlier in this script.
for targetdict in ca-ES ca-ES-valencia; do
  # Per-variant copies of the master .info files (removed again after the loop).
  cp tagger-spelling.masterinfo "${targetdict}.info"
  cp tagger-spelling.masterinfo "${targetdict}_spelling.info"
  cp synth.masterinfo "${targetdict}_synth.info"

  # exclude some words for LT dictionary
  sed -i -E '/ (aguar|ciar|emblar|binar) /d' "/tmp/${targetdict}.txt"
  # replace whitespaces with tabs
  perl sptotabs.pl <"/tmp/${targetdict}.txt" >"${targetdict}_tabs.txt"
  # NOTE: this is an export, so LC_ALL=C stays in effect for every command
  # that follows (including the java tools and the second loop iteration).
  # It is kept as an export to preserve the environment those tools run in.
  export LC_ALL=C
  sort -u "${targetdict}_tabs.txt" -o "${targetdict}_tabs.txt"

  # create tagger dictionary with morfologik tools
  java -cp "$jarfile" org.languagetool.tools.POSDictionaryBuilder \
    -i "${targetdict}_tabs.txt" -info "${targetdict}.info" \
    -freq ca_wordlist.xml -o "${targetdict}.dict"
  # dump the tagger dictionary
  java -cp "$jarfile" org.languagetool.tools.DictionaryExporter \
    -i "${targetdict}.dict" -info "${targetdict}.info" -o "${targetdict}_lt.txt"

  # create synthesis dictionary with morfologik tools
  java -cp "$jarfile" org.languagetool.tools.SynthDictionaryBuilder \
    -i "${targetdict}_tabs.txt" -info "${targetdict}_synth.info" \
    -o "${targetdict}_synth.dict"
  #cp /tmp/SynthDictionaryBuilder*_tags.txt ./${targetdict}_tags.txt
  #rm /tmp/SynthDictionaryBuilder*_tags.txt
  # dump synthesis dictionary
  java -cp "$jarfile" org.languagetool.tools.DictionaryExporter \
    -i "${targetdict}_synth.dict" -o "${targetdict}_synth_lt.txt" \
    -info "${targetdict}_synth.info"

  # spelling dicts (alternative)
  # Reduce each three-column line of the tabs file to its first column, in place.
  perl -i -p -e 's/^(.+)\t.+\t.+$/$1/' "${targetdict}_tabs.txt"
  cat ../extra-spelling/extra-spelling.txt "${targetdict}_tabs.txt" > "${targetdict}_spelling.txt"
  sort -u "${targetdict}_spelling.txt" -o "${targetdict}_spelling.txt"
  # Use this variant's own .info file rather than a hard-coded ca-ES one, so
  # an iteration does not depend on files left behind by a previous one.
  java -cp "$jarfile" org.languagetool.tools.SpellDictionaryBuilder \
    -i "${targetdict}_spelling.txt" -freq ca_wordlist.xml \
    -info "${targetdict}_spelling.info" -o "${targetdict}_spelling.dict"
  java -cp "$jarfile" org.languagetool.tools.DictionaryExporter \
    -i "${targetdict}_spelling.dict" -info "${targetdict}_spelling.info" \
    -o "${targetdict}_spelling_lt.txt"

  # Tags list found next to the synthesis dictionary (presumably written by
  # SynthDictionaryBuilder -- confirm); give it its final name.
  mv "${targetdict}_synth.dict_tags.txt" "${targetdict}_tags.txt"

  # Publish the artifacts. The trailing slash makes cp fail if the target is
  # not a directory instead of silently creating a regular file with its name.
  cp -- \
    "${targetdict}_spelling.dict" \
    "${targetdict}_spelling.info" \
    "${targetdict}_tags.txt" \
    "${targetdict}.dict" \
    "${targetdict}.info" \
    "${targetdict}_synth.dict" \
    "${targetdict}_synth.info" \
    "$target_dir/"

  rm -- "${targetdict}_tabs.txt"
done

# Remove the per-variant .info copies created inside the loop.
rm -- *.info
# Stop here; a bare exit reports the status of the rm above.
exit
# extra spelling dict (disabled: the commands below are commented out and are never reached because of the exit above)
#java -cp $jarfile org.languagetool.tools.SpellDictionaryBuilder -i ../extra-spelling/extra-spelling.txt -freq ca_wordlist.xml -info ca-ES.info -o ca-extra-spelling.dict
# dump the extra-spelling dict
#java -cp $jarfile org.languagetool.tools.DictionaryExporter -i ca-extra-spelling.dict -info ca-ES.info -o ca-extra-spelling_lt.txt
#cp ca-extra-spelling.dict $target_dir
#cp ca-ES.info $target_dir/ca-extra-spelling.info