forked from languagetool-org/english-pos-dict
-
Notifications
You must be signed in to change notification settings - Fork 0
/
build-dicts-from-src.sh
executable file
·70 lines (55 loc) · 2.53 KB
/
build-dicts-from-src.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/bin/bash
# First stage of the dictionary build: normalize the source word lists, run
# the syntax check and the Python dictionary builder, then normalize the
# generated output lists.

# Stop at the first failing command, unset variable or failed pipeline stage,
# so a broken step cannot feed stale or partial files into later stages.
set -euo pipefail

# Byte-wise collation for sort. Exported once here; every command started
# later by this script inherits it as well.
export LC_ALL=C

# Sort and de-duplicate the source lists in place.
for file in ./src-dict/src-pending.txt ./src-dict/src-clean.txt; do
  sort -u -o "$file" "$file"
done

# Check the sources, then generate the files under ./src-dict/output.
python3 src-dict/check-syntax.py
python3 src-dict/build-all-dicts.py

# Sort and de-duplicate the generated tagger dictionary and word lists in place.
sort -u -o ./src-dict/output/tagger-dictionary.txt ./src-dict/output/tagger-dictionary.txt
for variant in AU CA GB NZ US ZA ALL; do
  sort -u -o "./src-dict/output/en_${variant}.txt" "./src-dict/output/en_${variant}.txt"
done
# LanguageTool version; it appears twice in the path of the standalone jar.
lt_version="6.5-SNAPSHOT"
# LanguageTool tree, relative to the directory the script is run from.
lt_checkout="../languagetool"
lt_tools="${lt_checkout}/languagetool-standalone/target/LanguageTool-${lt_version}/LanguageTool-${lt_version}/languagetool.jar"
# Resource path shared by the LanguageTool English module and this project.
en_resources="src/main/resources/org/languagetool/resource/en"
# TODO: to be updated. Alternative source location, currently disabled:
#src_dict=./original-files-december-2023
src_dict="${lt_checkout}/languagetool-language-modules/en/${en_resources}"
target_dir="${en_resources}"
# Start from an empty scratch directory; stop if it cannot be created.
rm -rf tmp
mkdir tmp || exit 1
cp ./src-dict/output/tagger-dictionary.txt ./tmp/
cp ./info-files/english.info "${target_dir}/english.info"
# Create the tagger binary.
java -cp "$lt_tools" org.languagetool.tools.POSDictionaryBuilder -i ./tmp/tagger-dictionary.txt -info ./info-files/english.info -o "${target_dir}/english.dict"
# Create the synthesizer binary.
cp "${src_dict}/filter-archaic.txt" ./tmp/
cp "${src_dict}/do-not-synthesize.txt" ./tmp/
cp ./info-files/english_synth.info ./tmp/
cp ./info-files/english_synth.info "./${target_dir}/english_synth.info"
# The next steps edit files in place and use paths relative to ./tmp, so they
# must not run if the directory change fails.
cd ./tmp || exit 1
# Strip "#" comments, then drop the lines that are left empty.
sed -i -e 's/#.*$//' -e '/^$/d' do-not-synthesize.txt
# filter-archaic.txt has to be in the folder of english_synth.info
cat do-not-synthesize.txt >> filter-archaic.txt
# lt_tools and target_dir are relative paths, hence the "../" prefix from ./tmp.
java -cp "../${lt_tools}" org.languagetool.tools.SynthDictionaryBuilder -i tagger-dictionary.txt -info english_synth.info -o "../${target_dir}/english_synth.dict"
# Return to the previous directory; the remaining steps use paths relative to it.
cd - || exit 1
mv "${target_dir}/english_synth.dict_tags.txt" "${target_dir}/english_tags.txt"
# Create one spelling binary per English variant.
for variant in AU CA GB NZ US ZA; do
  # GB and US have their own frequency list; every other variant uses the
  # generic one.
  case "$variant" in
    GB) freqlist=./spell-data/freq/en_gb_wordlist.xml ;;
    US) freqlist=./spell-data/freq/en_us_wordlist.xml ;;
    *) freqlist=./spell-data/freq/en_wordlist.xml ;;
  esac
  printf '%s %s\n' "$variant" "$freqlist"

  spell_info="${target_dir}/hunspell/en_${variant}.info"
  merged_words="./tmp/en_${variant}.txt"

  cp "./info-files/en_${variant}.info" "$spell_info"
  # Concatenate the variant list with en_ALL.txt, then sort and de-duplicate
  # the merged list in place.
  cat "./src-dict/output/en_${variant}.txt" ./src-dict/output/en_ALL.txt > "$merged_words"
  LC_ALL=C sort -u -o "$merged_words" "$merged_words"
  java -cp "$lt_tools" org.languagetool.tools.SpellDictionaryBuilder -i "$merged_words" -freq "$freqlist" -info "$spell_info" -o "${target_dir}/hunspell/en_${variant}.dict"
done

# Remove the scratch directory, then run the Maven install.
rm -rf tmp
mvn install