-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtranslations.py
64 lines (47 loc) · 2.5 KB
/
translations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import os
import pickle

import babelnet as bn
import nltk
from babelnet.language import Language
from babelnet.resources import BabelSynsetID
from babelnet.resources import WordNetSynsetID
from nltk.corpus import wordnet
class GetBabelNetTranslations:
    """Collect BabelNet lemma translations for every WordNet synset.

    The result maps ``language name -> {WordNet synset name -> [lemma, ...]}``
    and is pickled to ``save_file``: periodically while running and once more
    at the end.  A usable checkpoint left in ``save_file`` by an earlier run
    is picked up again, so an interrupted run continues where it stopped;
    delete the file to force a run from scratch.
    """

    # bn can only translate synsets to 3 languages at a time
    LANGS_PER_QUERY = 3
    # number of processed synsets between two checkpoint writes
    CHECKPOINT_EVERY = 5000

    def __init__(self, save_file='generated_files/all_wn_translations.pkl',
                 languages_file='select_languages.pkl'):
        """Remember where to read the language selection and write results.

        Args:
            save_file: path of the pickle holding the collected translations.
            languages_file: path of a pickled dict whose keys are language
                names and whose values are ISO codes accepted by
                ``Language.from_iso``.
        """
        self.save_file = save_file
        self.languages_file = languages_file

    def get_all_wn_offsets(self):
        """gets and formats the offsets for all WordNet synsets

        Returns:
            A list of strings, each the synset offset zero-padded to eight
            digits followed by its part-of-speech tag.  The satellite
            adjective tag ``'s'`` is written as ``'a'`` (presumably because
            the ``wn:`` IDs used for the BabelNet lookup only know ``'a'``
            for adjectives - TODO confirm).
        """
        return [
            f"{synset.offset():08}{'a' if synset.pos() == 's' else synset.pos()}"
            for synset in wordnet.all_synsets()
        ]

    def _load_checkpoint(self, language_names):
        """Return saved translations for exactly these languages, else empty tables."""
        fresh = {name: {} for name in language_names}
        if not os.path.isfile(self.save_file):
            return fresh
        # NOTE(review): pickle executes code on load; only point save_file at
        # files written by this script.
        try:
            with open(self.save_file, 'rb') as f:
                saved = pickle.load(f)
        except (EOFError, pickle.UnpicklingError):
            # unreadable or truncated checkpoint: start over
            return fresh
        # a checkpoint made for a different language selection cannot be
        # continued, because the tables would be missing or surplus
        if not isinstance(saved, dict) or set(saved) != set(language_names):
            return fresh
        return saved

    def _save_checkpoint(self, synsets_translation):
        """Pickle the translations to ``save_file`` without risking the old copy."""
        directory = os.path.dirname(self.save_file)
        if directory:
            os.makedirs(directory, exist_ok=True)
        # write next to the target and swap, so an interruption while dumping
        # never leaves a half-written checkpoint behind
        temp_file = f'{self.save_file}.tmp'
        with open(temp_file, 'wb') as f:
            pickle.dump(synsets_translation, f)
        os.replace(temp_file, self.save_file)

    def get_translations(self):
        """gets the translations of all WordNet synsets from BabelNet

        Reads the language selection from ``languages_file``, queries
        BabelNet for every WordNet synset and stores the lemmas per language
        in ``save_file``.

        Raises:
            ValueError: if the language selection is empty.
        """
        # NOTE(review): pickle executes code on load; the language file must
        # come from a trusted source.
        with open(self.languages_file, 'rb') as f:
            select_languages = pickle.load(f)
        language_names = list(select_languages.keys())
        if not language_names:
            raise ValueError(f'no languages selected in {self.languages_file!r}')
        to_languages = [Language.from_iso(iso_code) for iso_code in select_languages.values()]

        offsets = self.get_all_wn_offsets()
        synsets_translation = self._load_checkpoint(language_names)
        # every processed synset adds one entry per language, so the smallest
        # table size is the number of offsets that are completely done
        already_done = min(len(synsets_translation[name]) for name in language_names)

        for offset_count, offset in enumerate(offsets[already_done:]):
            wn_synset = wordnet.synset_from_pos_and_offset(offset[-1], int(offset[:8]))
            synset_name = wn_synset.name()
            # NOTE(review): assumes BabelNet knows every WordNet ID; confirm
            # what get_synset returns for an unknown ID.
            bn_id = str(bn.get_synset(WordNetSynsetID(f'wn:{offset}')).id)

            for start in range(0, len(language_names), self.LANGS_PER_QUERY):
                stop = start + self.LANGS_PER_QUERY
                batch_names = language_names[start:stop]
                batch_langs = to_languages[start:stop]
                bn_synset = bn.get_synset(BabelSynsetID(bn_id), to_langs=batch_langs)
                for name, language in zip(batch_names, batch_langs):
                    synsets_translation[name][synset_name] = [
                        str(lemma) for lemma in bn_synset.lemmas(language)
                    ]

            if offset_count % self.CHECKPOINT_EVERY == 0:
                self._save_checkpoint(synsets_translation)
                print(f'{len(synsets_translation[language_names[0]])}/{len(offsets)}')

        # save once more at the very end
        self._save_checkpoint(synsets_translation)
def main():
    """Script entry point: fetch and store the BabelNet translations.

    Builds a ``GetBabelNetTranslations`` with its default settings and runs
    its ``get_translations`` method.
    """
    GetBabelNetTranslations().get_translations()


# only run the export when executed as a script, not when imported
if __name__ == '__main__':
    main()