-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
75 lines (56 loc) · 1.65 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# -*- coding: utf-8 -*-
import os
import re
import fasttext
import langdetect
import langid
# Absolute path of the process working directory, captured once at import.
CURRENT_PATH = os.path.abspath(os.getcwd())

# The fastText model file is looked up at <working dir>/tmp/lid.176.ftz.
FASTTEXT_MODEL_PATH = os.path.join(CURRENT_PATH, 'tmp', 'lid.176.ftz')

# Translation table from two-letter ISO 639-1 codes to three-letter
# ISO 639-3 codes; only the languages listed here are recognised.
LANGUAGE_MAPPING = {
    'cs': 'ces',
    'da': 'dan',
    'nl': 'nld',
    'en': 'eng',
    'fi': 'fin',
    'fr': 'fra',
    'de': 'deu',
    'hu': 'hun',
    'it': 'ita',
    'pl': 'pol',
    'pt': 'por',
    'ro': 'ron',
    'es': 'spa',
    'sv': 'swe',
}

# Placeholder reported for any language code missing from LANGUAGE_MAPPING.
UNKNOWN_LANGUAGE = '(unk)'
class LanguageIdentificationHelper(object):
    """Identify the language of text with one of several backend libraries.

    The backend is selected by name: ``'langid'``, ``'langdetect'`` or
    ``'fasttext'``.  Whatever code the backend reports is translated through
    ``LANGUAGE_MAPPING``; codes absent from that table are reported as
    ``UNKNOWN_LANGUAGE``.
    """

    # Backend names that ``predict`` knows how to drive.
    _SUPPORTED_LIBRARIES = ('langid', 'langdetect', 'fasttext')

    # fastText's default label prefix, e.g. ``__label__en``.
    _FASTTEXT_LABEL_PREFIX = '__label__'

    def __init__(self, library):
        """Remember the backend name and, for fastText, load its model.

        Args:
            library: Name of the backend to use.  The name is not validated
                here; an unsupported name makes ``predict`` raise
                ``ValueError``.
        """
        self.library = library
        if self.library == 'fasttext':
            self.load_model()

    def load_model(self):
        """Load the fastText model found at ``FASTTEXT_MODEL_PATH``."""
        self.ft_model = fasttext.load_model(FASTTEXT_MODEL_PATH)

    @classmethod
    def _label_to_code(cls, label):
        """Return ``label`` without the fastText label prefix."""
        prefix = cls._FASTTEXT_LABEL_PREFIX
        if label.startswith(prefix):
            return label[len(prefix):]
        return label

    def predict(self, text):
        """Return the mapped language code(s) for ``text``.

        Args:
            text: One text, or a list/tuple of texts that are classified one
                by one.  Any value that is not a list or tuple is treated as
                a single text and handed to the backend unchanged.

        Returns:
            A single code when exactly one text was classified (this includes
            a one-element list), otherwise a list of codes in input order.
            Each code is a value of ``LANGUAGE_MAPPING`` or
            ``UNKNOWN_LANGUAGE``.

        Raises:
            ValueError: If ``self.library`` is not a supported backend.
        """
        if self.library not in self._SUPPORTED_LIBRARIES:
            # Fail with a clear message instead of crashing further down
            # while iterating over a missing prediction.
            raise ValueError(
                'Unsupported language identification library: {!r}'.format(
                    self.library))

        if isinstance(text, (list, tuple)):
            texts = list(text)
        else:
            texts = [text]

        if self.library == 'langid':
            # classify() returns a (language, score) pair; keep the language.
            codes = [langid.classify(item)[0] for item in texts]
        elif self.library == 'langdetect':
            codes = [langdetect.detect(item) for item in texts]
        else:
            # With a list input the first element of the result holds one
            # label list per text; k=1 keeps only the best label.
            label_lists = self.ft_model.predict(texts, k=1)[0]
            # Defensive: a text without any label is reported as unknown
            # rather than raising IndexError.
            codes = [
                self._label_to_code(labels[0]) if labels else None
                for labels in label_lists
            ]

        mapped = [LANGUAGE_MAPPING.get(code, UNKNOWN_LANGUAGE)
                  for code in codes]
        # A lone result is unwrapped so single-text calls get a plain string.
        return mapped[0] if len(mapped) == 1 else mapped