-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutil.py
88 lines (84 loc) · 2.36 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
def get_phones(epi, text):
    """
    Returns the IPA transliteration of `text`.

    epi:  object exposing a `transliterate(text)` method (an Epitran
          instance in normal use).
    text: orthographic text to convert.

    * The result is whatever `epi.transliterate` returns; per Epitran's
      documentation this is a single IPA string, not a list. Use
      `trans_list` on the same object when a list of IPA segments is
      required.
    """
    return epi.transliterate(text)
def get_phone_segments(epi, text):
    """
    Segment `text` into IPA units via `epi.trans_list` and return the
    result unchanged.
    * A single IPA segment can be made up of several unicode characters.
    """
    return epi.trans_list(text)
def get_lookup():
    """
    Returns a fresh dict mapping language names to epitran language codes
    (language code plus script, e.g. "eng-Latn").

    Covers most of the languages in epitran, per their README.
    Note that we are mapping both "Persian" and "Farsi" to the same code,
    "fas-Arab".
    """
    return {
        "Afar": "aar-Latn",
        "Amharic": "amh-Ethi",
        "Bengali": "ben-Beng",
        "Catalan": "cat-Latn",
        "Cebuano": "ceb-Latn",
        "Mandarin-Simplified": "cmn-Hans",
        "Mandarin-Traditional": "cmn-Hant",
        "Sorani": "ckb-Arab",
        "German": "deu-Latn",
        "English": "eng-Latn",
        "Farsi": "fas-Arab",
        "Persian": "fas-Arab",
        "French": "fra-Latn",
        "Hausa": "hau-Latn",
        "Hindi": "hin-Deva",
        "Hungarian": "hun-Latn",
        "Ilocano": "ilo-Latn",
        "Indonesian": "ind-Latn",
        "Italian": "ita-Latn",
        "Javanese": "jav-Latn",
        "Kazakh-Cyrillic": "kaz-Cyrl",
        "Kazakh-Latin": "kaz-Latn",
        "Kinyarwanda": "kin-Latn",
        "Kyrgyz-Perso-Arabic": "kir-Arab",
        "Kyrgyz-Cyrillic": "kir-Cyrl",
        "Kyrgyz-Latin": "kir-Latn",
        "Kurmanji": "kmr-Latn",
        "Lao": "lao-Laoo",
        "Marathi": "mar-Deva",
        "Burmese": "mya-Mymr",
        "Malay": "msa-Latn",
        "Dutch": "nld-Latn",
        "Chichewa": "nya-Latn",
        "Oromo": "orm-Latn",
        "Punjabi": "pan-Guru",
        "Polish": "pol-Latn",
        "Portuguese": "por-Latn",
        "Russian": "rus-Cyrl",
        "Shona": "sna-Latn",
        "Somali": "som-Latn",
        "Spanish": "spa-Latn",
        "Swahili": "swa-Latn",
        "Swedish": "swe-Latn",
        "Tamil": "tam-Taml",
        "Telugu": "tel-Telu",
        "Tajik": "tgk-Cyrl",
        "Tagalog": "tgl-Latn",
        "Thai": "tha-Thai",
        "Tigrinya": "tir-Ethi",
        "Ukrainian": "ukr-Cyrl",
        # The next two names previously ended in a stray ")", which made
        # lookups by the intended name fail.
        "Uyghur-Perso-Arabic": "uig-Arab",
        "Uzbek-Cyrillic": "uzb-Cyrl",
        "Uzbek-Latin": "uzb-Latn",
        "Vietnamese": "vie-Latn",
        "Xhosa": "xho-Latn",
        "Yoruba": "yor-Latn",
        "Zulu": "zul-Latn",
    }
def lang2ISO(lang):
    """
    Convert a given language name to its code for lookup in epitran.

    lang: language name as listed in get_lookup(); matching is
          case-insensitive and ignores surrounding whitespace.

    Returns the code string stored for that name (e.g. "eng-Latn").
    Raises KeyError, with a message naming the offending input, when the
    language is not in the table.
    """
    # Lower-case the table keys so the comparison is case-insensitive.
    codes_by_name = {name.lower(): code for name, code in get_lookup().items()}
    try:
        return codes_by_name[lang.strip().lower()]
    except KeyError:
        # Same exception type as before, but with an actionable message.
        raise KeyError(
            f"Unknown language {lang!r}; see get_lookup() for supported names"
        ) from None