diff --git a/docs/source/modules/datasets.rst b/docs/source/modules/datasets.rst index 9206ee6123..6f6f2a530d 100644 --- a/docs/source/modules/datasets.rst +++ b/docs/source/modules/datasets.rst @@ -157,6 +157,12 @@ of vocabs. * - hebrew - 123 - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿אבגדהוזחטיכלמנסעפצקרשת₪ + * - hindi + - 70 + - अआइईउऊऋॠऌॡएऐओऔअंअःकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह०१२३४५६७८९।,?!:्ॐ॰॥ + * - bangla + - 70 + - অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহ়ঽািীুূৃেৈোৌ্ৎংঃঁ০১২৩৪৫৬৭৮৯ * - multilingual - 195 - english & french & german & italian & spanish & portuguese & czech & polish & dutch & norwegian & danish & finnish & swedish & § diff --git a/doctr/datasets/vocabs.py b/doctr/datasets/vocabs.py index ddc32d8665..f682560d1d 100644 --- a/doctr/datasets/vocabs.py +++ b/doctr/datasets/vocabs.py @@ -17,9 +17,14 @@ "ancient_greek": "αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ", "arabic_letters": "ءآأؤإئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىي", "persian_letters": "پچڢڤگ", - "hindi_digits": "٠١٢٣٤٥٦٧٨٩", + "arabic_digits": "٠١٢٣٤٥٦٧٨٩", "arabic_diacritics": "ًٌٍَُِّْ", "arabic_punctuation": "؟؛«»—", + "hindi_letters": "अआइईउऊऋॠऌॡएऐओऔअंअःकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह", + "hindi_digits": "०१२३४५६७८९", + "hindi_punctuation": "।,?!:्ॐ॰॥", + "bangla_letters": "অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহ়ঽািীুূৃেৈোৌ্ৎংঃঁ", + "bangla_digits": "০১২৩৪৫৬৭৮৯", } VOCABS["latin"] = VOCABS["digits"] + VOCABS["ascii_letters"] + VOCABS["punctuation"] @@ -32,7 +37,7 @@ VOCABS["german"] = VOCABS["english"] + "äöüßÄÖÜẞ" VOCABS["arabic"] = ( VOCABS["digits"] - + VOCABS["hindi_digits"] + + VOCABS["arabic_digits"] + VOCABS["arabic_letters"] + VOCABS["persian_letters"] + VOCABS["arabic_diacritics"] @@ -52,6 +57,8 @@ + "ÁÀẢẠÃĂẮẰẲẴẶÂẤẦẨẪẬÉÈẺẼẸÊẾỀỂỄỆÓÒỎÕỌÔỐỒỔỘỖƠỚỜỞỢỠÚÙỦŨỤƯỨỪỬỮỰIÍÌỈĨỊÝỲỶỸỴ" ) VOCABS["hebrew"] = VOCABS["english"] + "אבגדהוזחטיכלמנסעפצקרשת" + "₪" +VOCABS["hindi"] = VOCABS["hindi_letters"] + 
VOCABS["hindi_digits"] + VOCABS["hindi_punctuation"] +VOCABS["bangla"] = VOCABS["bangla_letters"] + VOCABS["bangla_digits"] VOCABS["multilingual"] = "".join( dict.fromkeys( VOCABS["french"]