diff --git a/arabic_reshaper/__init__.py b/arabic_reshaper/__init__.py
index 89ea012..f06d58b 100644
--- a/arabic_reshaper/__init__.py
+++ b/arabic_reshaper/__init__.py
@@ -1,6 +1,4 @@
-import os
-
-from .arabic_reshaper import reshape, default_reshaper, ArabicReshaper
+from .arabic_reshaper import reshape, unreshape, default_reshaper, ArabicReshaper
 from .reshaper_config import (config_for_true_type_font,
                               ENABLE_NO_LIGATURES,
                               ENABLE_SENTENCES_LIGATURES,
@@ -9,4 +7,4 @@
                               ENABLE_ALL_LIGATURES)
 
 
-__version__ = '2.1.3'
+__version__ = '2.2.0'
diff --git a/arabic_reshaper/__version__.py b/arabic_reshaper/__version__.py
index f811561..04188a1 100644
--- a/arabic_reshaper/__version__.py
+++ b/arabic_reshaper/__version__.py
@@ -1 +1 @@
-__version__ = '2.1.2'
+__version__ = '2.2.0'
diff --git a/arabic_reshaper/arabic_reshaper.py b/arabic_reshaper/arabic_reshaper.py
index 4721a6a..ff9632c 100644
--- a/arabic_reshaper/arabic_reshaper.py
+++ b/arabic_reshaper/arabic_reshaper.py
@@ -15,7 +15,7 @@
 
 from .ligatures import LIGATURES
 from .reshaper_config import auto_config
-from .letters import (UNSHAPED, ISOLATED, TATWEEL, ZWJ, LETTERS_ARABIC,
+from .letters import (UNSHAPED, ISOLATED, TATWEEL, ZWJ, LETTERS_ARABIC, SPECIAL_LETTERS,
                       LETTERS_ARABIC_V2, LETTERS_KURDISH, FINAL, INITIAL,
                       MEDIAL, connects_with_letters_before_and_after,
                       connects_with_letter_before, connects_with_letter_after)
@@ -238,6 +238,126 @@ def reshape(self, text):
 
         return ''.join(result)
 
+    def _reversed_letters(self) -> dict:
+        """
+        Build the table that maps each reshaped letter form to its original letter.
+
+        Declared letters map an original letter to its reshaped forms, for example:
+        '\u0626': ('\uFE89', '\uFE8B', '\uFE8C', '\uFE8A')
+        This method reverses such an entry into:
+        {
+            '\uFE89': '\u0626',
+            '\uFE8B': '\u0626',
+            '\uFE8C': '\u0626',
+            '\uFE8A': '\u0626',
+        }
+        so that '\uFE89' found in a text can be changed back to '\u0626'.
+
+        :return: dict mapping a reshaped form to its original letter
+        """
+        reversed_letters = {}
+
+        # original_form example: '\u0626'
+        # reshaped_forms example: ('\uFE89', '\uFE8B', '\uFE8C', '\uFE8A')
+        for original_form, reshaped_forms in self.letters.items():
+            for char in reshaped_forms:
+                # Skip empty placeholders (not every form is declared)
+                if char:
+                    reversed_letters[char] = original_form
+
+        return reversed_letters
+
+    def _reversed_ligatures(self) -> dict:
+        """
+        Build the table that maps each reshaped ligature to its original text.
+
+        Declared ligatures pair a title with the original text and its reshaped forms,
+        for example:
+        ('ARABIC LIGATURE SAD WITH HAH', (
+            '\u0635\u062D', ('\uFC20', '\uFCB1', '', ''),
+        )),
+        This method reverses such an entry into:
+        {
+            '\uFC20': '\u0635\u062D',
+            '\uFCB1': '\u0635\u062D',
+        }
+        so that '\uFC20' found in a text can be changed back to '\u0635\u062D'.
+        The title (first element of a declared ligature) is ignored.
+
+        :return: dict mapping a reshaped ligature to its original text
+        """
+        original_text = 0
+        reshaped_text = 1
+
+        reversed_ligatures = {}
+
+        # title example: 'ARABIC LIGATURE SAD WITH HAH'
+        # ligature example: ('\u0635\u062D', ('\uFC20', '\uFCB1', '', ''))
+        for _title, ligature in LIGATURES:
+            for char in ligature[reshaped_text]:
+                # Skip empty placeholders (not every form is declared)
+                if char:
+                    reversed_ligatures[char] = ligature[original_text]
+
+        return reversed_ligatures
+
+    def unreshape(self, text: str) -> str:
+        """
+        Convert a reshaped text back to its original (unshaped) text.
+
+        Each char of the text is handled by the first rule that applies:
+        1. The char is an original letter: it was not changed by reshaping, so it is
+           kept as is.
+        2. The char is in SPECIAL_LETTERS (lam-alef forms generated by reshaping): it
+           is replaced with its two original letters. If it is followed by two
+           harakat, the first one is placed between the two letters.
+        3. The char is a reshaped ligature: it is replaced with its original text.
+        4. Otherwise the original letter is looked up in the reversed letters, and
+           the char is kept as is when it is not found there.
+
+        :param text: reshaped text
+        :return: text with every reshaped form replaced by its original letters
+        """
+        text_list = list(text)
+        result = []
+        reversed_letters = self._reversed_letters()
+        reversed_ligatures = self._reversed_ligatures()
+
+        for index, char in enumerate(text_list):
+            # Char is an original letter that reshaping did not change
+            if char in self.letters:
+                result.append(char)
+                continue
+
+            # Char is a special letter that stands for two original letters
+            if char in SPECIAL_LETTERS:
+                first_letter, second_letter = SPECIAL_LETTERS[char]
+                # Slicing never raises, so a special letter at (or right before) the
+                # end of the text simply has fewer than two following chars
+                following = text_list[index + 1:index + 3]
+
+                # When two harakat follow, the first one belongs between the two letters
+                if len(following) == 2 and all(HARAKAT_RE.match(c) for c in following):
+                    result.append(first_letter + following[0] + second_letter)
+
+                    # Omit the first harakat, it is already in the result
+                    text_list[index + 1] = ''
+                else:
+                    result.append(first_letter + second_letter)
+
+                continue
+
+            # Char is a reshaped ligature
+            if char in reversed_ligatures:
+                result.append(reversed_ligatures[char])
+                continue
+
+            # Char is a reshaped letter, or something else that is kept as is
+            result.append(reversed_letters.get(char, char))
+
+        return "".join(result)
+
 
 default_reshaper = ArabicReshaper()
 reshape = default_reshaper.reshape
+unreshape = default_reshaper.unreshape
diff --git a/arabic_reshaper/letters.py b/arabic_reshaper/letters.py
index e0ebd71..c570596 100644
--- a/arabic_reshaper/letters.py
+++ b/arabic_reshaper/letters.py
@@ -508,6 +508,18 @@
     ZWJ: (ZWJ, ZWJ, ZWJ, ZWJ),
 }
 
+# Special letters are the lam-alef forms (like lam with alef with hamza) that the
+# reverse reshaping turns back into their two regular letters
+SPECIAL_LETTERS = {
+    # lam alef
+    '\ufefc': ('\u0644', '\u0627'),
+    # lam with alef with hamza
+    '\ufef7': ('\u0644', '\u0623'),
+    # lam with alef with mad
+    '\ufef5': ('\u0644', '\u0622'),
+}
+
+
 def connects_with_letter_before(letter,LETTERS):
     if letter not in LETTERS:
         return False
diff --git a/arabic_reshaper/tests/test_004_unreshaping.py b/arabic_reshaper/tests/test_004_unreshaping.py
new file mode 100644
index 0000000..f84b462
--- /dev/null
+++ b/arabic_reshaper/tests/test_004_unreshaping.py
@@ -0,0 +1,171 @@
+from __future__ import unicode_literals
+from __future__ import print_function
+
+import unittest
+import sys
+import arabic_reshaper
+
+
+def _unreshaping_test(test):
+    for i, case in enumerate(test.cases):
+        def t(): test.assertEqual(case[1], test.reshaper.unreshape(case[0]))
+        if hasattr(test, 'subTest'):
+            with test.subTest(i=i, case=case[0]):
+                t()
+        else:
+            print('running test case %d' % i, file=sys.stderr)
+            t()
+
+
+class TestDefaultUnreshaping(unittest.TestCase):
+    def setUp(self):
+        self.reshaper = arabic_reshaper.default_reshaper
+        self.cases = (
+            # Reshaped text, Unreshaped text
+            ('ﺍﻟﺴﻼﻡ ﻋﻠﻴﻜﻢ', 'السلام عليكم'),
+            ('ﺍﻟﻠﻐﺔ ﺍﻟﻌﺮﺑﻴﺔ ﻫﻲ ﺃﻛﺜﺮ ﺍﻟﻠﻐﺎﺕ', 'اللغة العربية هي أكثر اللغات'),
+            ('ﺗﺤﺪﺛﺎ ﻭﻧﻄﻘﺎ ﺿﻤﻦ ﻣﺠﻤﻮﻋﺔ', 'تحدثا ونطقا ضمن مجموعة'),
+            ('ﺍﻟﻠﻐﺎﺕ ﺍﻟﺴﺎﻣﻴﺔ', 'اللغات السامية'),
+            ('ﺍﻟﻌﺮﺑﻴﺔ ﻟﻐﺔ ﺭﺳﻤﻴﺔ ﻓﻲ', 'العربية لغة رسمية في'),
+            ('ﻛﻞ ﺩﻭﻝ ﺍﻟﻮﻃﻦ ﺍﻟﻌﺮﺑﻲ', 'كل دول الوطن العربي'),
+            ('ﺇﺿﺎﻓﺔ ﺇﻟﻰ ﻛﻮﻧﻬﺎ ﻟﻐﺔ', 'إضافة إلى كونها لغة'),
+            ('ﺭﺳﻤﻴﺔ ﻓﻲ ﺗﺸﺎﺩ ﻭﺇﺭﻳﺘﺮﻳﺎ', 'رسمية في تشاد وإريتريا'),
+            ('ﻭﺇﺳﺮﺍﺋﻴﻞ. ﻭﻫﻲ ﺇﺣﺪﻯ ﺍﻟﻠﻐﺎﺕ', 'وإسرائيل. وهي إحدى اللغات'),
+            ('ﺍﻟﺮﺳﻤﻴﺔ ﺍﻟﺴﺖ ﻓﻲ ﻣﻨﻈﻤﺔ', 'الرسمية الست في منظمة'),
+            ('ﺍﻷﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ، ﻭﻳﺤﺘﻔﻞ', 'الأمم المتحدة، ويحتفل'),
+            ('ﺑﺎﻟﻴﻮﻡ ﺍﻟﻌﺎﻟﻤﻲ ﻟﻠﻐﺔ ﺍﻟﻌﺮﺑﻴﺔ', 'باليوم العالمي للغة العربية'),
+            ('ﻓﻲ 18 ﺩﻳﺴﻤﺒﺮ ﻛﺬﻛﺮﻯ ﺍﻋﺘﻤﺎﺩ', 'في 18 ديسمبر كذكرى اعتماد'),
+            ('ﺍﻟﻌﺮﺑﻴﺔ ﺑﻴﻦ ﻟﻐﺎﺕ ﺍﻟﻌﻤﻞ ﻓﻲ', 'العربية بين لغات العمل في'),
+            ('ﺍﻷﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ.', 'الأمم المتحدة.'),
+            ('ﺍﻵﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ.', 'الآمم المتحدة.'),
+            ('ﺍﺳﻢ ﻣﻦ amin ﺍﺳﺖ', 'اسم من amin است'),
+            # Lam-alef as the very last char of the text (no chars follow it)
+            ('\uFE91\uFEFC', '\u0628\u0644\u0627'),
+        )
+
+    def test_unreshaping(self):
+        _unreshaping_test(self)
+
+
+class TestUnreshapingWithHarakat(unittest.TestCase):
+    def setUp(self):
+        self.reshaper = arabic_reshaper.ArabicReshaper({
+            'delete_harakat': False
+        })
+        self.cases = (
+            # Reshaped text, Unreshaped text
+            ('ﺍﻟﺴَﻼَْﻡٌ ﻋَﻠَﻴْﻜُﻢْ', 'السَلَاْمٌ عَلَيْكُمْ'),
+            ('ﺍﻟﻠﻐﺔ ﺍﻟﻌﺮﺑﻴﺔ ﻫﻲ ﺃﻛﺜﺮ ﺍﻟﻠﻐﺎﺕ', 'اللغة العربية هي أكثر اللغات'),
+            ('ﺗﺤﺪﺛﺎً ﻭﻧﻄﻘﺎً ﺿﻤﻦ ﻣﺠﻤﻮﻋﺔ', 'تحدثاً ونطقاً ضمن مجموعة'),
+            ('ﺍﻟﻠﻐﺎﺕ ﺍﻟﺴﺎﻣﻴﺔ', 'اللغات السامية'),
+            ('ﺍﻟﻌﺮﺑﻴﺔ ﻟﻐﺔ ﺭﺳﻤﻴﺔ ﻓﻲ', 'العربية لغة رسمية في'),
+            ('ﻛﻞ ﺩﻭﻝ ﺍﻟﻮﻃﻦ ﺍﻟﻌﺮﺑﻲ', 'كل دول الوطن العربي'),
+            ('ﺇﺿﺎﻓﺔ ﺇﻟﻰ ﻛﻮﻧﻬﺎ ﻟﻐﺔ', 'إضافة إلى كونها لغة'),
+            ('ﺭﺳﻤﻴﺔ ﻓﻲ ﺗﺸﺎﺩ ﻭﺇﺭﻳﺘﺮﻳﺎ', 'رسمية في تشاد وإريتريا'),
+            ('ﻭﺇﺳﺮﺍﺋﻴﻞ. ﻭﻫﻲ ﺇﺣﺪﻯ ﺍﻟﻠﻐﺎﺕ', 'وإسرائيل. وهي إحدى اللغات'),
+            ('ﺍﻟﺮﺳﻤﻴﺔ ﺍﻟﺴﺖ ﻓﻲ ﻣﻨﻈﻤﺔ', 'الرسمية الست في منظمة'),
+            ('ﺍﻷﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ، ﻭﻳُﺤﺘﻔﻞ', 'الأمم المتحدة، ويُحتفل'),
+            ('ﺑﺎﻟﻴﻮﻡ ﺍﻟﻌﺎﻟﻤﻲ ﻟﻠﻐﺔ ﺍﻟﻌﺮﺑﻴﺔ', 'باليوم العالمي للغة العربية'),
+            ('ﻓﻲ 18 ﺩﻳﺴﻤﺒﺮ ﻛﺬﻛﺮﻯ ﺍﻋﺘﻤﺎﺩ', 'في 18 ديسمبر كذكرى اعتماد'),
+            ('ﺍﻟﻌﺮﺑﻴﺔ ﺑﻴﻦ ﻟﻐﺎﺕ ﺍﻟﻌﻤﻞ ﻓﻲ', 'العربية بين لغات العمل في'),
+            ('ﺍﻷﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ.', 'الأمم المتحدة.'),
+        )
+
+    def test_unreshaping(self):
+        _unreshaping_test(self)
+
+
+class TestUnreshapingWithHarakatWithoutLigatures(unittest.TestCase):
+    def setUp(self):
+        self.reshaper = arabic_reshaper.ArabicReshaper({
+            'delete_harakat': False,
+            'support_ligatures': False
+        })
+        self.cases = (
+            # Reshaped text, Unreshaped text
+            ('ﺍﻟﺴَﻠَﺎْﻡٌ ﻋَﻠَﻴْﻜُﻢْ', 'السَلَاْمٌ عَلَيْكُمْ'),
+            ('ﺍﻟﻠﻐﺔ ﺍﻟﻌﺮﺑﻴﺔ ﻫﻲ ﺃﻛﺜﺮ ﺍﻟﻠﻐﺎﺕ', 'اللغة العربية هي أكثر اللغات'),
+            ('ﺗﺤﺪﺛﺎً ﻭﻧﻄﻘﺎً ﺿﻤﻦ ﻣﺠﻤﻮﻋﺔ', 'تحدثاً ونطقاً ضمن مجموعة'),
+            ('ﺍﻟﻠﻐﺎﺕ ﺍﻟﺴﺎﻣﻴﺔ', 'اللغات السامية'),
+            ('ﺍﻟﻌﺮﺑﻴﺔ ﻟﻐﺔ ﺭﺳﻤﻴﺔ ﻓﻲ', 'العربية لغة رسمية في'),
+            ('ﻛﻞ ﺩﻭﻝ ﺍﻟﻮﻃﻦ ﺍﻟﻌﺮﺑﻲ', 'كل دول الوطن العربي'),
+            ('ﺇﺿﺎﻓﺔ ﺇﻟﻰ ﻛﻮﻧﻬﺎ ﻟﻐﺔ', 'إضافة إلى كونها لغة'),
+            ('ﺭﺳﻤﻴﺔ ﻓﻲ ﺗﺸﺎﺩ ﻭﺇﺭﻳﺘﺮﻳﺎ', 'رسمية في تشاد وإريتريا'),
+            ('ﻭﺇﺳﺮﺍﺋﻴﻞ. ﻭﻫﻲ ﺇﺣﺪﻯ ﺍﻟﻠﻐﺎﺕ', 'وإسرائيل. وهي إحدى اللغات'),
+            ('ﺍﻟﺮﺳﻤﻴﺔ ﺍﻟﺴﺖ ﻓﻲ ﻣﻨﻈﻤﺔ', 'الرسمية الست في منظمة'),
+            ('ﺍﻟﺄﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ، ﻭﻳُﺤﺘﻔﻞ', 'الأمم المتحدة، ويُحتفل'),
+            ('ﺑﺎﻟﻴﻮﻡ ﺍﻟﻌﺎﻟﻤﻲ ﻟﻠﻐﺔ ﺍﻟﻌﺮﺑﻴﺔ', 'باليوم العالمي للغة العربية'),
+            ('ﻓﻲ 18 ﺩﻳﺴﻤﺒﺮ ﻛﺬﻛﺮﻯ ﺍﻋﺘﻤﺎﺩ', 'في 18 ديسمبر كذكرى اعتماد'),
+            ('ﺍﻟﻌﺮﺑﻴﺔ ﺑﻴﻦ ﻟﻐﺎﺕ ﺍﻟﻌﻤﻞ ﻓﻲ', 'العربية بين لغات العمل في'),
+            ('ﺍﻟﺄﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ.', 'الأمم المتحدة.'),
+        )
+
+    def test_unreshaping(self):
+        _unreshaping_test(self)
+
+
+class TestUnreshapingLigatures(unittest.TestCase):
+    def setUp(self):
+        self.reshaper = arabic_reshaper.ArabicReshaper({
+            'delete_tatweel': True,
+            'ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM': True,
+            'ARABIC LIGATURE JALLAJALALOUHOU': True,
+            'ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM': True,
+            'ARABIC LIGATURE ALLAH ': True,
+            'ARABIC LIGATURE AKBAR': True,
+            'ARABIC LIGATURE ALAYHE': True,
+            'ARABIC LIGATURE MOHAMMAD': True,
+            'ARABIC LIGATURE RASOUL': True,
+            'ARABIC LIGATURE SALAM': True,
+            'ARABIC LIGATURE SALLA': True,
+            'ARABIC LIGATURE WASALLAM': True,
+        })
+        self.cases = (
+            # Reshaped text, Unreshaped text
+            (
+                'ﺇﻧﻪ ﻣﻦ ﺳﻠﻴﻤﺎﻥ ﻭﺇﻧﻪ ﷽ ﴿٣٠﴾ '
+                'ﺃﻻ ﺗﻌﻠﻮﺍ ﻋﻠﻲ ﻭﺃﺗﻮﻧﻲ ﻣﺴﻠﻤﻴﻦ ﴿٣١﴾',
+
+                'إنه من سليمان وإنه بسم الله الرحمن الرحيم ﴿٣٠﴾ '
+                'ألا تعلوا علي وأتوني مسلمين ﴿٣١﴾'
+            ),
+            (
+                'ﻓﺬﻛﺮ ﺇﻧﻤﺎ ﺃﻧﺖ'
+                ' ﻣﺬﻛﺮ ﴿٢١﴾ ﻟﺴﺖ'
+                ' ﻋﻠﻴﻬﻢ ﺑﻤﺼﻴﻄﺮ ﴿٢٢﴾'
+                ' ﺇﻻ ﻣﻦ ﺗﻮﻟﻰ'
+                ' ﻭﻛﻔﺮ ﴿٢٣﴾ ﻓﻴﻌﺬﺑﻪ'
+                ' ﷲ ﺍﻟﻌﺬﺍﺏ'
+                ' ﺍﻷﻛﺒﺮ ﴿٢٤﴾',
+
+                'فذكر إنما أنت'
+                ' مذكر ﴿٢١﴾ لست'
+                ' عليهم بمصيطر ﴿٢٢﴾'
+                ' إلا من تولى'
+                ' وكفر ﴿٢٣﴾ فيعذبه'
+                ' الله العذاب'
+                ' الأكبر ﴿٢٤﴾',
+            ),
+
+            (
+                'ﷴ ﷶ ﷲ ﷺ',
+                'محمد رسول الله صلى الله عليه وسلم',
+            ),
+
+            (
+                'ﷲ ﷻ',
+                'الله جل جلاله',
+            ),
+
+            (
+                'ﷴ ﷶ ﷲ ﷷ ﷹ ﷲ ﷸ',
+                'محمد رسول الله عليه صلى الله وسلم',
+            ),
+        )
+
+    def test_unreshaping(self):
+        _unreshaping_test(self)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/setup.py b/setup.py
index b9ce8f3..fbee068 100755
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,7 @@
                  " applications that don't support Arabic"),
     long_description=long_description,
     long_description_content_type="text/markdown",
-    version='2.1.3',
+    version='2.2.0',
     platforms="ALL",
     license="MIT",
     packages=['arabic_reshaper'],