Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature: unreshape (rolling back to original text) #75

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions arabic_reshaper/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import os

from .arabic_reshaper import reshape, default_reshaper, ArabicReshaper
from .arabic_reshaper import reshape, unreshape, default_reshaper, ArabicReshaper
from .reshaper_config import (config_for_true_type_font,
ENABLE_NO_LIGATURES,
ENABLE_SENTENCES_LIGATURES,
Expand All @@ -9,4 +7,4 @@
ENABLE_ALL_LIGATURES)


__version__ = '2.1.3'
__version__ = '2.2.0'
2 changes: 1 addition & 1 deletion arabic_reshaper/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '2.1.2'
__version__ = '2.2.0'
109 changes: 106 additions & 3 deletions arabic_reshaper/arabic_reshaper.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

from .ligatures import LIGATURES
from .reshaper_config import auto_config
from .letters import (UNSHAPED, ISOLATED, TATWEEL, ZWJ, LETTERS_ARABIC,
from .letters import (UNSHAPED, ISOLATED, TATWEEL, ZWJ, LETTERS_ARABIC, SPECIAL_LETTERS,
LETTERS_ARABIC_V2, LETTERS_KURDISH, FINAL,
INITIAL, MEDIAL, connects_with_letters_before_and_after,
connects_with_letter_before, connects_with_letter_after)
Expand Down Expand Up @@ -152,8 +152,8 @@ def reshape(self, text):
previous_letter[LETTER], self.letters):
output.append((letter, isolated_form))
elif (previous_letter[FORM] == FINAL and not
connects_with_letters_before_and_after(
previous_letter[LETTER], self.letters
connects_with_letters_before_and_after(
previous_letter[LETTER], self.letters
)):
output.append((letter, isolated_form))
elif previous_letter[FORM] == isolated_form:
Expand Down Expand Up @@ -238,6 +238,109 @@ def reshape(self, text):

return ''.join(result)

def _reversed_letters(self) -> dict:
"""
Declared letters are for reshaping by default. This if for reversing declared letters and preparing them
for reverse reshaping
For example assume that we have a declared letter like: '\u0626': ('\uFE89', '\uFE8B', '\uFE8C', '\uFE8A')
We need to reverse it and change it to a dict like: {
'\uFE89': '\u0626',
'\uFE8B': '\u0626',
'\uFE8C': '\u0626',
'\uFE8A': '\u0626',
}
Now with this reversed letter dict, we can change char '\uFE89' to '\u0626' in text easily
"""
reversed_letters = {}

# example for k: '\u0626'
# example for v: ('\uFE89', '\uFE8B', '\uFE8C', '\uFE8A'). so v is a tuple
for original_form, reshaped_form in self.letters.items():
# Add char and its normal form as a dict, if char was not null (there is cases that char is null)
[reversed_letters.update({char: original_form}) for char in reshaped_form if char]

return reversed_letters

def _reversed_ligatures(self) -> dict:
    """
    Build a lookup table from every reshaped ligature char back to its original text.

    Declared ligatures are laid out for reshaping, for example:
        ('ARABIC LIGATURE SAD WITH HAH', (
            '\u0635\u062D', ('\uFC20', '\uFCB1', '', ''),
        )),
    For reverse reshaping this is inverted into: {
        '\uFC20': '\u0635\u062D',
        '\uFCB1': '\u0635\u062D',
    }
    so that char '\uFC20' can be changed to '\u0635\u062D' with a single dict lookup.
    The first element of a declared ligature (its title) is ignored.

    Returns:
        dict mapping each non-empty reshaped ligature char in ``LIGATURES`` to the original
        text of that ligature. Every entry of ``LIGATURES`` is included.
    """
    return {
        # ligature[0] is the original text, ligature[1] the tuple of reshaped forms
        char: ligature[0]
        for _title, ligature in LIGATURES
        for char in ligature[1]
        # Positions a ligature has no shape for are declared as empty strings
        if char
    }

def unreshape(self, text: str) -> str:
    """
    Convert a reshaped text back to its original (unshaped) form.

    Each char of ``text`` is handled by the first rule that matches:

    1. The char is a key of ``self.letters``: it was not changed by reshaping, so it is kept
       as it is.
    2. The char is in ``SPECIAL_LETTERS``: it was generated during reshaping and is replaced
       with its two original chars. When it is directly followed by two harakat, the first
       harakat is moved between the two original chars.
    3. The char is a reshaped ligature: it is replaced with the ligature's original text.
    4. Otherwise the original letter is looked up in the reversed letters table, and the char
       is kept unchanged when it is not found there.

    Args:
        text: the reshaped text.

    Returns:
        The text with reshaped chars replaced by their original ones.
    """
    text_list = list(text)
    length = len(text_list)
    result = []
    reversed_letters = self._reversed_letters()
    reversed_ligatures = self._reversed_ligatures()

    for index, char in enumerate(text_list):
        # Checking if char is in original shape letters list and has not changed during reshape
        if char in self.letters:
            result.append(char)
            continue

        # Checking if char is a special character
        if char in SPECIAL_LETTERS:
            original_chars = SPECIAL_LETTERS[char]

            # The special char may be the last or second to last char of the text; use an empty
            # string (which never matches HARAKAT_RE) instead of reading past the end
            next_char = text_list[index + 1] if index + 1 < length else ''
            next_next_char = text_list[index + 2] if index + 2 < length else ''

            # Checking if two following chars are harakat. In that case, first harakat should be
            # placed in middle of special chars
            if HARAKAT_RE.match(next_char) and HARAKAT_RE.match(next_next_char):
                result.append(next_char.join(original_chars))

                # Omitting first harakat: the blanked slot is visited by a later iteration and
                # contributes an empty string to the result
                text_list[index + 1] = ''

            else:
                result.append("".join(original_chars))

            continue

        # Checking if char is in ligatures
        if char in reversed_ligatures:
            result.append(reversed_ligatures[char])
            continue

        # If couldn't find char in letters, append char itself
        result.append(reversed_letters.get(char, char))

    return "".join(result)


# Shared module-level reshaper, constructed with no arguments, and module-level aliases for its
# bound methods so callers can reshape/unreshape without creating an ArabicReshaper themselves
default_reshaper = ArabicReshaper()
reshape = default_reshaper.reshape
unreshape = default_reshaper.unreshape
12 changes: 12 additions & 0 deletions arabic_reshaper/letters.py
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,18 @@
ZWJ: (ZWJ, ZWJ, ZWJ, ZWJ),
}

# Special letters are the lam-alef presentation forms (U+FEF5..U+FEFC), e.g. lam with alef with
# hamza. Each one maps to the two regular letters it stands for; reverse reshaping needs this to go
# from a reshaped lam-alef back to a regular lam followed by the matching alef.
# Both the isolated and the final form of every lam-alef are listed so all of them are reversed
# the same way.
SPECIAL_LETTERS = {
    # lam with alef with madda above (isolated, final)
    '\ufef5': ('\u0644', '\u0622'),
    '\ufef6': ('\u0644', '\u0622'),
    # lam with alef with hamza above (isolated, final)
    '\ufef7': ('\u0644', '\u0623'),
    '\ufef8': ('\u0644', '\u0623'),
    # lam with alef with hamza below (isolated, final)
    '\ufef9': ('\u0644', '\u0625'),
    '\ufefa': ('\u0644', '\u0625'),
    # lam alef (isolated, final)
    '\ufefb': ('\u0644', '\u0627'),
    '\ufefc': ('\u0644', '\u0627'),
}


def connects_with_letter_before(letter,LETTERS):
if letter not in LETTERS:
return False
Expand Down
169 changes: 169 additions & 0 deletions arabic_reshaper/tests/test_004_unreshaping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
from __future__ import unicode_literals
from __future__ import print_function

import unittest
import sys
import arabic_reshaper


def _unreshaping_test(test):
for i, case in enumerate(test.cases):
def t(): test.assertEqual(case[1], test.reshaper.unreshape(case[0]))
if hasattr(test, 'subTest'):
with test.subTest(i=i, case=case[0]):
t()
else:
print('running test case %d' % i, file=sys.stderr)
t()


class TestDefaultUnreshaping(unittest.TestCase):
    """Unreshaping with the package-level ``default_reshaper``."""

    def setUp(self):
        # The shared reshaper instance exported by the package
        self.reshaper = arabic_reshaper.default_reshaper
        # Each case is a pair; the first item is passed to unreshape() and the result is
        # compared to the second item
        self.cases = (
            # Reshaped text, Unreshaped text
            ('ﺍﻟﺴﻼﻡ ﻋﻠﻴﻜﻢ', 'السلام عليكم'),
            ('ﺍﻟﻠﻐﺔ ﺍﻟﻌﺮﺑﻴﺔ ﻫﻲ ﺃﻛﺜﺮ ﺍﻟﻠﻐﺎﺕ', 'اللغة العربية هي أكثر اللغات'),
            ('ﺗﺤﺪﺛﺎ ﻭﻧﻄﻘﺎ ﺿﻤﻦ ﻣﺠﻤﻮﻋﺔ', 'تحدثا ونطقا ضمن مجموعة'),
            ('ﺍﻟﻠﻐﺎﺕ ﺍﻟﺴﺎﻣﻴﺔ', 'اللغات السامية'),
            ('ﺍﻟﻌﺮﺑﻴﺔ ﻟﻐﺔ ﺭﺳﻤﻴﺔ ﻓﻲ', 'العربية لغة رسمية في'),
            ('ﻛﻞ ﺩﻭﻝ ﺍﻟﻮﻃﻦ ﺍﻟﻌﺮﺑﻲ', 'كل دول الوطن العربي'),
            ('ﺇﺿﺎﻓﺔ ﺇﻟﻰ ﻛﻮﻧﻬﺎ ﻟﻐﺔ', 'إضافة إلى كونها لغة'),
            ('ﺭﺳﻤﻴﺔ ﻓﻲ ﺗﺸﺎﺩ ﻭﺇﺭﻳﺘﺮﻳﺎ', 'رسمية في تشاد وإريتريا'),
            ('ﻭﺇﺳﺮﺍﺋﻴﻞ. ﻭﻫﻲ ﺇﺣﺪﻯ ﺍﻟﻠﻐﺎﺕ', 'وإسرائيل. وهي إحدى اللغات'),
            ('ﺍﻟﺮﺳﻤﻴﺔ ﺍﻟﺴﺖ ﻓﻲ ﻣﻨﻈﻤﺔ', 'الرسمية الست في منظمة'),
            ('ﺍﻷﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ، ﻭﻳﺤﺘﻔﻞ', 'الأمم المتحدة، ويحتفل'),
            ('ﺑﺎﻟﻴﻮﻡ ﺍﻟﻌﺎﻟﻤﻲ ﻟﻠﻐﺔ ﺍﻟﻌﺮﺑﻴﺔ', 'باليوم العالمي للغة العربية'),
            ('ﻓﻲ 18 ﺩﻳﺴﻤﺒﺮ ﻛﺬﻛﺮﻯ ﺍﻋﺘﻤﺎﺩ', 'في 18 ديسمبر كذكرى اعتماد'),
            ('ﺍﻟﻌﺮﺑﻴﺔ ﺑﻴﻦ ﻟﻐﺎﺕ ﺍﻟﻌﻤﻞ ﻓﻲ', 'العربية بين لغات العمل في'),
            ('ﺍﻷﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ.', 'الأمم المتحدة.'),
            ('ﺍﻵﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ.', 'الآمم المتحدة.'),
            ('ﺍﺳﻢ ﻣﻦ amin ﺍﺳﺖ', 'اسم من amin است'),
        )

    def test_unreshaping(self):
        # Runs every case in self.cases through the shared helper
        _unreshaping_test(self)


class TestUnreshapingWithHarakat(unittest.TestCase):
    """Unreshaping with a reshaper configured with ``delete_harakat`` set to False."""

    def setUp(self):
        self.reshaper = arabic_reshaper.ArabicReshaper({
            'delete_harakat': False
        })
        # Each case is a pair; the first item is passed to unreshape() and the result is
        # compared to the second item
        self.cases = (
            # Reshaped text, Unreshaped text
            ('ﺍﻟﺴَﻼَْﻡٌ ﻋَﻠَﻴْﻜُﻢْ', 'السَلَاْمٌ عَلَيْكُمْ'),
            ('ﺍﻟﻠﻐﺔ ﺍﻟﻌﺮﺑﻴﺔ ﻫﻲ ﺃﻛﺜﺮ ﺍﻟﻠﻐﺎﺕ', 'اللغة العربية هي أكثر اللغات'),
            ('ﺗﺤﺪﺛﺎً ﻭﻧﻄﻘﺎً ﺿﻤﻦ ﻣﺠﻤﻮﻋﺔ', 'تحدثاً ونطقاً ضمن مجموعة'),
            ('ﺍﻟﻠﻐﺎﺕ ﺍﻟﺴﺎﻣﻴﺔ', 'اللغات السامية'),
            ('ﺍﻟﻌﺮﺑﻴﺔ ﻟﻐﺔ ﺭﺳﻤﻴﺔ ﻓﻲ', 'العربية لغة رسمية في'),
            ('ﻛﻞ ﺩﻭﻝ ﺍﻟﻮﻃﻦ ﺍﻟﻌﺮﺑﻲ', 'كل دول الوطن العربي'),
            ('ﺇﺿﺎﻓﺔ ﺇﻟﻰ ﻛﻮﻧﻬﺎ ﻟﻐﺔ', 'إضافة إلى كونها لغة'),
            ('ﺭﺳﻤﻴﺔ ﻓﻲ ﺗﺸﺎﺩ ﻭﺇﺭﻳﺘﺮﻳﺎ', 'رسمية في تشاد وإريتريا'),
            ('ﻭﺇﺳﺮﺍﺋﻴﻞ. ﻭﻫﻲ ﺇﺣﺪﻯ ﺍﻟﻠﻐﺎﺕ', 'وإسرائيل. وهي إحدى اللغات'),
            ('ﺍﻟﺮﺳﻤﻴﺔ ﺍﻟﺴﺖ ﻓﻲ ﻣﻨﻈﻤﺔ', 'الرسمية الست في منظمة'),
            ('ﺍﻷﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ، ﻭﻳُﺤﺘﻔﻞ', 'الأمم المتحدة، ويُحتفل'),
            ('ﺑﺎﻟﻴﻮﻡ ﺍﻟﻌﺎﻟﻤﻲ ﻟﻠﻐﺔ ﺍﻟﻌﺮﺑﻴﺔ', 'باليوم العالمي للغة العربية'),
            ('ﻓﻲ 18 ﺩﻳﺴﻤﺒﺮ ﻛﺬﻛﺮﻯ ﺍﻋﺘﻤﺎﺩ', 'في 18 ديسمبر كذكرى اعتماد'),
            ('ﺍﻟﻌﺮﺑﻴﺔ ﺑﻴﻦ ﻟﻐﺎﺕ ﺍﻟﻌﻤﻞ ﻓﻲ', 'العربية بين لغات العمل في'),
            ('ﺍﻷﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ.', 'الأمم المتحدة.'),
        )

    def test_unreshaping(self):
        # Runs every case in self.cases through the shared helper
        _unreshaping_test(self)


class TestUnreshapingWithHarakatWithoutLigatures(unittest.TestCase):
    """Unreshaping with ``delete_harakat`` and ``support_ligatures`` both set to False."""

    def setUp(self):
        self.reshaper = arabic_reshaper.ArabicReshaper({
            'delete_harakat': False,
            'support_ligatures': False
        })
        # Each case is a pair; the first item is passed to unreshape() and the result is
        # compared to the second item
        self.cases = (
            # Reshaped text, Unreshaped text
            ('ﺍﻟﺴَﻠَﺎْﻡٌ ﻋَﻠَﻴْﻜُﻢْ', 'السَلَاْمٌ عَلَيْكُمْ'),
            ('ﺍﻟﻠﻐﺔ ﺍﻟﻌﺮﺑﻴﺔ ﻫﻲ ﺃﻛﺜﺮ ﺍﻟﻠﻐﺎﺕ', 'اللغة العربية هي أكثر اللغات'),
            ('ﺗﺤﺪﺛﺎً ﻭﻧﻄﻘﺎً ﺿﻤﻦ ﻣﺠﻤﻮﻋﺔ', 'تحدثاً ونطقاً ضمن مجموعة'),
            ('ﺍﻟﻠﻐﺎﺕ ﺍﻟﺴﺎﻣﻴﺔ', 'اللغات السامية'),
            ('ﺍﻟﻌﺮﺑﻴﺔ ﻟﻐﺔ ﺭﺳﻤﻴﺔ ﻓﻲ', 'العربية لغة رسمية في'),
            ('ﻛﻞ ﺩﻭﻝ ﺍﻟﻮﻃﻦ ﺍﻟﻌﺮﺑﻲ', 'كل دول الوطن العربي'),
            ('ﺇﺿﺎﻓﺔ ﺇﻟﻰ ﻛﻮﻧﻬﺎ ﻟﻐﺔ', 'إضافة إلى كونها لغة'),
            ('ﺭﺳﻤﻴﺔ ﻓﻲ ﺗﺸﺎﺩ ﻭﺇﺭﻳﺘﺮﻳﺎ', 'رسمية في تشاد وإريتريا'),
            ('ﻭﺇﺳﺮﺍﺋﻴﻞ. ﻭﻫﻲ ﺇﺣﺪﻯ ﺍﻟﻠﻐﺎﺕ', 'وإسرائيل. وهي إحدى اللغات'),
            ('ﺍﻟﺮﺳﻤﻴﺔ ﺍﻟﺴﺖ ﻓﻲ ﻣﻨﻈﻤﺔ', 'الرسمية الست في منظمة'),
            ('ﺍﻟﺄﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ، ﻭﻳُﺤﺘﻔﻞ', 'الأمم المتحدة، ويُحتفل'),
            ('ﺑﺎﻟﻴﻮﻡ ﺍﻟﻌﺎﻟﻤﻲ ﻟﻠﻐﺔ ﺍﻟﻌﺮﺑﻴﺔ', 'باليوم العالمي للغة العربية'),
            ('ﻓﻲ 18 ﺩﻳﺴﻤﺒﺮ ﻛﺬﻛﺮﻯ ﺍﻋﺘﻤﺎﺩ', 'في 18 ديسمبر كذكرى اعتماد'),
            ('ﺍﻟﻌﺮﺑﻴﺔ ﺑﻴﻦ ﻟﻐﺎﺕ ﺍﻟﻌﻤﻞ ﻓﻲ', 'العربية بين لغات العمل في'),
            ('ﺍﻟﺄﻣﻢ ﺍﻟﻤﺘﺤﺪﺓ.', 'الأمم المتحدة.'),
        )

    def test_unreshaping(self):
        # Runs every case in self.cases through the shared helper
        _unreshaping_test(self)


class TestUnreshapingLigatures(unittest.TestCase):
    """Unreshaping of texts that contain word and sentence ligature chars."""

    def setUp(self):
        # Reshaper configured with delete_tatweel and a set of named ligatures switched on
        self.reshaper = arabic_reshaper.ArabicReshaper({
            'delete_tatweel': True,
            'ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM': True,
            'ARABIC LIGATURE JALLAJALALOUHOU': True,
            'ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM': True,
            # NOTE(review): this key ends with a space — confirm it matches the declared
            # ligature name
            'ARABIC LIGATURE ALLAH ': True,
            'ARABIC LIGATURE AKBAR': True,
            'ARABIC LIGATURE ALAYHE': True,
            'ARABIC LIGATURE MOHAMMAD': True,
            'ARABIC LIGATURE RASOUL': True,
            'ARABIC LIGATURE SALAM': True,
            'ARABIC LIGATURE SALLA': True,
            'ARABIC LIGATURE WASALLAM': True,
        })
        # Each case is a pair; the first item is passed to unreshape() and the result is
        # compared to the second item. Adjacent string literals are concatenated.
        self.cases = (
            # Reshaped text, Unreshaped text
            (
                'ﺇﻧﻪ ﻣﻦ ﺳﻠﻴﻤﺎﻥ ﻭﺇﻧﻪ ﷽ ﴿٣٠﴾ '
                'ﺃﻻ ﺗﻌﻠﻮﺍ ﻋﻠﻲ ﻭﺃﺗﻮﻧﻲ ﻣﺴﻠﻤﻴﻦ ﴿٣١﴾',

                'إنه من سليمان وإنه بسم الله الرحمن الرحيم ﴿٣٠﴾ '
                'ألا تعلوا علي وأتوني مسلمين ﴿٣١﴾'
            ),
            (
                'ﻓﺬﻛﺮ ﺇﻧﻤﺎ ﺃﻧﺖ'
                ' ﻣﺬﻛﺮ ﴿٢١﴾ ﻟﺴﺖ'
                ' ﻋﻠﻴﻬﻢ ﺑﻤﺼﻴﻄﺮ ﴿٢٢﴾'
                ' ﺇﻻ ﻣﻦ ﺗﻮﻟﻰ'
                ' ﻭﻛﻔﺮ ﴿٢٣﴾ ﻓﻴﻌﺬﺑﻪ'
                ' ﷲ ﺍﻟﻌﺬﺍﺏ'
                ' ﺍﻷﻛﺒﺮ ﴿٢٤﴾',

                'فذكر إنما أنت'
                ' مذكر ﴿٢١﴾ لست'
                ' عليهم بمصيطر ﴿٢٢﴾'
                ' إلا من تولى'
                ' وكفر ﴿٢٣﴾ فيعذبه'
                ' الله العذاب'
                ' الأكبر ﴿٢٤﴾',
            ),

            (
                'ﷴ ﷶ ﷲ ﷺ',
                'محمد رسول الله صلى الله عليه وسلم',
            ),

            (
                'ﷲ ﷻ',
                'الله جل جلاله',
            ),

            (
                'ﷴ ﷶ ﷲ ﷷ ﷹ ﷲ ﷸ',
                'محمد رسول الله عليه صلى الله وسلم',
            ),
        )

    def test_unreshaping(self):
        # Runs every case in self.cases through the shared helper
        _unreshaping_test(self)


# Run the test cases of this module when the file is executed as a script
if __name__ == '__main__':
    unittest.main()
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
" applications that don't support Arabic"),
long_description=long_description,
long_description_content_type="text/markdown",
version='2.1.3',
version='2.2.0',
platforms="ALL",
license="MIT",
packages=['arabic_reshaper'],
Expand Down