-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathphonotactics.py
305 lines (185 loc) · 15.5 KB
/
phonotactics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
#==============================================================================================
# The Jabalín morphological generator for Arabic verbs
#
# Copyright (c) 2012 Susana López Hervás, Alicia González Martínez, Antonio Moreno Sandoval
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>
#==============================================================================================
###################################################################
# MODULE PHONOTACTIC CONSTRAINTS AND ORTHOGRAPHIC NORMALIZATION #
###################################################################
import re
import sys
def Sukun_and_Long_vowels_normalization(form):
    """Make sukun explicit and expand the long-vowel placeholder ``E``.

    Two steps are applied to ``form`` (a string), in this order:

    1. Sukunizer (C1): every letter of the consonant class below that is
       not immediately followed by one of the marks in the diacritic class
       gets a sukun appended. This includes a consonant at the very end of
       the string.
    2. Long vowels (C2-C4): the Latin placeholder ``E`` preceded by a short
       vowel mark is replaced by the matching long-vowel letter.

    Returns the rewritten string.
    """
    # C1 sukunizer.  The negative lookahead also succeeds at end of string,
    # so no separate end-of-string alternative is needed.
    form = re.sub(r"([ءأئؤبتثجحخدذرزسشصضطظعغفقكلمنوهي])(?![ًٌٍَُِّْ])", r"\1ْ", form)
    # long vowels normalization
    form = form.replace('َE', 'َا')  # C2 aE -> aA
    form = form.replace('ُE', 'ُو')  # C3 uE -> uw
    form = form.replace('ِE', 'ِي')  # C4 iE -> iy
    return form
def preprocessing_Initial_Alif(form):
    """Initial consonant cluster constraint - preprocessing of initial alif.

    If ``form`` starts with a consonant carrying sukun followed by a second
    consonant (optionally with shadda), a prosthetic alif plus a short vowel
    is prepended:

    * D1: alif + damma when the vowel after the cluster is damma;
    * D2: alif + kasra when the character after the cluster is anything else.

    D2 cannot fire after D1, because the string then starts with a bare alif,
    which is not in the consonant class ``C``.  Returns the rewritten string.
    """
    C = r'[بتثجخحدذرزسشصضطظعغفقكلمنهويأءؤئ]'
    form = re.sub(r'^({0}ْ{1}ّ?ُ)'.format(C, C), r'اُ\1', form)      # D1 ∅ -> Au / ^_C·C~?u
    form = re.sub(r'^({0}ْ{1}ّ?[^ُ])'.format(C, C), r'اِ\1', form)   # D2 ∅ -> Ai / ^_C·C~?[^u]
    return form
# Apply the weak-letter (waw / ya) rewrite rules E1-E30 to one verbal form.
#
# Parameters (as used in the body below):
#   form    -- string; rewritten by successive slicing / re.sub calls and returned.
#   al_code -- string code of the form; sliced and indexed (al_code[:4], al_code[2])
#              and matched against patterns such as 'VPAN3[SD]F'.
#   lema    -- string; only tested with regexes anchored at its end.
#   root    -- string of radicals; indexed by position and tested for length 3 / 4.
#   d_code  -- nested dict read through the keys 'Internal derivation'
#              ('lengthening', 'addition'), 'Template' and 'Vocalization'
#              ('Perf V2', 'Imperf V2'); every value is compared as a string.
# Returns the rewritten form.
#
# NOTE(review): this copy of the function has lost its indentation: every
# statement sits at column 0, so it is not valid Python as it stands and the
# text alone does not show which E-rules are guarded by each `if`.  Restore the
# nesting from the upstream file before running; do not guess it.
def Apply_WeakLettersRules(form, al_code, lema, root, d_code):
# Apocopated imperative (rare real irreg):
# if simple verb, imperative form, first radical waw; and if (thematic vowel of imperfective kasra) or
# ((second or third radical is a guttural خ غ ح ع ء ه or semiguttural ر) and (thematic verb is fatha)):
# then the first radical is removed from the form (plus prosthetic alif)
Gutturals = ['خ','غ','ح','ع','ء','ه','ر']
# Going by the comment above, 'Imperf V2' == '0' is the kasra case and '2' the fatha case.
# E1 drops the first four characters of the form.
if d_code['Internal derivation']['lengthening']=='0' and d_code['Internal derivation']['addition']=='0'\
and d_code['Template']=='L' and al_code[:4]=='VIAM' and root[0]=='و'\
and ((d_code['Vocalization']['Imperf V2']=='0') or ((d_code['Vocalization']['Imperf V2']=='2')\
and (root[1] in Gutturals or root[2] in Gutturals))):
form=form[4:] # E1
form=re.sub('^اِوْ','اِيْ',form) # E2 w -> y / ^Ai_· (iw constraint for imperatives)
# Guard: 'addition' is not '3', root starts with waw or ya, and root is not 4 letters long.
if (d_code['Internal derivation']['addition']!='3')and (re.search('^[وي]',root))and(len(root)!= 4):
form=re.sub('^([ء-ي]َ)وْ([ء-ي]ِ)',r'\1\2',form) # E3 w· -> ∅ / ^Ca_Ci
form=re.sub('(.[ء-ي]ّ?ِ)[وي]ْ(?!$)',r'\1ي',form) # E4 [wy]· -> y / [^^]C~?i_[^$]
form=re.sub('([ء-ي]ّ?[َ-ِ])[وي]ْ$',r'\1',form) # E5 [wy]· -> ∅ / C~?v_$
# Guard: 'lengthening' is not '1', 'addition' does not start with 1 or 2, root is not
# 4 letters long, and neither the third-from-last nor the second-from-last character
# of lema is waw/ya.
if d_code['Internal derivation']['lengthening']!='1' and not re.match('[12]',d_code['Internal derivation']['addition'])\
and len(root)!=4 and re.search('[^وي][^وي].$', lema):
form=re.sub('([ء-ي])ْ[وي]([َ-ِ])(?=[ء-ي]ْ)',r'\1\2',form) # E6 ·[wy] -> ∅ / C_vC·
form=re.sub('([ء-ي])ْ[وي]َ(?!ي)(.[َ-ِ])',r'\1َا\2',form) # E7 ·[wy] -> A / C_a[^y]
form=re.sub('(?<=[ء-ي])ْ[وي]ِ([^ي][َ-ِ])',r'ِي\1',form) # E8 ·[wy]i -> iy / C_[^y]v
form=re.sub('([ء-ي])ْ[وي](ُ)([^ي][َ-ِ])',r'\1\2و\3',form) # E9 ·[wy]u -> uw / C_[^y]v
form=re.sub('^([ء-ي]َ)[وي][َ-ِ]([^ي][َ-ِ])',r'\1ا\2',form) # E10 [wy]v -> A / ^Ca_[^y]v
form=re.sub('(.[ء-ي]ّ?َ)[وي]َ([ء-ي]َ)',r'\1\2',form) # E11 [wy]a -> ∅ / .C~?a_Ca
form=re.sub('ُ[وي]ِ(.[َ-ِ])',r'ِي\1',form) # E12 u[wy]i -> iy / _Cv
# Guard: 'addition' starts with 1, 2 or 4; the second-from-last character of lema and
# the last character of root are not waw/ya.
if re.match('[124]',d_code['Internal derivation']['addition']) and re.search('[^وي].$',lema) and re.search('[^وي]$',root):
form=re.sub('([ء-ي]َ)[وي][َ-ِ]([^ي][َ-ِ])',r'\1ا\2',form) # E13 [wy]v -> A / Ca_[^y]v (adapted for forms VII-VIII-X)
# Guard: 'Imperf V2' is '1' and the second-from-last radical is waw/ya while the last one is not.
if (d_code['Vocalization']['Imperf V2']=='1')and(re.search('[وي](?!و)(?!ي).$',root)):
form=re.sub('([ء-ي])َوَ([ء-ي]ْ)',r'\1ُ\2',form) # E14 awa -> u / C_C·
if (d_code['Vocalization']['Imperf V2']!='1') and len(root)==3 and not\
re.match('[12]',d_code['Internal derivation']['addition']):
form=re.sub('([ء-ي])َ[وي][َِ]([^ي]ْ.)',r'\1ِ\2',form) # E15 a[wy][ai] -> i / C_[^y]·.
# Guard: three-letter root in which exactly one of the last two radicals is waw/ya.
if (re.search('^.[^وي][وي]$', root) or re.search('^.[وي][^وي]$', root)):
form=re.sub('(َ)[وي][َِ]([ء-ي]ْ)',r'\1\2',form) # E16 [wy][ai] -> ∅ / a_C·
form=re.sub('[ُِ][وي](ُو)',r'\1',form) # E17 [ui][wy] -> ∅ / _uW
if d_code['Vocalization']['Perf V2']!='2' or (root[1]!='و' and root[1:]!='يي'):
form=re.sub('ُ[وي](ِ.ْ)',r'\1',form) # E18 u[wy] -> ∅ / C_iC·
# NOTE(review): `|` below is the bitwise operator applied to two parenthesised
# comparisons (booleans); the result equals `or`, but both sides are always evaluated.
if (((d_code['Vocalization']['Perf V2']!='1')|(d_code['Vocalization']['Imperf V2']!='1')) and (al_code[2]!='P')):
form=re.sub('([ء-ي]ّ?)[ُِ][وي](ِي)',r'\1\2',form) # E19 [ui][wy] -> ∅ / C~?_iy
form=re.sub('(َ)[وي]ُو',r'\1وْ',form) # E20 [wy]uw -> w· / a_
if (d_code['Vocalization']['Imperf V2']!='1') or al_code[2]=='P':
form=re.sub('([^ي]َ)[وي][َُ]$',r'\1ى',form) # E21 [wy][au] -> Y / [^y]a_$ (passive & imperfective in u do change to alif maqsura)
if (d_code['Vocalization']['Imperf V2']=='1'):
form=re.sub('(َ)وَ$',r'\1ا',form) # E22 wa -> A / a_$
form=re.sub('(يَ)يَ$',r'\1ا',form) # E23 ya -> A / ya_$
form=re.sub('(ُو)ُ$',r'\1',form) # E24 u -> ∅ / uw_$
form=re.sub('ِوَ',r'ِيَ',form) # E25 w -> y / i_a
if re.match('VPAN3[SD]F',al_code):
form=re.sub('([ء-ي]َ)[وي]َ([ء-ي][َْ]ا?)$',r'\1\2',form) # E26 [wy]a -> ∅ / Ca_C[·a]A?$
form=re.sub('(ُو|ِي)ْ',r'\1',form) # E27 · -> ∅ / (uw|iy)_
form=re.sub('(ِ)[وي]ُ$',r'\1ي',form) # E28 [wy]u -> y / i_$
# NOTE(review): `and` binds tighter than `or`, so as written the '^..و$' root test
# guards only the first alternative (derived forms, excluding the listed al_code
# patterns); the second alternative (underived form with Perf V2 '0' / Imperf V2 '1'
# and the listed al_code patterns) is not restricted by the root test.  Confirm this
# is intended.
if re.search('^..و$',root) and\
((d_code['Internal derivation']['lengthening']!='0' or d_code['Internal derivation']['addition']!='0')\
and not re.match('V[IP][AP][NSY]3PM',al_code) and not re.match('VI[AP][NSYM]2PM',al_code))\
or ((d_code['Internal derivation']['lengthening']=='0' and d_code['Internal derivation']['addition']=='0')\
and (d_code['Vocalization']['Perf V2']=='0' and d_code['Vocalization']['Imperf V2']=='1')\
and (re.match('VIP[NSY][23]D[FMN]',al_code) or re.match('VIP[NSY][23]PF',al_code) or re.match('VIPN2SF',al_code))):
form=re.sub('([ء-ي]ّ?َ)و',r'\1ي',form) # E29 w -> y / C~?a_
form=re.sub('َيِي','َيْ',form) # E30 yiy -> y· / a_
return(form)
# Apply the shadda (gemination) rewrite rules F1-F4 to one verbal form.
#
# Parameters (as used in the body below):
#   form    -- string; rewritten by successive re.sub calls and returned.
#   al_code -- string code of the form; matched against 'VP[AP]N3DF' and indexed
#              at al_code[3].
#   lema    -- not referenced in the visible body.
#   root    -- string of radicals; indexed by position and tested for length 3.
#   d_code  -- dict read through 'Internal derivation' ('lengthening', 'addition')
#              and 'External derivation'; values are compared as strings.
# Returns the rewritten form.
#
# NOTE(review): this copy of the function has lost its indentation, so it is not
# valid Python as it stands and the text alone does not show which of F1-F4 are
# guarded by each `if` (nor whether the second `if` is nested in the first).
# Restore the nesting from the upstream file before running; do not guess it.
def Apply_ShaddaRules(form, al_code, lema, root, d_code):
# this rule suggests the need for a syllable boundary mark
# Guard: 'lengthening' is '1', OR a three-letter root whose last two radicals are
# not the same weak letter (waw/ya) and for which one of the ت-related
# sub-conditions below holds.
if (d_code['Internal derivation']['lengthening']=='1')\
or (\
len(root)==3 and (not re.match(r'^.([يو])\1$',root))\
and ((not 'ت' in root)\
or (re.search('.ت[َ-ِ]ت[َ-ِ]ت[َ-ِ]',form))\
or (root[0]=='ت' and (d_code['External derivation']!='1')\
and not (d_code['Internal derivation']['addition']=='4'\
and root[:2]=='تو'))\
or (re.match('^.تت$',root) and not re.match('VP[AP]N3DF',al_code) and d_code['Internal derivation']['addition']!='2')\
)\
):
form=re.sub(r'([^ْ])([ء-ي])[َ-ِ]\2([َ-ِ])',r'\1\2ّ\3',form) # F1 C¹v¹C¹ -> C¹~ / [^·]_v² (and not beginning)
form=re.sub(r'([ء-ي])ْ([ء-ي])([َ-ِ])\2([َ-ِ])',r'\1\3\2ّ\4',form) # F2 ·C²v¹C² -> v¹C²~ / C¹_v²
# Guard: second and third radicals are identical and al_code[3] is neither 'Y' nor 'M'.
if root[1]==root[2] and not re.match('[YM]',al_code[3]):
form=re.sub(r'ْ([ء-ي])([َ-ِ])\1ْ$',r'\2\1َّ',form) # F3 ·C¹v¹C¹· -> v¹C¹~a / _$
# ORTHOGRAPHIC
form=re.sub(r'([ء-ي])ْ\1([َ-ِ])',r'\1ّ\2',form) # F4 ·C¹ -> ~ / C¹_v¹ (C¹·C¹v¹ -> C¹~v¹)
return(form)
def posprocessing_Initial_Alif(form):
    """Initial consonant cluster constraint - postprocessing of initial alif.

    D3: if ``form`` starts with alif + short vowel and the next consonant
    itself carries a short vowel (so there is no initial cluster), the
    leading alif and its vowel are removed.  Otherwise ``form`` is returned
    unchanged.
    """
    C = r'[بتثجخحدذرزسشصضطظعغفقكلمنهويأءؤئ]'
    form = re.sub(r'^ا[َُِ]({0}[َُِ])'.format(C), r'\1', form)  # D3 Av -> ∅ / ^_Cv
    return form
def Apply_HamzaRules(form):
    """Apply the orthographic hamza rules G1-G20 to ``form``.

    The rules are plain regex substitutions applied strictly in the order
    written below (G7 is deliberately applied last); each one chooses the
    hamza letter according to its neighbouring letters and marks.

    Returns the rewritten string.  If a ``ValueError`` is raised while the
    rules run, a message is printed and ``None`` is returned, as before.
    """
    try:
        form_trasl = form
        form_trasl = re.sub('^[ءؤئأ]ِ', 'إِ', form_trasl)                      # G1 [cúýÁ] -> À / ^_i eg إِبدَاء
        form_trasl = re.sub('^[ءؤئإ](?=[َُ])', 'أ', form_trasl)                # G2 [cúýÀ] -> Á / ^_[au] eg أُسرَةُ
        form_trasl = re.sub('(^|[^اُِي])[ءأإؤئ]َا', r'\1آ', form_trasl)         # G3 [cÁÀúý]aA -> Ã / (^|[^Auiy])_ eg آسِف / قُرآن
        form_trasl = re.sub('[ءأإؤئ]َ[ءأإؤئ]ْ', 'آ', form_trasl)                # G4 [cÁÀúý]a[cÁÀúý]· -> Ã eg آتَكِلُ
        form_trasl = re.sub('ْ[ءإؤئ]َ', 'ْأَ', form_trasl)                      # G5 [cÀúý] -> Á / ·_a eg أَبأَرُ
        form_trasl = re.sub('ْ[ءأإئ]ُ', 'ْؤُ', form_trasl)                      # G6 [cÁÀý] -> ú / ·_u eg اُبؤُس
        form_trasl = re.sub('ي[أإؤئ]([ًٌٍَُِ]ا?)$', r'يء\1', form_trasl)        # G8 [ÁÀúý] -> c / y_[aiuâîû]A?$ eg جُزَيءٌ
        form_trasl = re.sub('ي[ءأإؤ](?!ًا)(?=..)', 'يئ', form_trasl)            # G9 [cÁÀú] -> ý / y_(?!âA)(?=..) eg بَدِيئَة
        form_trasl = re.sub('ا[أإؤئ]َ', 'اءَ', form_trasl)                      # G10 [ÁÀúý] -> c / A_a eg بَدَاءَة
        form_trasl = re.sub('ُ[ءأإئ](?=[َُْ])', 'ُؤ', form_trasl)               # G11 [cÁÀý] -> ú / u_[a·u] eg بَطُؤتُ
        form_trasl = re.sub('و[ءأإئ](?=[َُ].)(?!ًا)', 'وؤ', form_trasl)         # G12 [cÁÀý] -> ú / w_(?=[au].)(?!âA) eg مَوبُوؤَينِ
        form_trasl = re.sub('و[أإؤئ]([ًٌَُ]ا?)$', r'وء\1', form_trasl)          # G13 [ÁÀúý] -> c / w_[auâû]A?$ eg بُرُوءٌ
        form_trasl = re.sub('َ[ءإؤئ](?=ّ?[َْ])', 'َأ', form_trasl)              # G14 [cÀúý] -> Á / a_~?[a·] eg وَثَأتُ
        form_trasl = re.sub('(?<=[َا])[ءأإئ]ُ(?=.)', 'ؤُ', form_trasl)          # G15 [cÁÀý] -> ú / [aA]_u. eg يُوثَؤُوا
        form_trasl = re.sub('َ[ءإؤئ]([ٌُ])$', r'َأ\1', form_trasl)              # G16 [cÀúý] -> Á / a_[uû]$ eg وَثَأُ
        form_trasl = re.sub('ا[أإؤئ]([ٌُ])$', r'اء\1', form_trasl)              # G17 [ÁÀúý] -> c / A_[uû]$ eg وَثَاءٌ
        form_trasl = re.sub('([^َْ])[ءأإؤ](ّ?ِ)$', r'\1ئ\2', form_trasl)        # G18a [cÁÀú] -> ý / [^·a]_~?i$
        form_trasl = re.sub('َ[ءأإؤ]ِ$', r'َئِ', form_trasl)                    # G18b [cÁÀú] -> ý / a_i$
        form_trasl = re.sub('َ[ءئإؤ]ِّ$', r'َئِّ', form_trasl)                  # G18c [cýÀú] -> ý / a_~i$
        form_trasl = re.sub('(?<=.)[ءأإؤ](?=ّ?ِ.)', 'ئ', form_trasl)            # G19 [cÁÀú] -> ý / ._~?i[^$] eg تُنئِيَا
        form_trasl = re.sub('ْ[ءأؤئ]ِ$', 'ْإِ', form_trasl)                     # G20 [cÁúý] -> À / ·_i$ eg أَصءِ
        form_trasl = re.sub('ِ[ءأإؤ]', 'ِئ', form_trasl)                        # G7 [cÁÀú] -> ý / i_ eg جَآجِئُ
        return form_trasl
    except ValueError as verror:
        # Best-effort behaviour kept from the original: report and yield None.
        # The message now names this function (it used to name a different one).
        print('Value error in function Apply_HamzaRules(): ' + str(verror))
        return None
def Irreg_rules_list(d_form, lema, root, d_code):
    """Apply the phonotactic and orthographic rules to every form in ``d_form``.

    ``d_form`` is a dictionary mapping a form code to its verbal form.  Each
    form is passed through the rule stages in a fixed order; the code (the
    dictionary key) is passed to the weak-letter and shadda stages as their
    ``al_code`` argument.  ``lema``, ``root`` and ``d_code`` are forwarded
    unchanged to those two stages.

    Returns a new dictionary with the same keys and the surface forms as
    values; ``d_form`` itself is not modified.
    """
    d_norm_f = {}
    for k, raw_form in d_form.items():
        # Sukun and long vowel rules
        form = Sukun_and_Long_vowels_normalization(raw_form)
        # initial consonant clusters preprocessing
        form = preprocessing_Initial_Alif(form)
        # Weak letters
        form = Apply_WeakLettersRules(form, k, lema, root, d_code)
        # Shadda rules
        form = Apply_ShaddaRules(form, k, lema, root, d_code)
        # initial consonant clusters postprocessing
        form = posprocessing_Initial_Alif(form)
        # Hamza rules
        form = Apply_HamzaRules(form)
        d_norm_f[k] = form
    return d_norm_f
def phonotactic_rules(dict_form, lema, root, d_code):
    """Apply the phonotactic and orthographic rules to a whole paradigm.

    ``dict_form`` is read through these paths, each holding a dictionary of
    forms that is handed to ``Irreg_rules_list``:

    * ``['VP'][voice]`` for voice in ``'Active'``, ``'Pasive'``;
    * ``['VI'][voice][mood]`` for the same voices and mood in
      ``'Indicative'``, ``'Subjunctive'``, ``'Yusive'``;
    * ``['VIAM']``.

    ``dict_form`` is updated in place: ``dict_form['VP']``,
    ``dict_form['VI'][voice]`` and ``dict_form['VIAM']`` are replaced by newly
    built dictionaries containing only the keys listed above.  The same
    ``dict_form`` object is returned.
    """
    voices = ('Active', 'Pasive')
    moods = ('Indicative', 'Subjunctive', 'Yusive')

    dict_form['VP'] = {
        voice: Irreg_rules_list(dict_form['VP'][voice], lema, root, d_code)
        for voice in voices
    }
    for voice in voices:
        dict_form['VI'][voice] = {
            mood: Irreg_rules_list(dict_form['VI'][voice][mood], lema, root, d_code)
            for mood in moods
        }
    dict_form['VIAM'] = Irreg_rules_list(dict_form['VIAM'], lema, root, d_code)
    return dict_form