Skip to content

Commit

Permalink
generate lexc from unimorph
Browse files Browse the repository at this point in the history
  • Loading branch information
flammie committed Mar 30, 2024
1 parent bfc4684 commit f4c3dfb
Showing 1 changed file with 254 additions and 0 deletions.
254 changes: 254 additions & 0 deletions scripts/unimorph/generate-lexcies.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,254 @@
#!/usr/bin/env python3
'''CLI program to turn UniMorph data into GiellaLT lexc lexicon entries.'''
import sys


# UniMorph feature -> GiellaLT tags emitted for it.  An empty tuple means the
# feature is recognised but intentionally produces no tag.
# NOTE(review): 'NDEF' and 'IND' both map to '+Ind'; kept as in the original
# mapping, confirm that this collision is intended.
_UNIMORPH_TO_GIELLA = {
    'N': ('+N',),
    'V': ('+V',),
    'ADJ': ('+A',),
    'NEUT': ('+Neu',),
    'MASC': ('+Msc',),
    'FEM': ('+Fem',),
    'MASC+FEM': ('+Common',),
    'GEN': ('+Gen',),
    'COM': ('+Com',),
    'ON+ESS': ('+Ses',),
    'FRML': ('+Ess',),
    'ESS': ('+Ess',),
    'INAN': ('+Inan',),
    'ANIM': ('+Anim',),
    'PRIV': ('+Abe',),
    'PRT': ('+Par',),
    'INS': ('+Ins',),
    'IN+ESS': ('+Ine',),
    'NOM': ('+Nom',),
    'ON+ALL': ('+Sub',),
    'AT+ALL': ('+All',),
    'PRP': ('+Loc',),
    'INST': ('+Inst',),
    'TRANS': ('+Tra',),
    'TERM': ('+Term',),
    'ON+ABL': ('+Del',),
    'IN+ABL': ('+Ela',),
    'IN+ALL': ('+Ill',),
    'DAT': ('+Dat',),
    'ACC': ('+Acc',),
    'AT+ESS': ('+Ade',),
    'AT+ABL': ('+Abl',),
    'SG': ('+Sg',),
    'DU': ('+Du',),
    'PL': ('+Pl',),
    'SG+PL': (),
    'IND': ('+Ind',),
    'PRS': ('+Prs',),
    'PST': ('+Prt',),
    'PRF': ('+Perf',),
    'FUT': ('+Fut',),
    '1': ('+1',),
    '2': ('+2',),
    '3': ('+3',),
    'INDF': (),  # unmarked in giellatags
    'GEADJ': ('+Gen',),  # additionally counted as suspicious
    'DEF': ('+Def',),
    'NDEF': ('+Ind',),
    'NFIN': ('+Ger',),
    'ACT': ('+Actv',),
    'PASS': ('+Pasv',),
    'COND': ('+Cond',),
    'POT': ('+Pot',),
    'IMP': ('+Imprt',),
    'SBJV': ('+Subj',),
    'V.CVB': ('+V', '+Der/Adv'),
    'CMPR': ('+Comp',),
    'SPRL': ('+Sup',),
    'NEG': ('+Neg',),
    'POS': (),
    'LGSPEC': (),
    'LGSPEC1': (),
}

_POS_TAGS = ('+N', '+V', '+A')
_NUMBER_TAGS = ('+Sg', '+Pl', '+Du')


def _giella_tags(features):
    '''Map a list of UniMorph features to GiellaLT tags, in input order.

    Returns a ``(tags, suspicious)`` pair where *suspicious* counts the
    features flagged as dubious (currently only ``GEADJ``).  Raises
    ``KeyError`` carrying the offending feature when no mapping exists.
    '''
    tags = []
    suspicious = 0
    for feature in features:
        if feature == 'V.PTCP':
            # The participle tag depends on the tense given on the same line.
            tags.append('+V')
            if 'PRS' in features:
                tags.append('+PrsPrc')
            elif 'PST' in features:
                tags.append('+PrtPrc')
            elif 'FUT' in features:
                tags.append('+Fut')
            else:
                tags.append('+Drv/Ptcp')
            continue
        if feature == 'GEADJ':
            suspicious += 1
        tags.extend(_UNIMORPH_TO_GIELLA[feature])
    return tags, suspicious


def _reorder_tags(tags):
    '''Return *tags* in output order, or ``None`` if no POS tag is present.

    The first part-of-speech tag found leads.  Nouns get their number tags
    next; every remaining tag follows once, in original order.  For verbs a
    separate person and number pair is fused into a single tag such as
    ``+Sg1``, which is appended at the end.
    '''
    pos = next((tag for tag in tags if tag in _POS_TAGS), None)
    if pos is None:
        return None
    ordered = [pos]
    if pos == '+N':
        ordered += [tag for tag in tags if tag in _NUMBER_TAGS]
    for tag in tags:
        if tag not in ordered:
            ordered.append(tag)
    if pos == '+V':
        # Same priority as the original chain: Sg1..Sg3, Du1..Du3, Pl1..Pl3;
        # only the first matching pair is fused.
        for number in ('+Sg', '+Du', '+Pl'):
            person = next((p for p in ('+1', '+2', '+3') if p in ordered),
                          None)
            if number in ordered and person is not None:
                ordered.append(number + person[1:])
                ordered.remove(person)
                ordered.remove(number)
                break
    return ordered


def main():
    '''Read UniMorph triples from stdin and print lexc-style entries.

    Each non-blank input line must hold three tab-separated fields: lemma,
    surface form and a ``;``-separated UniMorph feature bundle.  For every
    line one ``lemma+Tags:surface`` entry is written to stdout, preceded by
    a ``Multichar_Symbols`` header; a blank line separates lemmas and blank
    input lines are echoed as blank lines.

    Diagnostics and the final lemma/form/suspicious summary go to stderr so
    that they never end up inside the generated lexicon.  Exits with status
    1 on a malformed line or a bundle without a part of speech, and with
    status 2 on a UniMorph feature that has no mapping.
    '''
    # NOTE(review): this header does not list every tag the mapping can emit
    # (e.g. +Neu, +Par, +Sg1 is listed but +Ger is not); left byte-identical
    # to the original output.
    print('Multichar_Symbols')
    print('+N +A +V')
    print('+Sg +Pl +Du')
    print('+Nom +Acc +Dat +Gen +Loc +Ine +Ill +Abl +Lat +Ela')
    print('+Com +Abe +Tra +Ins +Ess')
    print('+Prs +Prt +Ind +Pot +Cond +Imprt')
    print('+Sg1 +Sg2 +Sg3 +Du1 +Du2 +Du3 +Pl1 +Pl2 +Pl3')
    lemmas = 0
    tokens = 0
    suspicious = 0
    prevlemma = None
    for line in sys.stdin:
        if not line.strip():
            print()
            continue
        # gravitáció	gravitáción	N;ON+ESS;SG
        fields = line.strip().split('\t')
        tokens += 1
        if len(fields) != 3:
            # Message is Finnish for "error in the data".
            print('Datoissa virhe!', fields, file=sys.stderr)
            sys.exit(1)
        if fields == ['----', '----', '----']:
            # Placeholder rows like this occur in UniMorph data; skip them.
            suspicious += 1
            continue
        lemma, surf, unimorphs = fields
        if lemma != prevlemma:
            prevlemma = lemma
            print()
            lemmas += 1
        # Surface forms carrying dictionary notes or alternatives are still
        # emitted, but counted so the summary shows how clean the data was.
        if ('intransitive verb' in surf or 'subjunctive forms' in surf
                or '|' in surf):
            suspicious += 1
        try:
            giellatags, dubious = _giella_tags(unimorphs.split(';'))
        except KeyError as missing:
            print('missing unimorph mapping for:', missing.args[0],
                  file=sys.stderr)
            sys.exit(2)
        suspicious += dubious
        reorg = _reorder_tags(giellatags)
        if reorg is None:
            print('REORG FAIL', [], file=sys.stderr)
            sys.exit(1)
        print(lemma, ''.join(reorg), ':', surf, sep='')
    print(lemmas, 'lemmas,', tokens, 'forms,', suspicious, 'suspicious',
          file=sys.stderr)



# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

0 comments on commit f4c3dfb

Please sign in to comment.