
Migrate to new pynini interface (in progress)
wrznr committed Jul 23, 2021
1 parent b7fac1d commit 834701f
Showing 7 changed files with 345 additions and 356 deletions.
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,4 +1,5 @@
 click
 pytest
 pytest-ordering
-http://www.opengrm.org/twiki/pub/GRM/PyniniDownload/pynini-2.0.5.tar.gz#egg=pynini
+Cython
+http://www.opengrm.org/twiki/pub/GRM/PyniniDownload/pynini-2.1.3.tar.gz#egg=pynini
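The pin moves from pynini 2.0.5 to 2.1.3, with Cython presumably added as a build dependency for pynini. The names this commit migrates to only exist from the 2.1 series onward, which a quick sanity check can confirm; a minimal sketch, not part of the repository:

import pynini

# the new-style names used throughout this commit; present from pynini 2.1 on
for name in ("accep", "cross", "default_token_type"):
    assert hasattr(pynini, name), f"installed pynini lacks {name}; need >= 2.1"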
2 changes: 1 addition & 1 deletion timur/fsts/deko_fst.py
@@ -349,7 +349,7 @@ def __construct_del_ge(self):
     Case-dependent deletion of the ge marker
     '''

-    # delete <ge> at certain suffixes like 'ver'
+    # delete <ge> at certain prefixes like 'ver'
     return pynini.concat(
       pynini.transducer("<no-ge>", "", input_token_type=self.__syms.alphabet),
       pynini.concat(
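This hunk only fixes the comment (ver- is a prefix, not a suffix); the call itself still uses the old 2.0.x pynini.transducer and is presumably queued for migration. For reference, a minimal sketch of the renaming the commit applies elsewhere, with an invented two-symbol table standing in for syms.alphabet:

import pynini

sym = pynini.SymbolTable()
sym.add_symbol("<eps>")    # id 0 is conventionally epsilon
sym.add_symbol("<no-ge>")

with pynini.default_token_type(sym):
    marker = pynini.accep("<no-ge>")          # 2.0.x: pynini.acceptor("<no-ge>", token_type=sym)
    del_marker = pynini.cross("<no-ge>", "")  # 2.0.x: pynini.transducer("<no-ge>", "", input_token_type=sym)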
412 changes: 194 additions & 218 deletions timur/fsts/map_fst.py

Large diffs are not rendered by default.

236 changes: 123 additions & 113 deletions timur/fsts/sublexica.py
@@ -12,48 +12,50 @@ class Sublexica:

   def __init__(self, syms, lexicon):

-    #
-    # store alphabet
-    self.__syms = syms
-
-    #
-    # store lexicon
-    self.__lex = lexicon
-
-
-    #
-    # (private) helpers
-    self.__sigma_star = pynini.union(
-      syms.characters,
-      syms.categories,
-      syms.stem_types,
-      syms.stem_type_features,
-      syms.origin_features,
-      syms.circumfix_features,
-      syms.inflection_classes,
-      syms.geo_inflection_classes,
-      pynini.acceptor("<ge>", token_type=syms.alphabet) # for word-internal <ge> (ausgewertet)
-    ).closure().optimize()
-
-    #
-    # NoDef2NULL
-    self.__nodef_to_null = pynini.union(
-      self.__sigma_star,
-      syms.origin_features,
-      pynini.transducer("<NoDef>", "", input_token_type=self.__syms.alphabet),
-      syms.stem_types
-    ).closure().optimize()
-
-    #
-    # sublexica
-    self.__bdk_stems = self.__construct_bdk_stems()
-    self.__base_stems = self.__construct_base_stems()
-    self.__pref_stems = self.__construct_pref_stems()
-    self.__verbal_pref_stems = self.__construct_verbal_pref_stems()
-    self.__simplex_suff_stems = self.__construct_simplex_suff_stems()
-    self.__suff_deriv_suff_stems = self.__construct_suff_deriv_suff_stems()
-    self.__pref_deriv_suff_stems = self.__construct_pref_deriv_suff_stems()
-    self.__quant_suff_stems = self.__construct_quant_suff_stems()
+    with pynini.default_token_type(syms.alphabet):
+
+      #
+      # store alphabet
+      self.__syms = syms
+
+      #
+      # store lexicon
+      self.__lex = lexicon
+
+
+      #
+      # (private) helpers
+      self.__sigma_star = pynini.union(
+        syms.characters,
+        syms.categories,
+        syms.stem_types,
+        syms.stem_type_features,
+        syms.origin_features,
+        syms.circumfix_features,
+        syms.inflection_classes,
+        syms.geo_inflection_classes,
+        pynini.accep("<ge>") # for word-internal <ge> (ausgewertet)
+      ).closure().optimize()
+
+      #
+      # NoDef2NULL
+      self.__nodef_to_null = pynini.union(
+        self.__sigma_star,
+        syms.origin_features,
+        pynini.cross("<NoDef>", ""),
+        syms.stem_types
+      ).closure().optimize()
+
+      #
+      # sublexica
+      self.__bdk_stems = self.__construct_bdk_stems()
+      self.__base_stems = self.__construct_base_stems()
+      self.__pref_stems = self.__construct_pref_stems()
+      self.__verbal_pref_stems = self.__construct_verbal_pref_stems()
+      self.__simplex_suff_stems = self.__construct_simplex_suff_stems()
+      self.__suff_deriv_suff_stems = self.__construct_suff_deriv_suff_stems()
+      self.__pref_deriv_suff_stems = self.__construct_pref_deriv_suff_stems()
+      self.__quant_suff_stems = self.__construct_quant_suff_stems()

   @property
   def nodef_to_null(self):
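The wrapping with pynini.default_token_type(syms.alphabet) is the heart of the migration: every string compiled inside the block uses that table, so the per-call input_token_type=/output_token_type= arguments can be dropped. A minimal sketch with an invented table:

import pynini

sym = pynini.SymbolTable()
sym.add_symbol("<eps>")
sym.add_symbol("<Base_Stems>")
sym.add_symbol("<NoDef>")

with pynini.default_token_type(sym):
    stems = pynini.accep("<Base_Stems>")      # compiled over sym
    drop_nodef = pynini.cross("<NoDef>", "")  # deletes the marker
    sigma = pynini.union(stems, drop_nodef).closure().optimize()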
@@ -122,109 +124,117 @@ def __construct_bdk_stems(self):
     '''
     Base, derivation and compound stems (without derivation suffixes)
     '''
-    return pynini.compose(
-      self.__lex,
-      pynini.concat(
-        self.__syms.initial_features.closure(),
-        pynini.string_map(["<Base_Stems>", "<Deriv_Stems>", "<Kompos_Stems>"], input_token_type=self.__syms.alphabet, output_token_type=self.__syms.alphabet).project(),
-        self.__sigma_star
-      )
-    ).optimize()
+    with pynini.default_token_type(self.__syms.alphabet):
+      return pynini.compose(
+        self.__lex,
+        pynini.concat(
+          self.__syms.initial_features.closure(),
+          pynini.string_map(["<Base_Stems>", "<Deriv_Stems>", "<Kompos_Stems>"]).project("input"),
+          self.__sigma_star
+        )
+      ).optimize()
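Besides the dropped token_type arguments, project no longer defaults to the input tape; the side is now passed explicitly ("input" here replaces the bare .project() of 2.0.x). A toy sketch over the default byte alphabet:

import pynini

pair = pynini.cross("abc", "xyz")     # transducer abc -> xyz
ins = pair.copy().project("input")    # acceptor for abc; 2.0.x: .project()
outs = pair.copy().project("output")  # acceptor for xyz; 2.0.x: .project(project_output=True)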

   def __construct_base_stems(self):
     '''
     Base stems
     '''
-    return pynini.compose(
-      self.__bdk_stems,
-      pynini.concat(
-        self.__syms.initial_features.closure(),
-        pynini.acceptor("<Base_Stems>", token_type=self.__syms.alphabet),
-        self.__sigma_star
-      )
-    ).optimize()
+    with pynini.default_token_type(self.__syms.alphabet):
+      return pynini.compose(
+        self.__bdk_stems,
+        pynini.concat(
+          self.__syms.initial_features.closure(),
+          pynini.accep("<Base_Stems>"),
+          self.__sigma_star
+        )
+      ).optimize()

   def __construct_pref_stems(self):
     '''
     Prefix stems
     '''
-    return pynini.compose(
-      self.__lex,
-      pynini.concat(
-        self.__syms.initial_features.closure(),
-        pynini.acceptor("<Pref_Stems>", token_type=self.__syms.alphabet),
-        self.__sigma_star
-      )
-    ).optimize()
+    with pynini.default_token_type(self.__syms.alphabet):
+      return pynini.compose(
+        self.__lex,
+        pynini.concat(
+          self.__syms.initial_features.closure(),
+          pynini.accep("<Pref_Stems>"),
+          self.__sigma_star
+        )
+      ).optimize()

   def __construct_verbal_pref_stems(self):
     '''
     Verbal prefix stems
     '''
-    return pynini.compose(
-      self.__pref_stems,
-      pynini.concat(
-        self.__syms.initial_features.closure(),
-        pynini.acceptor("<Pref_Stems>", token_type=self.__syms.alphabet),
-        self.__sigma_star,
-        pynini.acceptor("<V>", token_type=self.__syms.alphabet),
-        self.__sigma_star
-      )
-    ).optimize()
+    with pynini.default_token_type(self.__syms.alphabet):
+      return pynini.compose(
+        self.__pref_stems,
+        pynini.concat(
+          self.__syms.initial_features.closure(),
+          pynini.accep("<Pref_Stems>"),
+          self.__sigma_star,
+          pynini.accep("<V>", token_type=self.__syms.alphabet),
+          self.__sigma_star
+        )
+      ).optimize()
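Note the <V> acceptor above still passes an explicit token_type inside the with block; this is redundant but harmless, since the explicit argument and the ambient default name the same table. A sketch of the equivalence with an invented table:

import pynini

sym = pynini.SymbolTable()
sym.add_symbol("<eps>")
sym.add_symbol("<V>")

with pynini.default_token_type(sym):
    implicit = pynini.accep("<V>")                  # uses the ambient default
    explicit = pynini.accep("<V>", token_type=sym)  # same table, spelled out
    assert pynini.equal(implicit, explicit)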

   def __construct_simplex_suff_stems(self):
     '''
     Derivation suffixes which combine with simplex stems
     '''
-    return pynini.compose(
-      self.__lex,
-      pynini.concat(
-        self.__syms.initial_features.closure(),
-        pynini.acceptor("<Suff_Stems>", token_type=self.__syms.alphabet),
-        pynini.transducer("<simplex>", "", input_token_type=self.__syms.alphabet),
-        self.__sigma_star
-      )
-    ).optimize()
+    with pynini.default_token_type(self.__syms.alphabet):
+      return pynini.compose(
+        self.__lex,
+        pynini.concat(
+          self.__syms.initial_features.closure(),
+          pynini.accep("<Suff_Stems>"),
+          pynini.cross("<simplex>", ""),
+          self.__sigma_star
+        )
+      ).optimize()

   def __construct_suff_deriv_suff_stems(self):
     '''
     Derivation suffixes which combine with suffixed stems
     '''
-    return pynini.compose(
-      self.__lex,
-      pynini.concat(
-        self.__syms.initial_features.closure(),
-        pynini.acceptor("<Suff_Stems>", token_type=self.__syms.alphabet),
-        pynini.transducer("<suffderiv>", "", input_token_type=self.__syms.alphabet),
-        self.__sigma_star
-      )
-    ).optimize()
+    with pynini.default_token_type(self.__syms.alphabet):
+      return pynini.compose(
+        self.__lex,
+        pynini.concat(
+          self.__syms.initial_features.closure(),
+          pynini.accep("<Suff_Stems>"),
+          pynini.cross("<suffderiv>", ""),
+          self.__sigma_star
+        )
+      ).optimize()

   def __construct_pref_deriv_suff_stems(self):
     '''
     Derivation suffixes which combine with prefixed stems
     '''
-    return pynini.compose(
-      self.__lex,
-      pynini.concat(
-        self.__syms.initial_features.closure(),
-        pynini.acceptor("<Suff_Stems>", token_type=self.__syms.alphabet),
-        pynini.transducer("<prefderiv>", "", input_token_type=self.__syms.alphabet),
-        self.__sigma_star
-      )
-    ).optimize()
+    with pynini.default_token_type(self.__syms.alphabet):
+      return pynini.compose(
+        self.__lex,
+        pynini.concat(
+          self.__syms.initial_features.closure(),
+          pynini.accep("<Suff_Stems>"),
+          pynini.cross("<prefderiv>", ""),
+          self.__sigma_star
+        )
+      ).optimize()

   def __construct_quant_suff_stems(self):
     '''
     Derivation suffixes which combine with a number and a simplex stem
     '''
-    return pynini.compose(
-      self.__lex,
-      pynini.concat(
-        pynini.transducer("<QUANT>", "", input_token_type=self.__syms.alphabet),
-        self.__syms.initial_features.closure(),
-        pynini.acceptor("<Suff_Stems>", token_type=self.__syms.alphabet),
-        pynini.transducer("<simplex>", "", input_token_type=self.__syms.alphabet),
-        self.__sigma_star
-      )
-    ).optimize()
+    with pynini.default_token_type(self.__syms.alphabet):
+      return pynini.compose(
+        self.__lex,
+        pynini.concat(
+          pynini.cross("<QUANT>", ""),
+          self.__syms.initial_features.closure(),
+          pynini.accep("<Suff_Stems>"),
+          pynini.cross("<simplex>", ""),
+          self.__sigma_star
+        )
+      ).optimize()
2 changes: 1 addition & 1 deletion timur/fsts/timur_fst.py
@@ -106,7 +106,7 @@ def build(self, lexicon_stream):
     mappings = fsts.MapFst(self.__syms)

     # delete certain symbols on the upper and lower level
-    lex = mappings.map1 * lex * mappings.map2
+    lex = mappings.map1 @ lex @ mappings.map2
     lex.draw("lex_map.dot", portrait=True)

     #
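pynini 2.1 reassigns the overloaded operators: composition is now spelled @ (matching Python's matmul convention), where 2.0.x used *. A toy sketch over the byte alphabet:

import pynini

a_to_b = pynini.cross("a", "b")
b_to_c = pynini.cross("b", "c")
a_to_c = a_to_b @ b_to_c  # 2.0.x: a_to_b * b_to_c
print(a_to_c.copy().project("output").string())  # -> c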
6 changes: 4 additions & 2 deletions timur/helpers/helpers.py
@@ -23,9 +23,11 @@ def load_lexicon(source, symbol_table):
     tmp.set_final(start)
     for token in tokenizer.findall(line):
       if token[1]:
-        tmp = pynini.concat(tmp, pynini.transducer(token[0], token[1], input_token_type=symbol_table, output_token_type=symbol_table))
+        tmp1 = pynini.concat(tmp, pynini.accep(token[0], token_type=symbol_table))
+        tmp2 = pynini.concat(tmp, pynini.accep(token[1], token_type=symbol_table))
+        tmp = pynini.concat(tmp, pynini.cross(tmp1, tmp2))
       else:
-        tmp = pynini.concat(tmp, pynini.acceptor(token[0], token_type=symbol_table))
+        tmp = pynini.concat(tmp, pynini.accep(token[0], token_type=symbol_table))
     lex = pynini.union(lex, tmp)
   return lex
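This hunk looks like the "in progress" part of the commit: tmp1 and tmp2 each already contain the running tmp prefix, so crossing them and concatenating the result onto tmp again repeats that prefix on both tapes. A more literal translation of the removed transducer(...) call would cross just the two tokens; a sketch with an invented table:

import pynini

st = pynini.SymbolTable()
st.add_symbol("<eps>")
st.add_symbol("x")
st.add_symbol("y")

# 2.0.x: pynini.transducer("x", "y", input_token_type=st, output_token_type=st)
x_to_y = pynini.cross(pynini.accep("x", token_type=st),
                      pynini.accep("y", token_type=st))
tmp = pynini.concat(pynini.accep("", token_type=st), x_to_y)  # append to a running entry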

(diff for the seventh changed file not rendered)
