diff --git a/requirements.txt b/requirements.txt index 703f684..9f5087d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ click pytest pytest-ordering -http://www.opengrm.org/twiki/pub/GRM/PyniniDownload/pynini-2.0.5.tar.gz#egg=pynini +Cython +http://www.opengrm.org/twiki/pub/GRM/PyniniDownload/pynini-2.1.3.tar.gz#egg=pynini diff --git a/timur/fsts/deko_fst.py b/timur/fsts/deko_fst.py index 3662efd..d5fff72 100644 --- a/timur/fsts/deko_fst.py +++ b/timur/fsts/deko_fst.py @@ -349,7 +349,7 @@ def __construct_del_ge(self): Case-dependent deletion of the ge marker ''' - # delete at certain suffixes like 'ver' + # delete at certain prefixes like 'ver' return pynini.concat( pynini.transducer("", "", input_token_type=self.__syms.alphabet), pynini.concat( diff --git a/timur/fsts/map_fst.py b/timur/fsts/map_fst.py index 0c1947b..4de63f3 100644 --- a/timur/fsts/map_fst.py +++ b/timur/fsts/map_fst.py @@ -11,236 +11,211 @@ class MapFst: ''' def __init__(self, syms): + + with pynini.default_token_type(syms.alphabet): + + # store alphabet + self.__syms = syms + + # delete initial features + del_initial_features = pynini.cross("", syms.initial_features) + + # delete categories + del_cat_ext = pynini.cross("", pynini.union(syms.categories, syms.disjunctive_categories)) + + # delete stem types + del_stem_types = pynini.cross("", syms.stem_types) + + # delete prefix/suffix marker + del_prefix_suffix_marker = pynini.cross("", syms.prefix_suffix_marker) + + # insert prefix/suffix marker + insert_prefix_suffix_marker = pynini.cross(syms.prefix_suffix_marker, "") + + # delete stem type features + del_stem_type_feats = pynini.cross("", syms.stem_type_features) + + # delete origin features + del_origin_feats = pynini.cross("", syms.origin_features) + + # delete complexity agreement features + del_complexity_agreement_feats = pynini.cross("", syms.complexity_agreement_features) + + # delete word complexity features + del_complex_lex_entries = pynini.cross("", 
syms.complexity_entry_features) + + # insert word complexity features + insert_complex_lex_entries = pynini.cross(syms.complexity_entry_features, "") + + # inflection classes + del_infl_classes = pynini.cross("", syms.inflection_classes) + + # disjunctive features + disjunctive_feat_list = ["", "", "", "", + "", "", "", "", + "", "", "", "", + "", "", "", "", + "", "", "", "", + "", "", "", + "", "", "", "", + "", "", "", "", + "", "", "", "", + "","","", + "", "", + "", "", "", + "", "", ""] + disjunctive_feats = pynini.string_map(disjunctive_feat_list).project("input").optimize() + del_disjunctive_feats = pynini.cross("", disjunctive_feats) + + # short cut: map_helper1 + map_helper1 = pynini.union( + syms.characters, + pynini.accep(""), + pynini.accep(""), + pynini.cross("e", ""), + pynini.cross("n", ""), + pynini.cross("e", ""), + pynini.cross("d", ""), + pynini.cross("", "<~n>"), + pynini.cross("", "
    "), + del_stem_types, + syms.prefix_suffix_marker, + del_stem_type_feats, + pynini.cross("", ""), + del_origin_feats, + del_complexity_agreement_feats, + del_complex_lex_entries, + del_infl_classes, + del_disjunctive_feats, + ).closure().optimize() + + # short cut: map_helper2 + map_helper2 = pynini.concat( + map_helper1, + pynini.concat( + pynini.concat( + syms.characters, + pynini.union( + pynini.union( + syms.characters, + pynini.accep(""), + pynini.accep("") + ), + syms.categories + ).closure(), + ).closure(0, 1), + map_helper1 + ) + ).optimize() - # store alphabet - self.__syms = syms - - # delete initial features - del_initial_features = pynini.transducer("", syms.initial_features) - - # delete categories - del_cat_ext = pynini.transducer("", pynini.union(syms.categories, syms.disjunctive_categories)) - - # delete stem types - del_stem_types = pynini.transducer("", syms.stem_types) - - # delete prefix/suffix marker - del_prefix_suffix_marker = pynini.transducer("", syms.prefix_suffix_marker) - - # insert prefix/suffix marker - insert_prefix_suffix_marker = pynini.transducer(syms.prefix_suffix_marker, "") - - # delete stem type features - del_stem_type_feats = pynini.transducer("", syms.stem_type_features) - - # delete origin features - del_origin_feats = pynini.transducer("", syms.origin_features) - - # delete complexity agreement features - del_complexity_agreement_feats = pynini.transducer("", syms.complexity_agreement_features) - - # delete word complexity features - del_complex_lex_entries = pynini.transducer("", syms.complexity_entry_features) - - # insert word complexity features - insert_complex_lex_entries = pynini.transducer(syms.complexity_entry_features, "") - - # inflection classes - del_infl_classes = pynini.transducer("", syms.inflection_classes) - - # disjunctive features - disjunctive_feat_list = ["", "", "", "", - "", "", "", "", - "", "", "", "", - "", "", "", "", - "", "", "", "", - "", "", "", - "", "", "", "", - "", "", "", "", - "", 
"", "", "", - "","","", - "", "", - "", "", "", - "", "", ""] - disjunctive_feats = pynini.string_map(disjunctive_feat_list, input_token_type=syms.alphabet, output_token_type=syms.alphabet).project().optimize() - del_disjunctive_feats = pynini.transducer("", disjunctive_feats) - - # short cut: map_helper1 - map_helper1 = pynini.union( - syms.characters, - pynini.acceptor("", token_type=syms.alphabet), - pynini.acceptor("", token_type=syms.alphabet), - pynini.transducer("e", "", input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("n", "", input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("e", "", input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("d", "", input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", "<~n>", output_token_type=syms.alphabet), - pynini.transducer("", "
      ", output_token_type=syms.alphabet), - del_stem_types, - syms.prefix_suffix_marker, - del_stem_type_feats, - pynini.transducer("", "", output_token_type=syms.alphabet), - del_origin_feats, - del_complexity_agreement_feats, - del_complex_lex_entries, - del_infl_classes, - del_disjunctive_feats, - ).closure().optimize() - - # short cut: map_helper2 - map_helper2 = pynini.concat( - map_helper1, - pynini.concat( + # + self.__map1 = pynini.concat( + del_initial_features.closure(), pynini.concat( - syms.characters, pynini.union( - pynini.union( - syms.characters, - pynini.acceptor("", token_type=syms.alphabet), - pynini.acceptor("", token_type=syms.alphabet) + pynini.concat( + pynini.cross("", pynini.string_map(["", ""]).project("input")), + pynini.concat(map_helper2, del_cat_ext) ), - syms.categories - ).closure(), - ).closure(0, 1), - map_helper1 - ) - ).optimize() - - # - self.__map1 = pynini.concat( - del_initial_features.closure(), - pynini.concat( - pynini.union( - pynini.concat( - pynini.transducer("", pynini.string_map(["", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project()), - pynini.concat(map_helper2, del_cat_ext) - ), - pynini.concat( - pynini.transducer("", pynini.string_map(["", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project()), - pynini.concat(map_helper2, syms.categories) - ), - pynini.concat( - pynini.transducer("", "", output_token_type=syms.alphabet), - map_helper1, - del_cat_ext - ), - pynini.concat( - pynini.transducer("", "", output_token_type=syms.alphabet), - map_helper1, - del_cat_ext, - map_helper1, - syms.categories, - pynini.transducer("", "", output_token_type=syms.alphabet) - ), - pynini.concat( - pynini.transducer("", "", output_token_type=syms.alphabet), - map_helper1, - del_cat_ext, pynini.concat( - map_helper1, - del_cat_ext, - pynini.acceptor("", token_type=syms.alphabet) - ).closure(1), - pynini.transducer("", "", output_token_type=syms.alphabet) + pynini.cross("", 
pynini.string_map(["", ""]).project("input")), + pynini.concat(map_helper2, syms.categories) + ), + pynini.cross("", "") + map_helper1 + del_cat_ext, + pynini.cross("", "") + map_helper1 + del_cat_ext + + map_helper1 + syms.categories + pynini.cross("", ""), + pynini.cross("", "") + map_helper1 + del_cat_ext + + pynini.concat(map_helper1, del_cat_ext + pynini.accep("")).closure(1) + + pynini.cross("", ""), + pynini.cross("", "") + map_helper1 + del_cat_ext + + pynini.concat(map_helper1, syms.categories + pynini.accep("")).closure(1) + + pynini.cross("", pynini.string_map(["", ""]).project("input")) ), - pynini.concat( - pynini.transducer("", "", output_token_type=syms.alphabet), - map_helper1, - del_cat_ext, - pynini.concat( - map_helper1, - syms.categories, - pynini.acceptor("", token_type=syms.alphabet) - ).closure(1), - pynini.transducer("", pynini.string_map(["", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project()) - ) - ), - map_helper1, - ) - ).optimize() - - split_origin_features = pynini.union( - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), 
input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), 
input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map( ["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), 
input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), 
input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet), - pynini.transducer("", pynini.string_map(["", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), input_token_type=syms.alphabet, output_token_type=syms.alphabet) - ).optimize() - - map_helper3 = pynini.union( - syms.characters, - syms.circumfix_features, - syms.initial_features, - syms.stem_types, - syms.categories, - insert_prefix_suffix_marker, - syms.stem_type_features, - syms.origin_features, - syms.complexity_agreement_features, - insert_complex_lex_entries, - syms.inflection_classes, - self.__split_disjunctive_feats(disjunctive_feat_list), - split_origin_features + map_helper1, + ) ).optimize() - self.__map2 = pynini.concat( - map_helper3.closure(), - pynini.concat( - pynini.transducer("e", "", input_token_type=syms.alphabet, output_token_type=syms.alphabet), + split_origin_features = pynini.union( + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", 
"", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map( ["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")), + pynini.cross("", pynini.string_map(["", "", ""]).project("input")) + ).optimize() + + map_helper3 = pynini.union( + syms.characters, + syms.circumfix_features, + syms.initial_features, + syms.stem_types, + syms.categories, + insert_prefix_suffix_marker, + syms.stem_type_features, + syms.origin_features, + syms.complexity_agreement_features, + 
insert_complex_lex_entries, + syms.inflection_classes, + self.__split_disjunctive_feats(disjunctive_feat_list), + split_origin_features + ).optimize() + + self.__map2 = pynini.concat( + map_helper3.closure(), pynini.concat( - pynini.string_map(["l", "r"], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project(), + pynini.cross("e", ""), pynini.concat( - pynini.string_map(["", "", "", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project().closure(0,1), + pynini.string_map(["l", "r"]).project("input"), pynini.concat( - pynini.acceptor("", token_type=syms.alphabet), + pynini.string_map(["", "", "", ""]).project("input").closure(0,1), pynini.concat( - pynini.string_map(["", ""], input_token_type=syms.alphabet, output_token_type=syms.alphabet).project().closure(0,1), + pynini.accep(""), pynini.concat( - pynini.acceptor(" ", token_type=syms.alphabet), + pynini.string_map(["", ""]).project("input").closure(0,1), pynini.concat( - insert_complex_lex_entries.closure(0,1), - pynini.acceptor("", token_type=syms.alphabet) + pynini.accep(" "), + pynini.concat( + insert_complex_lex_entries.closure(0,1), + pynini.accep("") + ) ) ) ) ) ) - ) - ).closure(0,1) - ).optimize() - + ).closure(0,1) + ).optimize() @property def map1(self): @@ -251,10 +226,11 @@ def map2(self): return self.__map2 def __split_disjunctive_feats(self, disjunctive_feat_list): - single_splits = [] - for disjunctive_feat in disjunctive_feat_list: - splitted = [] - for cat in disjunctive_feat[1:-1].split(","): - splitted.append("<" + cat + ">") - single_splits.append(pynini.transducer(disjunctive_feat, pynini.string_map(splitted, input_token_type=self.__syms.alphabet, output_token_type=self.__syms.alphabet), input_token_type=self.__syms.alphabet, output_token_type=self.__syms.alphabet)) - return pynini.union(*(single_splits)).optimize() + with pynini.default_token_type(self.__syms.alphabet): + single_splits = [] + for disjunctive_feat in disjunctive_feat_list: + splitted 
= [] + for cat in disjunctive_feat[1:-1].split(","): + splitted.append("<" + cat + ">") + single_splits.append(pynini.cross(disjunctive_feat, pynini.string_map(splitted))) + return pynini.union(*(single_splits)).optimize() diff --git a/timur/fsts/sublexica.py b/timur/fsts/sublexica.py index f6bc203..0c0bd63 100644 --- a/timur/fsts/sublexica.py +++ b/timur/fsts/sublexica.py @@ -12,48 +12,50 @@ class Sublexica: def __init__(self, syms, lexicon): - # - # store alphabet - self.__syms = syms - - # - # store lexicon - self.__lex = lexicon - - - # - # (private) helpers - self.__sigma_star = pynini.union( - syms.characters, - syms.categories, - syms.stem_types, - syms.stem_type_features, - syms.origin_features, - syms.circumfix_features, - syms.inflection_classes, - syms.geo_inflection_classes, - pynini.acceptor("", token_type=syms.alphabet) # for word-internal (ausgewertet) - ).closure().optimize() - - # - # NoDef2NULL - self.__nodef_to_null = pynini.union( - self.__sigma_star, - syms.origin_features, - pynini.transducer("", "", input_token_type=self.__syms.alphabet), - syms.stem_types - ).closure().optimize() - - # - # sublexica - self.__bdk_stems = self.__construct_bdk_stems() - self.__base_stems = self.__construct_base_stems() - self.__pref_stems = self.__construct_pref_stems() - self.__verbal_pref_stems = self.__construct_verbal_pref_stems() - self.__simplex_suff_stems = self.__construct_simplex_suff_stems() - self.__suff_deriv_suff_stems = self.__construct_suff_deriv_suff_stems() - self.__pref_deriv_suff_stems = self.__construct_pref_deriv_suff_stems() - self.__quant_suff_stems = self.__construct_quant_suff_stems() + with pynini.default_token_type(syms.alphabet): + + # + # store alphabet + self.__syms = syms + + # + # store lexicon + self.__lex = lexicon + + + # + # (private) helpers + self.__sigma_star = pynini.union( + syms.characters, + syms.categories, + syms.stem_types, + syms.stem_type_features, + syms.origin_features, + syms.circumfix_features, + 
syms.inflection_classes, + syms.geo_inflection_classes, + pynini.accep("") # for word-internal (ausgewertet) + ).closure().optimize() + + # + # NoDef2NULL + self.__nodef_to_null = pynini.union( + self.__sigma_star, + syms.origin_features, + pynini.cross("", ""), + syms.stem_types + ).closure().optimize() + + # + # sublexica + self.__bdk_stems = self.__construct_bdk_stems() + self.__base_stems = self.__construct_base_stems() + self.__pref_stems = self.__construct_pref_stems() + self.__verbal_pref_stems = self.__construct_verbal_pref_stems() + self.__simplex_suff_stems = self.__construct_simplex_suff_stems() + self.__suff_deriv_suff_stems = self.__construct_suff_deriv_suff_stems() + self.__pref_deriv_suff_stems = self.__construct_pref_deriv_suff_stems() + self.__quant_suff_stems = self.__construct_quant_suff_stems() @property def nodef_to_null(self): @@ -122,109 +124,117 @@ def __construct_bdk_stems(self): ''' Base, derivation and compound stems (without derivation suffixes) ''' - return pynini.compose( - self.__lex, - pynini.concat( - self.__syms.initial_features.closure(), - pynini.string_map(["", "", ""], input_token_type=self.__syms.alphabet, output_token_type=self.__syms.alphabet).project(), - self.__sigma_star - ) - ).optimize() + with pynini.default_token_type(self.__syms.alphabet): + return pynini.compose( + self.__lex, + pynini.concat( + self.__syms.initial_features.closure(), + pynini.string_map(["", "", ""]).project("input"), + self.__sigma_star + ) + ).optimize() def __construct_base_stems(self): ''' Base stems ''' - return pynini.compose( - self.__bdk_stems, - pynini.concat( - self.__syms.initial_features.closure(), - pynini.acceptor("", token_type=self.__syms.alphabet), - self.__sigma_star - ) - ).optimize() + with pynini.default_token_type(self.__syms.alphabet): + return pynini.compose( + self.__bdk_stems, + pynini.concat( + self.__syms.initial_features.closure(), + pynini.accep(""), + self.__sigma_star + ) + ).optimize() def 
__construct_pref_stems(self): ''' Prefix stems ''' - return pynini.compose( - self.__lex, - pynini.concat( - self.__syms.initial_features.closure(), - pynini.acceptor("", token_type=self.__syms.alphabet), - self.__sigma_star - ) - ).optimize() + with pynini.default_token_type(self.__syms.alphabet): + return pynini.compose( + self.__lex, + pynini.concat( + self.__syms.initial_features.closure(), + pynini.accep(""), + self.__sigma_star + ) + ).optimize() def __construct_verbal_pref_stems(self): ''' Verbal prefix stems ''' - return pynini.compose( - self.__pref_stems, - pynini.concat( - self.__syms.initial_features.closure(), - pynini.acceptor("", token_type=self.__syms.alphabet), - self.__sigma_star, - pynini.acceptor("", token_type=self.__syms.alphabet), - self.__sigma_star - ) - ).optimize() + with pynini.default_token_type(self.__syms.alphabet): + return pynini.compose( + self.__pref_stems, + pynini.concat( + self.__syms.initial_features.closure(), + pynini.accep(""), + self.__sigma_star, + pynini.accep("", token_type=self.__syms.alphabet), + self.__sigma_star + ) + ).optimize() def __construct_simplex_suff_stems(self): ''' Derivation suffixes which combine with simplex stems ''' - return pynini.compose( - self.__lex, - pynini.concat( - self.__syms.initial_features.closure(), - pynini.acceptor("", token_type=self.__syms.alphabet), - pynini.transducer("", "", input_token_type=self.__syms.alphabet), - self.__sigma_star - ) - ).optimize() + with pynini.default_token_type(self.__syms.alphabet): + return pynini.compose( + self.__lex, + pynini.concat( + self.__syms.initial_features.closure(), + pynini.accep(""), + pynini.cross("", ""), + self.__sigma_star + ) + ).optimize() def __construct_suff_deriv_suff_stems(self): ''' Derivation suffixes which combine with suffixed stems ''' - return pynini.compose( - self.__lex, - pynini.concat( - self.__syms.initial_features.closure(), - pynini.acceptor("", token_type=self.__syms.alphabet), - pynini.transducer("", "", 
input_token_type=self.__syms.alphabet), - self.__sigma_star - ) - ).optimize() + with pynini.default_token_type(self.__syms.alphabet): + return pynini.compose( + self.__lex, + pynini.concat( + self.__syms.initial_features.closure(), + pynini.accep(""), + pynini.cross("", ""), + self.__sigma_star + ) + ).optimize() def __construct_pref_deriv_suff_stems(self): ''' Derivation suffixes which combine with prefixed stems ''' - return pynini.compose( - self.__lex, - pynini.concat( - self.__syms.initial_features.closure(), - pynini.acceptor("", token_type=self.__syms.alphabet), - pynini.transducer("", "", input_token_type=self.__syms.alphabet), - self.__sigma_star - ) - ).optimize() + with pynini.default_token_type(self.__syms.alphabet): + return pynini.compose( + self.__lex, + pynini.concat( + self.__syms.initial_features.closure(), + pynini.accep(""), + pynini.cross("", ""), + self.__sigma_star + ) + ).optimize() def __construct_quant_suff_stems(self): ''' Derivation suffixes which combine with a number and a simplex stem ''' - return pynini.compose( - self.__lex, - pynini.concat( - pynini.transducer("", "", input_token_type=self.__syms.alphabet), - self.__syms.initial_features.closure(), - pynini.acceptor("", token_type=self.__syms.alphabet), - pynini.transducer("", "", input_token_type=self.__syms.alphabet), - self.__sigma_star - ) - ).optimize() + with pynini.default_token_type(self.__syms.alphabet): + return pynini.compose( + self.__lex, + pynini.concat( + pynini.cross("", ""), + self.__syms.initial_features.closure(), + pynini.accep(""), + pynini.cross("", ""), + self.__sigma_star + ) + ).optimize() diff --git a/timur/fsts/timur_fst.py b/timur/fsts/timur_fst.py index 92ac693..ed9ed03 100644 --- a/timur/fsts/timur_fst.py +++ b/timur/fsts/timur_fst.py @@ -106,7 +106,7 @@ def build(self, lexicon_stream): mappings = fsts.MapFst(self.__syms) # delete certain symbols on the upper and lower level - lex = mappings.map1 * lex * mappings.map2 + lex = mappings.map1 @ lex @ 
mappings.map2 lex.draw("lex_map.dot", portrait=True) # diff --git a/timur/helpers/helpers.py b/timur/helpers/helpers.py index 9df555b..be379b3 100644 --- a/timur/helpers/helpers.py +++ b/timur/helpers/helpers.py @@ -23,9 +23,11 @@ def load_lexicon(source, symbol_table): tmp.set_final(start) for token in tokenizer.findall(line): if token[1]: - tmp = pynini.concat(tmp, pynini.transducer(token[0], token[1], input_token_type=symbol_table, output_token_type=symbol_table)) + upper = pynini.accep(token[0], token_type=symbol_table) + lower = pynini.accep(token[1], token_type=symbol_table) + tmp = pynini.concat(tmp, pynini.cross(upper, lower)) else: - tmp = pynini.concat(tmp, pynini.acceptor(token[0], token_type=symbol_table)) + tmp = pynini.concat(tmp, pynini.accep(token[0], token_type=symbol_table)) lex = pynini.union(lex, tmp) return lex diff --git a/timur/symbols/symbols.py b/timur/symbols/symbols.py index 5b62bc9..b493880 100644 --- a/timur/symbols/symbols.py +++ b/timur/symbols/symbols.py @@ -43,44 +43,44 @@ def __init__(self, alphabet): else: chars_to_lower.append((symbol, symbol)) chars_to_upper.append((symbol, symbol)) - self.__characters = pynini.string_map(chars, input_token_type=alphabet, output_token_type=alphabet).project().optimize() - self.__characters_upper = pynini.string_map(chars_upper, input_token_type=alphabet, output_token_type=alphabet).project().optimize() - self.__characters_lower = pynini.string_map(chars_lower, input_token_type=alphabet, output_token_type=alphabet).project().optimize() + self.__characters = pynini.string_map(chars, input_token_type=alphabet, output_token_type=alphabet).project("input").optimize() + self.__characters_upper = pynini.string_map(chars_upper, input_token_type=alphabet, output_token_type=alphabet).project("input").optimize() + self.__characters_lower = pynini.string_map(chars_lower, input_token_type=alphabet, output_token_type=alphabet).project("input").optimize()
self.__characters_to_upper = pynini.string_map(chars_to_upper, input_token_type=alphabet, output_token_type=alphabet).optimize() self.__characters_to_lower = pynini.string_map(chars_to_lower, input_token_type=alphabet, output_token_type=alphabet).optimize() - self.__consonants_lower = pynini.string_map(["b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z", "ß"], input_token_type=alphabet, output_token_type=alphabet).project().optimize() + self.__consonants_lower = pynini.string_map(["b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z", "ß"], input_token_type=alphabet, output_token_type=alphabet).project("input").optimize() - self.__consonants_upper = pynini.string_map(["B", "C", "D", "F", "G", "H", "J", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "X", "Y", "Z"], input_token_type=alphabet, output_token_type=alphabet).project().optimize() + self.__consonants_upper = pynini.string_map(["B", "C", "D", "F", "G", "H", "J", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "X", "Y", "Z"], input_token_type=alphabet, output_token_type=alphabet).project("input").optimize() self.__consonants = self.__consonants_lower | self.__consonants_upper - self.__inititial_features = pynini.string_map(["", "", "", "", "", "", ""], input_token_type=alphabet, output_token_type=alphabet).project().optimize() + self.__inititial_features = pynini.string_map(["", "", "", "", "", "", ""], input_token_type=alphabet, output_token_type=alphabet).project("input").optimize() - self.__categories = pynini.string_map(["", "", "", "", "", "", "", "", "", "", "", "", ""], input_token_type=alphabet, output_token_type=alphabet).project().optimize() + self.__categories = pynini.string_map(["", "", "", "", "", "", "", "", "", "", "", "", ""], input_token_type=alphabet, output_token_type=alphabet).project("input").optimize() - self.__disjunctive_categories = pynini.string_map(["", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""], input_token_type=alphabet, output_token_type=alphabet).project().optimize() + self.__disjunctive_categories = pynini.string_map(["", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""], input_token_type=alphabet, output_token_type=alphabet).project("input").optimize() - self.__base_stem_types = pynini.string_map(["", "", ""], input_token_type=alphabet, output_token_type=alphabet).project().optimize() + self.__base_stem_types = pynini.string_map(["", "", ""], input_token_type=alphabet, output_token_type=alphabet).project("input").optimize() - self.__stem_types = pynini.string_map(["", "", "", "", ""], input_token_type=alphabet, output_token_type=alphabet).project().optimize() + self.__stem_types = pynini.string_map(["", "", "", "", ""], input_token_type=alphabet, output_token_type=alphabet).project("input").optimize() - self.__prefix_suffix_marker = pynini.string_map(["", "", "", "", ""], input_token_type=alphabet, output_token_type=alphabet).project().optimize() + self.__prefix_suffix_marker = pynini.string_map(["", "", "", "", ""], input_token_type=alphabet, output_token_type=alphabet).project("input").optimize() - self.__stem_type_features = pynini.string_map(["", "", ""], input_token_type=alphabet, output_token_type=alphabet).project().optimize() + self.__stem_type_features = pynini.string_map(["", "", ""], input_token_type=alphabet, output_token_type=alphabet).project("input").optimize() - self.__complexity_agreement_features = pynini.string_map(["", "", "", ""], input_token_type=alphabet, output_token_type=alphabet).project().optimize() + self.__complexity_agreement_features = pynini.string_map(["", "", "", ""], input_token_type=alphabet, output_token_type=alphabet).project("input").optimize() - self.__complexity_entry_features = pynini.string_map(["", "", "", "", "", ""], input_token_type=alphabet, output_token_type=alphabet).project().optimize() + 
self.__complexity_entry_features = pynini.string_map(["", "", "", "", "", ""], input_token_type=alphabet, output_token_type=alphabet).project("input").optimize() - self.__origin_features = pynini.string_map(["", "", "", "", "", "", "", ""], input_token_type=alphabet, output_token_type=alphabet).project().optimize() + self.__origin_features = pynini.string_map(["", "", "", "", "", "", "", ""], input_token_type=alphabet, output_token_type=alphabet).project("input").optimize() - self.__circumfix_features = pynini.string_map([""], input_token_type=alphabet, output_token_type=alphabet).project().optimize() + self.__circumfix_features = pynini.string_map([""], input_token_type=alphabet, output_token_type=alphabet).project("input").optimize() self.__ns_features = pynini.string_map(["", "", "", "", "", ""], - input_token_type=alphabet, output_token_type=alphabet).project().optimize() + input_token_type=alphabet, output_token_type=alphabet).project("input").optimize() self.__geo_inflection_classes = pynini.string_map(["", "", "", @@ -132,7 +132,7 @@ def __init__(self, alphabet): "", "", "", - "", "", ""], input_token_type=alphabet, output_token_type=alphabet).project().optimize() + "", "", ""], input_token_type=alphabet, output_token_type=alphabet).project("input").optimize() self.__inflection_classes = pynini.string_map([ "", "", "", "", @@ -192,9 +192,9 @@ def __init__(self, alphabet): "", "", "", "", "", "", "", "", "", "", "", "", - "", "", ""], input_token_type=alphabet, output_token_type=alphabet).project().optimize() + "", "", ""], input_token_type=alphabet, output_token_type=alphabet).project("input").optimize() - self.__gender = pynini.string_map(["", "", "", ""], input_token_type=alphabet, output_token_type=alphabet).project().optimize() + self.__gender = pynini.string_map(["", "", "", ""], input_token_type=alphabet, output_token_type=alphabet).project("input").optimize() # # access to the alphabet (pynini.SymbolTable)