
Migrate to new pynini interface (in progress)
wrznr committed Jul 23, 2021
1 parent b7fac1d commit 834701f
Showing 7 changed files with 345 additions and 356 deletions.
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,4 +1,5 @@
 click
 pytest
 pytest-ordering
-http://www.opengrm.org/twiki/pub/GRM/PyniniDownload/pynini-2.0.5.tar.gz#egg=pynini
+Cython
+http://www.opengrm.org/twiki/pub/GRM/PyniniDownload/pynini-2.1.3.tar.gz#egg=pynini
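The pin moves from pynini 2.0.5 to 2.1.3, with Cython presumably added as a build dependency for pynini. The names this commit migrates to only exist from the 2.1 series onward, which a quick sanity check can confirm; a minimal sketch, not part of the repository:

import pynini

# the new-style names used throughout this commit; present from pynini 2.1 on
for name in ("accep", "cross", "default_token_type"):
    assert hasattr(pynini, name), f"installed pynini lacks {name}; need >= 2.1"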
2 changes: 1 addition & 1 deletion timur/fsts/deko_fst.py
@@ -349,7 +349,7 @@ def __construct_del_ge(self):
     Case-dependent deletion of the ge marker
     '''

-    # delete <ge> at certain suffixes like 'ver'
+    # delete <ge> at certain prefixes like 'ver'
     return pynini.concat(
       pynini.transducer("<no-ge>", "", input_token_type=self.__syms.alphabet),
       pynini.concat(
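This hunk only fixes the comment (ver- is a prefix, not a suffix); the call itself still uses the old 2.0.x pynini.transducer and is presumably queued for migration. For reference, a minimal sketch of the renaming the commit applies elsewhere, with an invented two-symbol table standing in for syms.alphabet:

import pynini

sym = pynini.SymbolTable()
sym.add_symbol("<eps>")    # id 0 is conventionally epsilon
sym.add_symbol("<no-ge>")

with pynini.default_token_type(sym):
    marker = pynini.accep("<no-ge>")          # 2.0.x: pynini.acceptor("<no-ge>", token_type=sym)
    del_marker = pynini.cross("<no-ge>", "")  # 2.0.x: pynini.transducer("<no-ge>", "", input_token_type=sym)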
412 changes: 194 additions & 218 deletions timur/fsts/map_fst.py

Large diffs are not rendered by default.

236 changes: 123 additions & 113 deletions timur/fsts/sublexica.py
@@ -12,48 +12,50 @@ class Sublexica:

   def __init__(self, syms, lexicon):

-    #
-    # store alphabet
-    self.__syms = syms
-
-    #
-    # store lexicon
-    self.__lex = lexicon
-
-
-    #
-    # (private) helpers
-    self.__sigma_star = pynini.union(
-      syms.characters,
-      syms.categories,
-      syms.stem_types,
-      syms.stem_type_features,
-      syms.origin_features,
-      syms.circumfix_features,
-      syms.inflection_classes,
-      syms.geo_inflection_classes,
-      pynini.acceptor("<ge>", token_type=syms.alphabet) # for word-internal <ge> (ausgewertet)
-    ).closure().optimize()
-
-    #
-    # NoDef2NULL
-    self.__nodef_to_null = pynini.union(
-      self.__sigma_star,
-      syms.origin_features,
-      pynini.transducer("<NoDef>", "", input_token_type=self.__syms.alphabet),
-      syms.stem_types
-    ).closure().optimize()
-
-    #
-    # sublexica
-    self.__bdk_stems = self.__construct_bdk_stems()
-    self.__base_stems = self.__construct_base_stems()
-    self.__pref_stems = self.__construct_pref_stems()
-    self.__verbal_pref_stems = self.__construct_verbal_pref_stems()
-    self.__simplex_suff_stems = self.__construct_simplex_suff_stems()
-    self.__suff_deriv_suff_stems = self.__construct_suff_deriv_suff_stems()
-    self.__pref_deriv_suff_stems = self.__construct_pref_deriv_suff_stems()
-    self.__quant_suff_stems = self.__construct_quant_suff_stems()
+    with pynini.default_token_type(syms.alphabet):
+
+      #
+      # store alphabet
+      self.__syms = syms
+
+      #
+      # store lexicon
+      self.__lex = lexicon
+
+
+      #
+      # (private) helpers
+      self.__sigma_star = pynini.union(
+        syms.characters,
+        syms.categories,
+        syms.stem_types,
+        syms.stem_type_features,
+        syms.origin_features,
+        syms.circumfix_features,
+        syms.inflection_classes,
+        syms.geo_inflection_classes,
+        pynini.accep("<ge>") # for word-internal <ge> (ausgewertet)
+      ).closure().optimize()
+
+      #
+      # NoDef2NULL
+      self.__nodef_to_null = pynini.union(
+        self.__sigma_star,
+        syms.origin_features,
+        pynini.cross("<NoDef>", ""),
+        syms.stem_types
+      ).closure().optimize()
+
+      #
+      # sublexica
+      self.__bdk_stems = self.__construct_bdk_stems()
+      self.__base_stems = self.__construct_base_stems()
+      self.__pref_stems = self.__construct_pref_stems()
+      self.__verbal_pref_stems = self.__construct_verbal_pref_stems()
+      self.__simplex_suff_stems = self.__construct_simplex_suff_stems()
+      self.__suff_deriv_suff_stems = self.__construct_suff_deriv_suff_stems()
+      self.__pref_deriv_suff_stems = self.__construct_pref_deriv_suff_stems()
+      self.__quant_suff_stems = self.__construct_quant_suff_stems()

   @property
   def nodef_to_null(self):
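The wrapping with pynini.default_token_type(syms.alphabet) is the heart of the migration: every string compiled inside the block uses that table, so the per-call input_token_type=/output_token_type= arguments can be dropped. A minimal sketch with an invented table:

import pynini

sym = pynini.SymbolTable()
sym.add_symbol("<eps>")
sym.add_symbol("<Base_Stems>")
sym.add_symbol("<NoDef>")

with pynini.default_token_type(sym):
    stems = pynini.accep("<Base_Stems>")      # compiled over sym
    drop_nodef = pynini.cross("<NoDef>", "")  # deletes the marker
    sigma = pynini.union(stems, drop_nodef).closure().optimize()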
@@ -122,109 +124,117 @@ def __construct_bdk_stems(self):
     '''
     Base, derivation and compound stems (without derivation suffixes)
     '''
-    return pynini.compose(
-      self.__lex,
-      pynini.concat(
-        self.__syms.initial_features.closure(),
-        pynini.string_map(["<Base_Stems>", "<Deriv_Stems>", "<Kompos_Stems>"], input_token_type=self.__syms.alphabet, output_token_type=self.__syms.alphabet).project(),
-        self.__sigma_star
-      )
-    ).optimize()
+    with pynini.default_token_type(self.__syms.alphabet):
+      return pynini.compose(
+        self.__lex,
+        pynini.concat(
+          self.__syms.initial_features.closure(),
+          pynini.string_map(["<Base_Stems>", "<Deriv_Stems>", "<Kompos_Stems>"]).project("input"),
+          self.__sigma_star
+        )
+      ).optimize()
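Besides the dropped token_type arguments, project no longer defaults to the input tape; the side is now passed explicitly ("input" here replaces the bare .project() of 2.0.x). A toy sketch over the default byte alphabet:

import pynini

pair = pynini.cross("abc", "xyz")     # transducer abc -> xyz
ins = pair.copy().project("input")    # acceptor for abc; 2.0.x: .project()
outs = pair.copy().project("output")  # acceptor for xyz; 2.0.x: .project(project_output=True)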

   def __construct_base_stems(self):
     '''
     Base stems
     '''
-    return pynini.compose(
-      self.__bdk_stems,
-      pynini.concat(
-        self.__syms.initial_features.closure(),
-        pynini.acceptor("<Base_Stems>", token_type=self.__syms.alphabet),
-        self.__sigma_star
-      )
-    ).optimize()
+    with pynini.default_token_type(self.__syms.alphabet):
+      return pynini.compose(
+        self.__bdk_stems,
+        pynini.concat(
+          self.__syms.initial_features.closure(),
+          pynini.accep("<Base_Stems>"),
+          self.__sigma_star
+        )
+      ).optimize()

   def __construct_pref_stems(self):
     '''
     Prefix stems
     '''
-    return pynini.compose(
-      self.__lex,
-      pynini.concat(
-        self.__syms.initial_features.closure(),
-        pynini.acceptor("<Pref_Stems>", token_type=self.__syms.alphabet),
-        self.__sigma_star
-      )
-    ).optimize()
+    with pynini.default_token_type(self.__syms.alphabet):
+      return pynini.compose(
+        self.__lex,
+        pynini.concat(
+          self.__syms.initial_features.closure(),
+          pynini.accep("<Pref_Stems>"),
+          self.__sigma_star
+        )
+      ).optimize()

   def __construct_verbal_pref_stems(self):
     '''
     Verbal prefix stems
     '''
-    return pynini.compose(
-      self.__pref_stems,
-      pynini.concat(
-        self.__syms.initial_features.closure(),
-        pynini.acceptor("<Pref_Stems>", token_type=self.__syms.alphabet),
-        self.__sigma_star,
-        pynini.acceptor("<V>", token_type=self.__syms.alphabet),
-        self.__sigma_star
-      )
-    ).optimize()
+    with pynini.default_token_type(self.__syms.alphabet):
+      return pynini.compose(
+        self.__pref_stems,
+        pynini.concat(
+          self.__syms.initial_features.closure(),
+          pynini.accep("<Pref_Stems>"),
+          self.__sigma_star,
+          pynini.accep("<V>", token_type=self.__syms.alphabet),
+          self.__sigma_star
+        )
+      ).optimize()
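Note the <V> acceptor above still passes an explicit token_type inside the with block; this is redundant but harmless, since the explicit argument and the ambient default name the same table. A sketch of the equivalence with an invented table:

import pynini

sym = pynini.SymbolTable()
sym.add_symbol("<eps>")
sym.add_symbol("<V>")

with pynini.default_token_type(sym):
    implicit = pynini.accep("<V>")                  # uses the ambient default
    explicit = pynini.accep("<V>", token_type=sym)  # same table, spelled out
    assert pynini.equal(implicit, explicit)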

   def __construct_simplex_suff_stems(self):
     '''
     Derivation suffixes which combine with simplex stems
     '''
-    return pynini.compose(
-      self.__lex,
-      pynini.concat(
-        self.__syms.initial_features.closure(),
-        pynini.acceptor("<Suff_Stems>", token_type=self.__syms.alphabet),
-        pynini.transducer("<simplex>", "", input_token_type=self.__syms.alphabet),
-        self.__sigma_star
-      )
-    ).optimize()
+    with pynini.default_token_type(self.__syms.alphabet):
+      return pynini.compose(
+        self.__lex,
+        pynini.concat(
+          self.__syms.initial_features.closure(),
+          pynini.accep("<Suff_Stems>"),
+          pynini.cross("<simplex>", ""),
+          self.__sigma_star
+        )
+      ).optimize()

   def __construct_suff_deriv_suff_stems(self):
     '''
     Derivation suffixes which combine with suffixed stems
     '''
-    return pynini.compose(
-      self.__lex,
-      pynini.concat(
-        self.__syms.initial_features.closure(),
-        pynini.acceptor("<Suff_Stems>", token_type=self.__syms.alphabet),
-        pynini.transducer("<suffderiv>", "", input_token_type=self.__syms.alphabet),
-        self.__sigma_star
-      )
-    ).optimize()
+    with pynini.default_token_type(self.__syms.alphabet):
+      return pynini.compose(
+        self.__lex,
+        pynini.concat(
+          self.__syms.initial_features.closure(),
+          pynini.accep("<Suff_Stems>"),
+          pynini.cross("<suffderiv>", ""),
+          self.__sigma_star
+        )
+      ).optimize()

   def __construct_pref_deriv_suff_stems(self):
     '''
     Derivation suffixes which combine with prefixed stems
     '''
-    return pynini.compose(
-      self.__lex,
-      pynini.concat(
-        self.__syms.initial_features.closure(),
-        pynini.acceptor("<Suff_Stems>", token_type=self.__syms.alphabet),
-        pynini.transducer("<prefderiv>", "", input_token_type=self.__syms.alphabet),
-        self.__sigma_star
-      )
-    ).optimize()
+    with pynini.default_token_type(self.__syms.alphabet):
+      return pynini.compose(
+        self.__lex,
+        pynini.concat(
+          self.__syms.initial_features.closure(),
+          pynini.accep("<Suff_Stems>"),
+          pynini.cross("<prefderiv>", ""),
+          self.__sigma_star
+        )
+      ).optimize()

   def __construct_quant_suff_stems(self):
     '''
     Derivation suffixes which combine with a number and a simplex stem
     '''
-    return pynini.compose(
-      self.__lex,
-      pynini.concat(
-        pynini.transducer("<QUANT>", "", input_token_type=self.__syms.alphabet),
-        self.__syms.initial_features.closure(),
-        pynini.acceptor("<Suff_Stems>", token_type=self.__syms.alphabet),
-        pynini.transducer("<simplex>", "", input_token_type=self.__syms.alphabet),
-        self.__sigma_star
-      )
-    ).optimize()
+    with pynini.default_token_type(self.__syms.alphabet):
+      return pynini.compose(
+        self.__lex,
+        pynini.concat(
+          pynini.cross("<QUANT>", ""),
+          self.__syms.initial_features.closure(),
+          pynini.accep("<Suff_Stems>"),
+          pynini.cross("<simplex>", ""),
+          self.__sigma_star
+        )
+      ).optimize()
2 changes: 1 addition & 1 deletion timur/fsts/timur_fst.py
@@ -106,7 +106,7 @@ def build(self, lexicon_stream):
     mappings = fsts.MapFst(self.__syms)

     # delete certain symbols on the upper and lower level
-    lex = mappings.map1 * lex * mappings.map2
+    lex = mappings.map1 @ lex @ mappings.map2
     lex.draw("lex_map.dot", portrait=True)

     #
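pynini 2.1 reassigns the overloaded operators: composition is now spelled @ (matching Python's matmul convention), where 2.0.x used *. A toy sketch over the byte alphabet:

import pynini

a_to_b = pynini.cross("a", "b")
b_to_c = pynini.cross("b", "c")
a_to_c = a_to_b @ b_to_c  # 2.0.x: a_to_b * b_to_c
print(a_to_c.copy().project("output").string())  # -> c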
6 changes: 4 additions & 2 deletions timur/helpers/helpers.py
@@ -23,9 +23,11 @@ def load_lexicon(source, symbol_table):
     tmp.set_final(start)
     for token in tokenizer.findall(line):
       if token[1]:
-        tmp = pynini.concat(tmp, pynini.transducer(token[0], token[1], input_token_type=symbol_table, output_token_type=symbol_table))
+        tmp1 = pynini.concat(tmp, pynini.accep(token[0], token_type=symbol_table))
+        tmp2 = pynini.concat(tmp, pynini.accep(token[1], token_type=symbol_table))
+        tmp = pynini.concat(tmp, pynini.cross(tmp1, tmp2))
       else:
-        tmp = pynini.concat(tmp, pynini.acceptor(token[0], token_type=symbol_table))
+        tmp = pynini.concat(tmp, pynini.accep(token[0], token_type=symbol_table))
     lex = pynini.union(lex, tmp)
   return lex
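This hunk looks like the "in progress" part of the commit: tmp1 and tmp2 each already contain the running tmp prefix, so crossing them and concatenating the result onto tmp again repeats that prefix on both tapes. A more literal translation of the removed transducer(...) call would cross just the two tokens; a sketch with an invented table:

import pynini

st = pynini.SymbolTable()
st.add_symbol("<eps>")
st.add_symbol("x")
st.add_symbol("y")

# 2.0.x: pynini.transducer("x", "y", input_token_type=st, output_token_type=st)
x_to_y = pynini.cross(pynini.accep("x", token_type=st),
                      pynini.accep("y", token_type=st))
tmp = pynini.concat(pynini.accep("", token_type=st), x_to_y)  # append to a running entry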

(diff for the seventh changed file not rendered)
