Disable custom_dict for nlpo3 engine in word_tokenize()
Currently cannot handle custom_dict from inside word_tokenize(), due to a difference in type: word_tokenize() takes custom_dict as a pythainlp.util.Trie, while the nlpo3 engine expects a dictionary name (str) registered with load_dict().
bact authored Dec 11, 2023
1 parent 4b80066 commit 9610b54
Showing 1 changed file with 15 additions and 13 deletions.
28 changes: 15 additions & 13 deletions pythainlp/tokenize/core.py
@@ -128,7 +128,7 @@ def word_tokenize(
    :param str text: text to be tokenized
    :param str engine: name of the tokenizer to be used
-   :param pythainlp.util.Trie custom_dict: dictionary trie
+   :param pythainlp.util.Trie custom_dict: dictionary trie (some engines may not support it)
    :param bool keep_whitespace: True to keep whitespace, a common mark
        for end of phrase in Thai.
        Otherwise, whitespace is omitted.
@@ -290,18 +290,20 @@ def word_tokenize(
        segments = segment(text)
    elif engine == "nlpo3":
        from pythainlp.tokenize.nlpo3 import segment

-       if isinstance(custom_dict, str):
-           segments = segment(text, custom_dict=custom_dict)
-       elif not isinstance(custom_dict, str) and not custom_dict:
-           raise ValueError(
-               f"""Tokenizer \"{engine}\":
-               custom_dict must be a str.
-               It is a dictionary name as assigned with load_dict().
-               See pythainlp.tokenize.nlpo3.load_dict()"""
-           )
-       else:
-           segments = segment(text)
+       # Currently cannot handle custom_dict from inside word_tokenize(),
+       # due to difference in type.
+       #if isinstance(custom_dict, str):
+       #    segments = segment(text, custom_dict=custom_dict)
+       #elif not isinstance(custom_dict, str) and not custom_dict:
+       #    raise ValueError(
+       #        f"""Tokenizer \"{engine}\":
+       #        custom_dict must be a str.
+       #        It is a dictionary name as assigned with load_dict().
+       #        See pythainlp.tokenize.nlpo3.load_dict()"""
+       #    )
+       #else:
+       #    segments = segment(text)
+       segments = segment(text)
    else:
        raise ValueError(
            f"""Tokenizer \"{engine}\" not found.
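For reference, a minimal workaround sketch implied by the commented-out error message: register a dictionary with pythainlp.tokenize.nlpo3.load_dict() and call the nlpo3 segment() directly, passing the registered dictionary name (a str) rather than a pythainlp.util.Trie. The file path, dictionary name, and load_dict() argument order below are assumptions for illustration, not taken from this commit.

# Sketch only: use the nlpo3 backend directly, since with this commit
# word_tokenize() with engine="nlpo3" no longer passes custom_dict through.
from pythainlp.tokenize.nlpo3 import load_dict, segment

# Register a wordlist file (assumed one word per line) under a name;
# "my_words.txt" and "my_dict" are placeholder values.
load_dict("my_words.txt", "my_dict")

# nlpo3 identifies dictionaries by their registered name (a str),
# not by a pythainlp.util.Trie -- the type difference behind this commit.
tokens = segment("ทดสอบการตัดคำ", custom_dict="my_dict")
print(tokens)

After this change, word_tokenize(text, engine="nlpo3") always calls segment(text), so tokenization uses the engine's default dictionary regardless of any custom_dict argument.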
