add case-insensitive support and tokenization pos information
u8621011 committed Aug 8, 2018
1 parent 0331861 commit 729d7c6
Showing 5 changed files with 56 additions and 15 deletions.
2 changes: 1 addition & 1 deletion pyVitk/Lexicon.py
@@ -68,7 +68,7 @@ class Lexicon(object):
"""
The tokenizer lexicon dictionary
"""
def __init__(self, default=True, case_sensitive=True):
def __init__(self, default=True, case_sensitive=False):
"""
The initializer of lexicon
:param default: True to load default defined lexicon xml file.
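The Lexicon constructor now defaults to case-insensitive matching. A minimal sketch of what such a lookup could look like (hypothetical code, not the actual pyVitk implementation, which presumably folds case on entries and queries when case_sensitive is False):

# Hypothetical sketch only: fold case on both insert and lookup
# when case_sensitive is False.
class CaseFoldingLexicon(object):
    def __init__(self, case_sensitive=False):
        self.case_sensitive = case_sensitive
        self.entries = set()

    def _norm(self, word):
        return word if self.case_sensitive else word.lower()

    def add(self, word):
        self.entries.add(self._norm(word))

    def contains(self, word):
        return self._norm(word) in self.entries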
20 changes: 10 additions & 10 deletions pyVitk/Tokenizer.py
@@ -256,7 +256,7 @@ def segment(self, sentence: str, concat=False) -> list:
string_left_trimmed = s[max_matched_len:].lstrip()

# process the token we found
if 'name' in token_type and len(string_left_trimmed) > 0:
if 'name' in token_type and len(string_left_trimmed) > 0: # we have dropped the name regex pattern.
tup = self.processName(next_token, string_left_trimmed)
if len(tup[0]) != len(next_token):
next_token = tup[0]
@@ -420,8 +420,8 @@ class Tokenizer(object):
"""Vietnamese Tokenizer
"""

def __init__(self, lexicon_src = None, regexp_src = None, bigramFilename = None,
whilespaceModelFilename=None):
def __init__(self, lexicon_src=None, regexp_src=None, bigramFilename=None,
whilespaceModelFilename=None, case_sensitive=False):
"""Construct the tokenizer
Parameters
@@ -438,19 +438,19 @@ def __init__(self, lexicon_src = None, regexp_src = None, bigramFilename = None,
this_dir, this_filename = os.path.split(__file__)

# use default files if not provided
if not lexicon_src:
lexicon_src = os.path.join(this_dir, 'dat/tok/lexicon.xml')
if not regexp_src:
regexp_src = os.path.join(this_dir, 'dat/tok/regexp-py.txt')
if not bigramFilename:
bigramFilename = os.path.join(this_dir, 'dat/tok/syllables2M.arpa')

self.lexicon = Lexicon()

if type(lexicon_src) is str:
self.lexicon.load(lexicon_src)
if lexicon_src:
self.lexicon = Lexicon(default=False, case_sensitive=case_sensitive)
if type(lexicon_src) is str:
self.lexicon.load(lexicon_src)
else:
self.lexicon.loadFromList(lexicon_src)
else:
self.lexicon.loadFromList(lexicon_src)
self.lexicon = Lexicon(case_sensitive=case_sensitive)

self.classifier = None
self.graph = PhraseGraph(self)
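With the new signature, the lexicon source and case handling can be chosen independently; a usage sketch (the XML path is the package default named above; the word list and its element format are assumptions, since the diff only shows that non-string sources go through Lexicon.loadFromList):

from pyVitk.Tokenizer import Tokenizer  # assumed import path

# Default lexicon and bigram files, case-insensitive matching (the new default):
tok = Tokenizer()

# Explicit lexicon XML file, keeping case-sensitive matching:
tok_cs = Tokenizer(lexicon_src='pyVitk/dat/tok/lexicon.xml', case_sensitive=True)

# In-memory lexicon via Lexicon.loadFromList (elements assumed to be plain strings):
tok_list = Tokenizer(lexicon_src=['việt nam', 'hà nội'], case_sensitive=False)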
14 changes: 11 additions & 3 deletions pyVitk/dat/tok/regexp-py.txt
@@ -1,14 +1,22 @@
# example: i am a lowercase string.
01:phrase [\p{Lu}]{0,1}[\p{Ll}]+(?:\s[\p{Ll}]+)*(?=[\s\.,\!\?\-\:;"“”'\(\)*]+|$)(?<!(http|https|ftp))
#01:phrase [\p{Ll}]+(?:\s[\p{Ll}]+)*(?=[\s\.,\!\?\-\:;"“”'\(\)*]+|$)(?<!(http|https|ftp)) # fix for trailing space error
01:phrase \w+(?:\s\w+)*(?=[\s\.,\!\?\-\:;"“”'\(\)*]+|$)(?<!(http|https|ftp))

# for begin char upper case allowed.
#01:phrase [\p{Lu}]{0,1}[\p{Ll}]+(?:\s[\p{Ll}]+)*(?=[\s\.,\!\?\-\:;"“”'\(\)*]+|$)(?<!(http|https|ftp))

# fix for trailing space error
#01:phrase [\p{Ll}]+(?:\s[\p{Ll}]+)*(?=[\s\.,\!\?\-\:;"“”'\(\)*]+|$)(?<!(http|https|ftp))
#01:phrase [\p{Ll}\s]+(?=[\s\.,\!\?\-\:;"“”'\(\)*]+|$)(?<!(http|https|ftp))
# 01:phrase [aáàảãạăắằẳẵặâấầẩẫậbcdđeéèẻẽẹêếềểễệfghiíìỉĩịjklmnoóòỏõọôốồổỗộơớờởỡợpqrstuúùủũụưứừửữựvwxyýỳỷỹỵz\s]+(?=[\s\.,\!\?\-\:;"“”'\(\)]+|$)(?<!http)(?<!https)<?<!ftp)
# The original Java pattern won't match strings like http:\\www.google.com, but the Python version will match them and capture everything except the http/https/ftp prefix. Maybe we should adjust the algorithm to run the special pattern search before the phrase pattern.
# 01:phrase [aáàảãạăắằẳẵặâấầẩẫậbcdđeéèẻẽẹêếềểễệfghiíìỉĩịjklmnoóòỏõọôốồổỗộơớờởỡợpqrstuúùủũụưứừửữựvwxyýỳỷỹỵz\s]+(?=[\s\.,\!\?\-\:;"“”'\(\)]+|$)

# example: AAA.A
#02:allcaps ([A-Z]+)([A-Z\.])*[^a-z\)\W]
02:allcaps ([\p{Lu}]+)([\p{Lu}\.])*[^\p{Ll}\)]
#02:allcaps ([\p{Lu}]+)([\p{Lu}\.])*[^\p{Ll}\)]

# fix the trailing space problem
02:allcaps ([\p{Lu}]+)([\p{Lu}\.])*(?=[\s\.,\!\?\-\:;"“”'\(\)*]+|$)

# complete latin char [a-zA-Z]
# example: 29292ADB222
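The \p{Lu}/\p{Ll} property classes above are not understood by the standard-library re module, so these rules are presumably compiled with the third-party regex package; a sketch of the rewritten 02:allcaps rule in isolation:

import regex  # third-party package: pip install regex

# The new 02:allcaps rule: a run of upper-case letters (optionally with dots),
# followed by punctuation/whitespace or the end of the string.
allcaps = regex.compile(r"""([\p{Lu}]+)([\p{Lu}\.])*(?=[\s\.,\!\?\-\:;"“”'\(\)*]+|$)""")

m = allcaps.match('BÍCH PHƯƠNG - Bao Giờ Lấy Chồng?')
print(m.group(0) if m else None)  # expected to capture the leading 'BÍCH'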
15 changes: 15 additions & 0 deletions pyVitk/test/logging.json → pyVitk/logging.json
@@ -15,6 +15,16 @@
"stream": "ext://sys.stdout"
},

"pyVity_file_handler": {
"class": "logging.handlers.RotatingFileHandler",
"level": "DEBUG",
"formatter": "simple",
"filename": "pyVitk.log",
"maxBytes": 10485760,
"backupCount": 20,
"encoding": "utf8"
},

"rotating_file_handler": {
"class": "logging.handlers.RotatingFileHandler",
"level": "DEBUG",
@@ -37,6 +47,11 @@
},

"loggers": {
"pyVitk": {
"level": "DEBUG",
"handlers": ["console", "pyVity_file_handler"],
"propagate": "yes"
},
"unittest": {
"level": "DEBUG",
"handlers": ["console", "unittest_file_handler"],
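The config file moves from pyVitk/test/ up to the package directory and gains a dedicated pyVitk logger with its own rotating file handler. Assuming the JSON follows the logging.config dictConfig schema (which the handlers/loggers keys suggest), loading it might look like:

import json
import logging
import logging.config

# Load the dictConfig-style JSON and use the new pyVitk logger.
with open('pyVitk/logging.json', encoding='utf8') as f:
    logging.config.dictConfig(json.load(f))

logger = logging.getLogger('pyVitk')
logger.debug('goes to the console and to pyVitk.log via pyVity_file_handler')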
20 changes: 19 additions & 1 deletion pyVitk/test/TokenizerTest.py
@@ -14,7 +14,8 @@ class TokenzierTestCase(TestCase):
@classmethod
def setUpClass(cls):
logger.debug('Start TokenizerTest')
cls.tokenizer = Tokenizer()
cls.tokenizer = Tokenizer(case_sensitive=True)
cls.tokenizer_insensitive = Tokenizer(case_sensitive=False)

@classmethod
def tearDownClass(cls):
@@ -160,5 +161,22 @@ def test_tokenize_line9(self):
logger.debug('Test tokenizeline, s: ' + s)
logger.debug('Tokenized Result: {}'.format(t))

t = self.tokenizer_insensitive.tokenizeLine(s)
logger.debug('Test insensitive tokenizeline, s: ' + s)
logger.debug('Tokenized Result: {}'.format(t))

def test_tokenize_line10(self):
s = "PHD | Xem Đi Xem Lại Cả 1000 Lần Mà Vẫn Không Thể Nhịn Được Cười | Funny Videos,"
t = self.tokenizer.tokenizeLine(s)
logger.debug('Test tokenizeline, s: ' + s)
logger.debug('Tokenized Result: {}'.format(t))

def test_tokenize_line11(self):
s = "BÍCH PHƯƠNG - Bao Giờ Lấy Chồng? [OFFICIAL M/V]"
t = self.tokenizer.tokenizeLine(s)
logger.debug('Test tokenizeline, s: ' + s)
logger.debug('Tokenized Result: {}'.format(t))


def test_serialize_to_xml(self):
self.tokenizer.to_lexicon_xml_file('test.xml')
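Outside the test suite, the two modes set up in setUpClass can be compared directly; a hypothetical example on one of the new test sentences (the exact segmentation output is not reproduced here):

from pyVitk.Tokenizer import Tokenizer  # assumed import path

s = 'BÍCH PHƯƠNG - Bao Giờ Lấy Chồng? [OFFICIAL M/V]'

sensitive = Tokenizer(case_sensitive=True)
insensitive = Tokenizer(case_sensitive=False)

# The case-insensitive tokenizer can match upper-cased words against
# lower-cased lexicon entries, which may change the segmentation.
print(sensitive.tokenizeLine(s))
print(insensitive.tokenizeLine(s))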
