Commit

tokenize line into LexiconToken structure (to save additional position information)
u8621011 committed Aug 7, 2018
1 parent 6ed4e25 commit f623387
Showing 3 changed files with 111 additions and 85 deletions.
13 changes: 13 additions & 0 deletions pyVitk/DataStructure.py
@@ -0,0 +1,13 @@
# encoding=utf-8
import json


class LexiconToken(object):
    def __init__(self, type, text, start_char_pos, end_char_pos):
        self.type = type
        self.text = text
        self.start_char_pos = start_char_pos
        self.end_char_pos = end_char_pos

    def __repr__(self):
        return json.dumps(self, default=lambda o: o.__dict__, ensure_ascii=False, indent=4)
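
Note: a minimal usage sketch of the new LexiconToken class above; the sample text and character offsets are illustrative, not taken from the commit.

# construct a LexiconToken and inspect its JSON-style repr
from pyVitk.DataStructure import LexiconToken

tok = LexiconToken(type='word', text='Hà Nội', start_char_pos=0, end_char_pos=6)
print(tok.type, tok.start_char_pos, tok.end_char_pos)  # word 0 6
print(tok)  # __repr__ dumps the instance __dict__ as indented JSON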
169 changes: 89 additions & 80 deletions pyVitk/Tokenizer.py
@@ -8,6 +8,8 @@
from .Bigrams import Bigrams
from .Dijkstra import Dijkstra
from .Dijkstra2 import Graph, shortest_path
from pyVitk.DataStructure import LexiconToken


logger = logging.getLogger(__name__)

@@ -136,7 +138,7 @@ def shortestPaths(self) -> list:

        return allPaths

    def words(self, path: list) -> list:
    def words(self, path: list, concat=False) -> list:
        """
        Gets a list of words specified by a given path.
        :param path:
@@ -153,7 +155,10 @@ def words(self, path: list) -> list:
            lstSyllables = list()
            lstSyllables.append(self.syllables[i])
            for k in range(a[j] + 1, a[j + 1]):
                lstSyllables.append(' ')
                if concat:
                    lstSyllables.append('_')
                else:
                    lstSyllables.append(' ')
                lstSyllables.append((self.syllables[k]))
            tok.append(lstSyllables)

@@ -200,84 +205,110 @@ class SegmentationFunction(object):
    def __init__(self, tokenizer: 'Tokenizer'):
        self.tokenizer = tokenizer

    def segment(self, sentence: str) -> list:
    def segment(self, sentence: str, concat=False) -> list:
        """
        Segment the sentence passed in. The caller must not call sentence.strip() beforehand; the segmentation function does that itself so it can compute the precise term positions within the original sentence.
        :param sentence:
        :return:
        """
        tokens = []

        sentence = sentence.strip()
        if len(sentence) == 0:
        s = sentence.lstrip()
        if len(s) == 0:
            return tokens

        s = sentence
        # the first position of non-space char.
        cur_pos = len(sentence) - len(s)

        token_start_pos_base = cur_pos
        while True:
            maxLen = 0
            nextToken = None
            tokenType = None
            max_matched_len = 0
            next_token = None
            token_type = None

            # greedy search for the longest pattern from the beginning of 's'
            logger.debug('Segmenting sentence: ' + s)
            logger.debug('Segmenting sentence: %s', s)
            for k, p in self.tokenizer.patterns.items():
                # return matchobject if matches
                m = p.match(s)
                if m:
                    logger.debug('【{}】 matched: {}'.format(s, k))
                    curLen = m.end(0) - m.start(0)
                    if maxLen < curLen:
                        maxLen = curLen
                        nextToken = m.group(0)
                        tokenType = k
                        nextToken = nextToken.strip()
                else:
                    #logger.debug('Test {} with pattern {}: not matched'.format(s, p))
                    pass

            if tokenType:
                logger.debug('longest matched pattern type: {}, token: {}'.format(tokenType, nextToken))
                    logger.debug('【%s】 matched pattern: %s', s, k)
                    cur_matched_len = m.end(0) - m.start(0)
                    if max_matched_len < cur_matched_len:
                        max_matched_len = cur_matched_len
                        next_token = m.group(0)
                        token_type = k

                        # what pattern will have space in the begin or end of matched term?
                        stripped_token = next_token.strip()
                        if len(stripped_token) != len(next_token):
                            logger.info('Found the case of token need to be stripped, token: [%s]', next_token)
                            raise Exception('Found the case of token need to be stripped')

            if token_type:
                logger.debug('longest matched pattern type: %s, token: %s', token_type, next_token)
            else:
                logger.debug('Cannot find the matched pattern.')
                logger.debug('Cannot find the matched pattern. sentence: %s', s)

            # split off the longest token we found.
            if nextToken:
                s = s[maxLen:].strip()
            if next_token:
                string_left = s[max_matched_len:]
                string_left_trimmed = s[max_matched_len:].lstrip()

                # process the token we found
                if 'name' in tokenType and len(s) > 0:
                    tup = self.processName(nextToken, s)
                    if len(tup[0]) != len(nextToken):
                        nextToken = tup[0]
                        s = tup[1]
                        tokenType = 'word'

                    logger.debug('appending new token, type: {}, token: {}'.format(tokenType, nextToken))
                    tokens.append((tokenType, nextToken))
                elif 'unit' in tokenType and len(s) > 0:
                    tup = self.processUnit(nextToken, s)
                    if len(tup[0]) > len(nextToken):
                        nextToken = tup[0]
                        s = tup[1]
                        tokenType = "unit"

                    logger.debug('appending new token, type: {}, token: {}'.format(tokenType, nextToken))
                    tokens.append((tokenType, nextToken))
                elif 'phrase' in tokenType:
                    if nextToken.find(' ') > 0: # multi-syllabic phrase
                        if self.tokenizer.classifier is not None:
                if 'name' in token_type and len(string_left_trimmed) > 0:
                    tup = self.processName(next_token, string_left_trimmed)
                    if len(tup[0]) != len(next_token):
                        next_token = tup[0]
                        s = string_left_trimmed = tup[1]
                        token_type = 'word'

                    logger.debug('appending new token, type: {}, token: {}'.format(token_type, next_token))
                    tokens.append((token_type, next_token))
                elif 'unit' in token_type and len(string_left_trimmed) > 0:
                    tup = self.processUnit(next_token, string_left_trimmed)
                    if len(tup[0]) > len(next_token):
                        next_token = tup[0]
                        s = string_left_trimmed = tup[1]
                        token_type = "unit"

                    logger.debug('appending new token, type: {}, token: {}'.format(token_type, next_token))
                    tokens.append((token_type, next_token))
                elif 'phrase' in token_type:
                    if next_token.find(' ') > 0: # multi-syllabic phrase
                        if self.tokenizer.classifier is not None:
                            raise NotImplementedError
                        else:
                            # segment the phrase using a phrase graph
                            words = self.tokenizePhrase(nextToken)
                            words = self.tokenizePhrase(next_token)
                            #words = self.tokenizePhrase2(nextToken)
                            if words is not None:
                                token_start_pos = token_start_pos_base
                                for i in range(len(words)):
                                    logger.debug(
                                        'appending new token, type: {}, token: {}'.format(tokenType, words[i]))
                                    tokens.append(("word", words[i]))
                                    if i != 0:
                                        token_start_pos = token_end_pos + 1 # every token must be split by one space
                                    token_end_pos = token_start_pos + len(words[i])
                                    t = LexiconToken(type="word", text=words[i], start_char_pos=token_start_pos
                                                     , end_char_pos=token_end_pos)
                                    tokens.append(t)
                            else:
                                logger.info('Error when tokenizing phrase: ' + nextToken)
                                raise Exception('Error when tokenizing phrase: ' + next_token)
                    else:
                        logger.debug(
                            'appending new token, type: {}, token: {}'.format('word', nextToken))
                        tokens.append(("word", nextToken))
                        token_start_pos = token_start_pos_base
                        token_end_pos = token_start_pos_base + max_matched_len
                        t = LexiconToken(type="word", text=next_token, start_char_pos=token_start_pos,
                                         end_char_pos=token_end_pos)
                        tokens.append(t)
                        s = string_left_trimmed
                else:
                    tokens.append((tokenType, nextToken))
                    token_start_pos = token_start_pos_base
                    token_end_pos = token_start_pos_base + max_matched_len
                    t = LexiconToken(type=token_type, text=next_token, start_char_pos=token_start_pos,
                                     end_char_pos=token_end_pos)
                    tokens.append(t)
                    s = string_left_trimmed
                token_start_pos_base = token_end_pos + (
                        len(string_left) - len(string_left_trimmed)) # the next base
            else:
                if len(s.strip()) > 0:
                    logger.warning('Unprocessed substring: ' + s)
@@ -486,28 +517,6 @@ def tokenize(self, inputFilename, outputFilename):
    def tokenizeLine(self, line: str, concat=False) -> list:
        seg = SegmentationFunction(self)

        line = line.strip()
        tokens = seg.segment(line, concat)

        tokens = seg.segment(line)
        lstRet = list()

        for i in range(len(tokens)):
            curToken = tokens[i]
            if concat:
                if (curToken[0] == 'phrase' or curToken[0] == 'word'):
                    lstRet.append({
                        'Type': curToken[0],
                        'Text': curToken[1].replace(' ', '_')
                    })
                else:
                    lstRet.append({
                        'Type': curToken[0],
                        'Text': curToken[1],
                    })
            else:
                lstRet.append({
                    'Type': curToken[0],
                    'Text': curToken[1],
                })

        return lstRet
        return tokens
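
Note: with this change tokenizeLine returns the SegmentationFunction output directly, so word/phrase tokens are LexiconToken objects carrying character positions instead of Type/Text dicts. A hypothetical usage sketch; the Tokenizer construction and the sample sentence are assumptions for illustration, only the LexiconToken attributes come from this commit.

from pyVitk.Tokenizer import Tokenizer
from pyVitk.DataStructure import LexiconToken

tokenizer = Tokenizer()  # assumption: default construction; adjust to the real constructor
for t in tokenizer.tokenizeLine('Hà Nội là thủ đô của Việt Nam', concat=True):
    if isinstance(t, LexiconToken):
        # word/phrase tokens now carry their character span in the original line
        print(t.type, repr(t.text), t.start_char_pos, t.end_char_pos)
    else:
        # some branches of segment() above still append (type, text) tuples
        print(t)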
14 changes: 9 additions & 5 deletions pyVitk/dat/tok/regexp-py.txt
@@ -1,11 +1,14 @@
# example: i am a lowercase string.
01:phrase [\p{Ll}\s]+(?=[\s\.,\!\?\-\:;"“”'\(\)*]+|$)(?<!(http|https|ftp))
01:phrase [\p{Lu}]{0,1}[\p{Ll}]+(?:\s[\p{Ll}]+)*(?=[\s\.,\!\?\-\:;"“”'\(\)*]+|$)(?<!(http|https|ftp))
#01:phrase [\p{Ll}]+(?:\s[\p{Ll}]+)*(?=[\s\.,\!\?\-\:;"“”'\(\)*]+|$)(?<!(http|https|ftp)) # fix for tailing space error
#01:phrase [\p{Ll}\s]+(?=[\s\.,\!\?\-\:;"“”'\(\)*]+|$)(?<!(http|https|ftp))
# 01:phrase [aáàảãạăắằẳẵặâấầẩẫậbcdđeéèẻẽẹêếềểễệfghiíìỉĩịjklmnoóòỏõọôốồổỗộơớờởỡợpqrstuúùủũụưứừửữựvwxyýỳỷỹỵz\s]+(?=[\s\.,\!\?\-\:;"“”'\(\)]+|$)(?<!http)(?<!https)<?<!ftp)
# The original Java pattern won't match strings like http:\\www.google.com, but the Python version will match and capture strings other than http/https/ftp strings. Maybe we should adjust the algorithm to run the special-pattern search before the phrase pattern.
# 01:phrase [aáàảãạăắằẳẵặâấầẩẫậbcdđeéèẻẽẹêếềểễệfghiíìỉĩịjklmnoóòỏõọôốồổỗộơớờởỡợpqrstuúùủũụưứừửữựvwxyýỳỷỹỵz\s]+(?=[\s\.,\!\?\-\:;"“”'\(\)]+|$)

# example: AAA.A
02:allcaps ([A-Z]+)([A-Z\.])*[^a-z\)\W]
#02:allcaps ([A-Z]+)([A-Z\.])*[^a-z\)\W]
02:allcaps ([\p{Lu}]+)([\p{Lu}\.])*[^\p{Ll}\)]

# complete latin char [a-zA-Z]
# example: 29292ADB222
@@ -19,7 +22,8 @@
# 03:entity2 ^[a-zA-ZáàảãạăắằẳẵặâấầẩẫậđéèẻẽẹêếềểễệíìỉĩịýỳỷỹỵóòỏõọôốồổỗộơớờởỡợúùủũụưứừửữựÁÀẢÃẠĂẮẰẲẴẶÂẤẦẨẪẬĐÉÈẺẼẸÊẾỀỂỄỆÍÌỈĨỊÝỲỶỸỴÓÒỎÕỌÔỐỒỔỠỢÚÙỦŨỤƯỨỪỬỮỰ]+(\d)+

# example: ADFA11ADD333
03:entity3 ([\p{Lu}]+\d*)+
03:entity3 ([\p{Lu}]+\d+)+
#03:entity3 ([\p{Lu}]+\d*)+
# below is the RE module version
#03:entity3 ^([A-ZÁÀẢÃẠĂẮẰẲẴẶÂẤẦẨẪẬĐÉÈẺẼẸÊẾỀỂỄỆÍÌỈĨỊÝỲỶỸỴÓÒỎÕỌÔỐỒỔỠỢÚÙỦŨỤƯỨỪỬỮỰ]+\d*)+

Expand All @@ -30,7 +34,7 @@

# example: Ông ăn
#04:name ([\p{Lu}][\p{L}&&[^\p{Lu}]]*)([\s+\-][\p{Lu}][\p{L}&&[^\p{Lu}]]*)*
04:name ([\p{Lu}][\p{L}]*)([\s+\-][\p{Lu}][\p{L}]*)*
#04:name ([\p{Lu}][\p{L}]*)([\s+\-][\p{Lu}][\p{L}]*)*
# below is the RE module version
# 04:name ^([A-ZÁÀẢÃẠĂẮẰẲẴẶÂẤẦẨẪẬĐÉÈẺẼẸÊẾỀỂỄỆÍÌỈĨỊÝỲỶỸỴÓÒỎÕỌÔỐỒỔỠỢÚÙỦŨỤƯỨỪỬỮỰ][a-záàảãạăắằẳãặâấầẩẫậđéèẻẽẹêếềểẽệíìỉĩịýỳỷỹỵóòỏõọôốồổỗợơớờởỡộúùủũụưứừửữự]*)([\s+\-][A-ZÁÀẢÃẠĂẮẰẲẴẶÂẤẦẨẪẬĐÉÈẺẼẸÊẾỀỂỄỆÍÌỈĨỊÝỲỶỸỴÓÒỎÕỌÔỐỒỔỠỢÚÙỦŨỤƯỨỪỬỮỰ][a-záàảãạăắằẳãặâấầẩẫậđéèẻẽẹêếềểẽệíìỉĩịýỳỷỹỵóòỏõọôốồổỗộơớờởỡợúùủũụưứừửữự]+)*

@@ -64,7 +68,7 @@
16:space \s+
17:special [`~\!@#\$%\^&\*\-_\+=\|\\\:;"',<\.>/\?]+

18:foreign ([\p{L}]+)(\-[\p{L}]+)*
#18:foreign ([\p{L}]+)(\-[\p{L}]+)*

19:domain \w[\-\w]*\w+(\.\w+)+
20:url ((http)|(https)|(ftp))\://\w[\-\w]*\w+(\.\w+)+(/[\w\-]+)*(\.\w+)?
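
Note: the \p{Lu}/\p{Ll} property classes used above are not understood by Python's built-in re module, so these patterns presumably go through the third-party regex package (the commented-out "RE module version" lines spell the character classes out by hand instead). A small illustrative check of the new 01:phrase pattern, with the lookaround parts omitted for brevity:

import regex  # third-party package, not the standard library re

phrase = regex.compile(r'[\p{Lu}]{0,1}[\p{Ll}]+(?:\s[\p{Ll}]+)*')
m = phrase.match('thủ đô Hà Nội')
print(m.group(0))  # 'thủ đô' - the match stops before the uppercase 'Hà'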