add case-insensitive support and tokenization pos information
u8621011 committed Aug 8, 2018
1 parent 0331861 commit 729d7c6
Showing 5 changed files with 56 additions and 15 deletions.
2 changes: 1 addition & 1 deletion pyVitk/Lexicon.py
@@ -68,7 +68,7 @@ class Lexicon(object):
"""
The tokenizer lexicon dictionary
"""
def __init__(self, default=True, case_sensitive=True):
def __init__(self, default=True, case_sensitive=False):
"""
The initializer of lexicon
:param default: True to load default defined lexicon xml file.
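The Lexicon constructor now defaults to case-insensitive matching. A minimal sketch of what such a lookup could look like (hypothetical code, not the actual pyVitk implementation, which presumably folds case on entries and queries when case_sensitive is False):

# Hypothetical sketch only: fold case on both insert and lookup
# when case_sensitive is False.
class CaseFoldingLexicon(object):
    def __init__(self, case_sensitive=False):
        self.case_sensitive = case_sensitive
        self.entries = set()

    def _norm(self, word):
        return word if self.case_sensitive else word.lower()

    def add(self, word):
        self.entries.add(self._norm(word))

    def contains(self, word):
        return self._norm(word) in self.entries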
20 changes: 10 additions & 10 deletions pyVitk/Tokenizer.py
@@ -256,7 +256,7 @@ def segment(self, sentence: str, concat=False) -> list:
string_left_trimmed = s[max_matched_len:].lstrip()

# process the token we found
if 'name' in token_type and len(string_left_trimmed) > 0:
if 'name' in token_type and len(string_left_trimmed) > 0: # we have dropped the name regex pattern.
tup = self.processName(next_token, string_left_trimmed)
if len(tup[0]) != len(next_token):
next_token = tup[0]
@@ -420,8 +420,8 @@ class Tokenizer(object):
"""Vietnamese Tokenizer
"""

def __init__(self, lexicon_src = None, regexp_src = None, bigramFilename = None,
whilespaceModelFilename=None):
def __init__(self, lexicon_src=None, regexp_src=None, bigramFilename=None,
whilespaceModelFilename=None, case_sensitive=False):
"""Construct the tokenizer
Parameters
@@ -438,19 +438,19 @@ def __init__(self, lexicon_src = None, regexp_src = None, bigramFilename = None,
this_dir, this_filename = os.path.split(__file__)

# use default files if not provided
if not lexicon_src:
lexicon_src = os.path.join(this_dir, 'dat/tok/lexicon.xml')
if not regexp_src:
regexp_src = os.path.join(this_dir, 'dat/tok/regexp-py.txt')
if not bigramFilename:
bigramFilename = os.path.join(this_dir, 'dat/tok/syllables2M.arpa')

self.lexicon = Lexicon()

if type(lexicon_src) is str:
self.lexicon.load(lexicon_src)
if lexicon_src:
self.lexicon = Lexicon(default=False, case_sensitive=case_sensitive)
if type(lexicon_src) is str:
self.lexicon.load(lexicon_src)
else:
self.lexicon.loadFromList(lexicon_src)
else:
self.lexicon.loadFromList(lexicon_src)
self.lexicon = Lexicon(case_sensitive=case_sensitive)

self.classifier = None
self.graph = PhraseGraph(self)
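With the new signature, the lexicon source and case handling can be chosen independently; a usage sketch (the XML path is the package default named above; the word list and its element format are assumptions, since the diff only shows that non-string sources go through Lexicon.loadFromList):

from pyVitk.Tokenizer import Tokenizer  # assumed import path

# Default lexicon and bigram files, case-insensitive matching (the new default):
tok = Tokenizer()

# Explicit lexicon XML file, keeping case-sensitive matching:
tok_cs = Tokenizer(lexicon_src='pyVitk/dat/tok/lexicon.xml', case_sensitive=True)

# In-memory lexicon via Lexicon.loadFromList (elements assumed to be plain strings):
tok_list = Tokenizer(lexicon_src=['việt nam', 'hà nội'], case_sensitive=False)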
14 changes: 11 additions & 3 deletions pyVitk/dat/tok/regexp-py.txt
@@ -1,14 +1,22 @@
# example: i am a lowercase string.
01:phrase [\p{Lu}]{0,1}[\p{Ll}]+(?:\s[\p{Ll}]+)*(?=[\s\.,\!\?\-\:;"“”'\(\)*]+|$)(?<!(http|https|ftp))
#01:phrase [\p{Ll}]+(?:\s[\p{Ll}]+)*(?=[\s\.,\!\?\-\:;"“”'\(\)*]+|$)(?<!(http|https|ftp)) # fix for trailing space error
01:phrase \w+(?:\s\w+)*(?=[\s\.,\!\?\-\:;"“”'\(\)*]+|$)(?<!(http|https|ftp))

# for begin char upper case allowed.
#01:phrase [\p{Lu}]{0,1}[\p{Ll}]+(?:\s[\p{Ll}]+)*(?=[\s\.,\!\?\-\:;"“”'\(\)*]+|$)(?<!(http|https|ftp))

# fix for trailing space error
#01:phrase [\p{Ll}]+(?:\s[\p{Ll}]+)*(?=[\s\.,\!\?\-\:;"“”'\(\)*]+|$)(?<!(http|https|ftp))
#01:phrase [\p{Ll}\s]+(?=[\s\.,\!\?\-\:;"“”'\(\)*]+|$)(?<!(http|https|ftp))
# 01:phrase [aáàảãạăắằẳẵặâấầẩẫậbcdđeéèẻẽẹêếềểễệfghiíìỉĩịjklmnoóòỏõọôốồổỗộơớờởỡợpqrstuúùủũụưứừửữựvwxyýỳỷỹỵz\s]+(?=[\s\.,\!\?\-\:;"“”'\(\)]+|$)(?<!http)(?<!https)<?<!ftp)
# The original Java pattern won't match strings like http:\\www.google.com, but the Python version will match them and capture everything except the http/https/ftp prefix. Maybe we should adjust the algorithm to run the special pattern search before the phrase pattern.
# 01:phrase [aáàảãạăắằẳẵặâấầẩẫậbcdđeéèẻẽẹêếềểễệfghiíìỉĩịjklmnoóòỏõọôốồổỗộơớờởỡợpqrstuúùủũụưứừửữựvwxyýỳỷỹỵz\s]+(?=[\s\.,\!\?\-\:;"“”'\(\)]+|$)

# example: AAA.A
#02:allcaps ([A-Z]+)([A-Z\.])*[^a-z\)\W]
02:allcaps ([\p{Lu}]+)([\p{Lu}\.])*[^\p{Ll}\)]
#02:allcaps ([\p{Lu}]+)([\p{Lu}\.])*[^\p{Ll}\)]

# fix the trailing space problem
02:allcaps ([\p{Lu}]+)([\p{Lu}\.])*(?=[\s\.,\!\?\-\:;"“”'\(\)*]+|$)

# complete latin char [a-zA-Z]
# example: 29292ADB222
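The \p{Lu}/\p{Ll} property classes above are not understood by the standard-library re module, so these rules are presumably compiled with the third-party regex package; a sketch of the rewritten 02:allcaps rule in isolation:

import regex  # third-party package: pip install regex

# The new 02:allcaps rule: a run of upper-case letters (optionally with dots),
# followed by punctuation/whitespace or the end of the string.
allcaps = regex.compile(r"""([\p{Lu}]+)([\p{Lu}\.])*(?=[\s\.,\!\?\-\:;"“”'\(\)*]+|$)""")

m = allcaps.match('BÍCH PHƯƠNG - Bao Giờ Lấy Chồng?')
print(m.group(0) if m else None)  # expected to capture the leading 'BÍCH'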
15 changes: 15 additions & 0 deletions pyVitk/test/logging.json → pyVitk/logging.json
@@ -15,6 +15,16 @@
"stream": "ext://sys.stdout"
},

"pyVity_file_handler": {
"class": "logging.handlers.RotatingFileHandler",
"level": "DEBUG",
"formatter": "simple",
"filename": "pyVitk.log",
"maxBytes": 10485760,
"backupCount": 20,
"encoding": "utf8"
},

"rotating_file_handler": {
"class": "logging.handlers.RotatingFileHandler",
"level": "DEBUG",
@@ -37,6 +47,11 @@
},

"loggers": {
"pyVitk": {
"level": "DEBUG",
"handlers": ["console", "pyVity_file_handler"],
"propagate": "yes"
},
"unittest": {
"level": "DEBUG",
"handlers": ["console", "unittest_file_handler"],
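The config file moves from pyVitk/test/ up to the package directory and gains a dedicated pyVitk logger with its own rotating file handler. Assuming the JSON follows the logging.config dictConfig schema (which the handlers/loggers keys suggest), loading it might look like:

import json
import logging
import logging.config

# Load the dictConfig-style JSON and use the new pyVitk logger.
with open('pyVitk/logging.json', encoding='utf8') as f:
    logging.config.dictConfig(json.load(f))

logger = logging.getLogger('pyVitk')
logger.debug('goes to the console and to pyVitk.log via pyVity_file_handler')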
20 changes: 19 additions & 1 deletion pyVitk/test/TokenizerTest.py
@@ -14,7 +14,8 @@ class TokenzierTestCase(TestCase):
@classmethod
def setUpClass(cls):
logger.debug('Start TokenizerTest')
cls.tokenizer = Tokenizer()
cls.tokenizer = Tokenizer(case_sensitive=True)
cls.tokenizer_insensitive = Tokenizer(case_sensitive=False)

@classmethod
def tearDownClass(cls):
@@ -160,5 +161,22 @@ def test_tokenize_line9(self):
logger.debug('Test tokenizeline, s: ' + s)
logger.debug('Tokenized Result: {}'.format(t))

t = self.tokenizer_insensitive.tokenizeLine(s)
logger.debug('Test insensitive tokenizeline, s: ' + s)
logger.debug('Tokenized Result: {}'.format(t))

def test_tokenize_line10(self):
s = "PHD | Xem Đi Xem Lại Cả 1000 Lần Mà Vẫn Không Thể Nhịn Được Cười | Funny Videos,"
t = self.tokenizer.tokenizeLine(s)
logger.debug('Test tokenizeline, s: ' + s)
logger.debug('Tokenized Result: {}'.format(t))

def test_tokenize_line11(self):
s = "BÍCH PHƯƠNG - Bao Giờ Lấy Chồng? [OFFICIAL M/V]"
t = self.tokenizer.tokenizeLine(s)
logger.debug('Test tokenizeline, s: ' + s)
logger.debug('Tokenized Result: {}'.format(t))


def test_serialize_to_xml(self):
self.tokenizer.to_lexicon_xml_file('test.xml')
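Outside the test suite, the two modes set up in setUpClass can be compared directly; a hypothetical example on one of the new test sentences (the exact segmentation output is not reproduced here):

from pyVitk.Tokenizer import Tokenizer  # assumed import path

s = 'BÍCH PHƯƠNG - Bao Giờ Lấy Chồng? [OFFICIAL M/V]'

sensitive = Tokenizer(case_sensitive=True)
insensitive = Tokenizer(case_sensitive=False)

# The case-insensitive tokenizer can match upper-cased words against
# lower-cased lexicon entries, which may change the segmentation.
print(sensitive.tokenizeLine(s))
print(insensitive.tokenizeLine(s))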
