diff --git a/pyVitk/Lexicon.py b/pyVitk/Lexicon.py
index f024a18..4225f96 100644
--- a/pyVitk/Lexicon.py
+++ b/pyVitk/Lexicon.py
@@ -68,7 +68,7 @@ class Lexicon(object):
     """
     The tokenizer lexicon dictionary
     """
-    def __init__(self, default=True, case_sensitive=True):
+    def __init__(self, default=True, case_sensitive=False):
         """
         The initializer of lexicon
         :param default: True to load default defined lexicon xml file.
diff --git a/pyVitk/Tokenizer.py b/pyVitk/Tokenizer.py
index a3c1f23..381fe92 100644
--- a/pyVitk/Tokenizer.py
+++ b/pyVitk/Tokenizer.py
@@ -256,7 +256,7 @@ def segment(self, sentence: str, concat=False) -> list:
                 string_left_trimmed = s[max_matched_len:].lstrip()
 
                 # process the token we found
-                if 'name' in token_type and len(string_left_trimmed) > 0:
+                if 'name' in token_type and len(string_left_trimmed) > 0:  # we have dropped the name regex pattern.
                     tup = self.processName(next_token, string_left_trimmed)
                     if len(tup[0]) != len(next_token):
                         next_token = tup[0]
@@ -420,8 +420,8 @@ class Tokenizer(object):
     """Vietnamese Tokenizer
     """
 
-    def __init__(self, lexicon_src = None, regexp_src = None, bigramFilename = None,
-                 whilespaceModelFilename=None):
+    def __init__(self, lexicon_src=None, regexp_src=None, bigramFilename=None,
+                 whilespaceModelFilename=None, case_sensitive=False):
         """Construct the tokenizer
 
         Parameters
@@ -438,19 +438,19 @@ def __init__(self, lexicon_src = None, regexp_src = None, bigramFilename = None,
         this_dir, this_filename = os.path.split(__file__)
 
         # use default files if not provided
-        if not lexicon_src:
-            lexicon_src = os.path.join(this_dir, 'dat/tok/lexicon.xml')
         if not regexp_src:
             regexp_src = os.path.join(this_dir, 'dat/tok/regexp-py.txt')
         if not bigramFilename:
             bigramFilename = os.path.join(this_dir, 'dat/tok/syllables2M.arpa')
-
-        self.lexicon = Lexicon()
-        if type(lexicon_src) is str:
-            self.lexicon.load(lexicon_src)
+        if lexicon_src:
+            self.lexicon = Lexicon(default=False, case_sensitive=case_sensitive)
+            if type(lexicon_src) is str:
+                self.lexicon.load(lexicon_src)
+            else:
+                self.lexicon.loadFromList(lexicon_src)
         else:
-            self.lexicon.loadFromList(lexicon_src)
+            self.lexicon = Lexicon(case_sensitive=case_sensitive)
 
         self.classifier = None
         self.graph = PhraseGraph(self)
diff --git a/pyVitk/dat/tok/regexp-py.txt b/pyVitk/dat/tok/regexp-py.txt
index 7b8a3e5..349ba41 100644
--- a/pyVitk/dat/tok/regexp-py.txt
+++ b/pyVitk/dat/tok/regexp-py.txt
@@ -1,6 +1,11 @@
 # example: i am a lowercase string.
-01:phrase	[\p{Lu}]{0,1}[\p{Ll}]+(?:\s[\p{Ll}]+)*(?=[\s\.,\!\?\-\:;"“”'\(\)*]+|$)(?
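
For reviewers, a minimal usage sketch of the new `case_sensitive` flag introduced by this patch. It is not part of the diff itself: the constructor signatures (`Tokenizer(..., case_sensitive=False)`, `Lexicon(default=..., case_sensitive=...)`) are taken from the changes above, while the comments about lookup casing describe the intended behaviour and are assumptions.

```python
# Hypothetical sketch, assuming the constructor signatures added in this diff.
from pyVitk.Tokenizer import Tokenizer
from pyVitk.Lexicon import Lexicon

# New default: case-insensitive lexicon lookups.
tokenizer = Tokenizer()  # same as Tokenizer(case_sensitive=False); loads the bundled data files

# Passing case_sensitive=True is assumed to restore the previous, case-sensitive behaviour.
strict_tokenizer = Tokenizer(case_sensitive=True)

# A custom lexicon takes the same flag; default=False skips loading the bundled lexicon.xml.
custom_lexicon = Lexicon(default=False, case_sensitive=False)
```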