Skip to content

Commit

Permalink
drop nltk trie imp in lexicon and add case-insensitive search support
Browse files Browse the repository at this point in the history
  • Loading branch information
u8621011 committed Aug 8, 2018
1 parent f623387 commit c216292
Show file tree
Hide file tree
Showing 6 changed files with 41,219 additions and 88 deletions.
121 changes: 44 additions & 77 deletions pyVitk/Lexicon.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@

import xml.etree.ElementTree as etree
import sys
from nltk.collections import Trie
from datetime import datetime, timedelta
import logging

logger = logging.getLogger(__name__)

class Node(object):
"""
The trie node structure. c = '_' for root node, c='*' for leaf node.
"""
def __init__(self, c):
self.c = c
self.children = []
Expand Down Expand Up @@ -51,7 +53,7 @@ def insertWord(self, s: str, pos: int):

def hasWord(self, s: str, pos: int) -> bool:
if pos == len(s):
for j in range(0 , len(self.children)):
for j in range(0, len(self.children)):
if self.children[j].c == '*':
return True
return False
Expand All @@ -63,15 +65,28 @@ def hasWord(self, s: str, pos: int) -> bool:


class Lexicon(object):
"""Load and build prefix tree from lexicon.xml"""
def __init__(self):
# the original trie implementation
# self.root = Node('_')
self.root = None
"""
The tokenizer lexicon dictionary
"""
def __init__(self, default=True, case_sensitive=True):
    """
    Set up the lexicon, optionally pre-loading a bundled dictionary file.

    :param default: when True, load a lexicon XML file located relative to
        this module's directory; when False, start with only a root node.
    :param case_sensitive: stored as ``self.case_sensitive``; when loading
        the default file it selects ``dat/tok/lexicon.xml`` (True) or
        ``dat/tok/lexicon-insensitive.xml`` (False).
    """
    self.numNodes = 0
    self.case_sensitive = case_sensitive

    if not default:
        # No file to read: begin with a bare root node.
        self.root = Node('_')
        return

    import os
    xml_name = 'dat/tok/lexicon.xml' if case_sensitive else 'dat/tok/lexicon-insensitive.xml'
    # Resolve the data file against the directory that contains this module.
    self.load(os.path.join(os.path.dirname(__file__), xml_name))

#self.trie = Trie()
self.trie = None
def load(self, filename: str):
xml = etree.parse(filename)
n = xml.getroot()
Expand All @@ -85,10 +100,10 @@ def loadFromList(self, lex_list):
lex_list: list of str
"""
self.trie = Trie()
self.root = Node('_') # the root node.

for item in lex_list:
self.trie.insert(item)
self.insertWord(item)

def loadNode (self, n):
node = Node(n.attrib['c'])
Expand All @@ -100,57 +115,32 @@ def loadNode (self, n):
return node

def insertWord(self, word: str):
if self.root:
if self.case_sensitive:
self.root.insertWord(word, 0)
elif self.trie:
self.trie.insert(word)
else:
raise ValueError('Dunno the type of trie tree structure.')
self.root.insertWord(word.lower(), 0)

def hasWord(self, word: str) -> bool:
if self.root:
return self.hasWordOrg(word)
elif self.trie:
return self.hasWordTrie(word)
else:
raise ValueError('Lexicon Dictionary is not be initialized')

def hasWordOrg(self, word: str) -> bool:
return self.root.hasWord(word, 0)

def hasWordTrie(self, text):
if self.trie:
n = len(text)
curTrie = self.trie
for i in range(n):
if text[i] in curTrie:
curTrie = curTrie[text[i]]
else:
return False
if Trie.LEAF in curTrie:
return True

if self.case_sensitive:
return self.root.hasWord(word, 0)
else:
return False
return self.root.hasWord(word.lower(), 0)

def serialize_to_xml(self, ofile: str):
if self.root:
# build ElementTree from root structure
et_root = etree.Element('n', attrib={'c': '_'})
if len(self.root.children) > 0:
for child in self.root.children:
if child.c == '*':
etree.SubElement(et_root, 'n', {'c': '*'})
else:
et_child = etree.SubElement(et_root, 'n', {'c': child.c})
self.build_etree_from_node(et_child, child)
else:
etree.SubElement(et_root, 'n', {'c': '*'})

tree = etree.ElementTree(et_root)
tree.write(ofile, encoding='utf-8', xml_declaration=True)
# build ElementTree from root structure
et_root = etree.Element('n', attrib={'c': '_'})
if len(self.root.children) > 0:
for child in self.root.children:
if child.c == '*':
etree.SubElement(et_root, 'n', {'c': '*'})
else:
et_child = etree.SubElement(et_root, 'n', {'c': child.c})
self.build_etree_from_node(et_child, child)
else:
raise NotImplementedError
etree.SubElement(et_root, 'n', {'c': '*'})

tree = etree.ElementTree(et_root)
tree.write(ofile, encoding='utf-8', xml_declaration=True)

def build_etree_from_node(self, et_parent, node_parent):
if node_parent.children is not None and len(node_parent.children) > 0:
Expand Down Expand Up @@ -188,26 +178,3 @@ def flttenRecursive(self, fwrite, nodeHead: Node, charList: list):
self.flttenRecursive(fwrite, child, charList)
charList.pop()

if __name__ == '__main__':
cmd = sys.argv[1]

if cmd == 'load':
n = datetime.now()
l = Lexicon()
l.load(sys.argv[2])
d = datetime.now() - n
logger.debug('timedelta to run: {}'.format(d))
elif cmd == 'flat':
l = Lexicon()
l.flattenToFile(sys.argv[2])
elif cmd == 'loadflat':
n = datetime.now()
with open(sys.argv[2], encoding='utf8') as f:
l = Lexicon()
for line in f:
if len(line) > 0:
l.trie.insert(line)

d = datetime.now() - n
logger.debug('timedelta to run: {}'.format(d))

2 changes: 2 additions & 0 deletions pyVitk/dat/tok/lexicon-insensitive.xml

Large diffs are not rendered by default.

File renamed without changes.
58 changes: 47 additions & 11 deletions pyVitk/test/LexiconTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,29 +6,65 @@

class LexiconTestCase(unittest.TestCase):
def setUp(self):
self.lexicon = Lexicon.Lexicon()
self.lexicon.load('../dat/tok/lexicon.xml')
pass

def tearDown(self):
self.lexicon = None
pass

def test_has_word_sensitive(self):
lexicon = Lexicon.Lexicon(case_sensitive=True)
r = lexicon.hasWord('ai hoài')
self.assertTrue(r)

def test_has_word(self):
r = self.lexicon.hasWord('ai hoài')
r = lexicon.hasWord('ngoại bang')
self.assertTrue(r)

r = self.lexicon.hasWord('ngoại bang')
r = lexicon.hasWord('ngoại giao nhân dân')
self.assertTrue(r)

r = self.lexicon.hasWord('ngoại giao nhân dân')
r = lexicon.hasWord('dự án')
self.assertTrue(r)

r = self.lexicon.hasWord('dự án')
r = lexicon.hasWord('Dự án')
self.assertFalse(r)

r = lexicon.hasWord('phiên bản')
self.assertTrue(r)

r = self.lexicon.hasWord('phiên bản')
r = lexicon.hasWord('ai hoài')
self.assertTrue(r)

r = self.lexicon.hasWord('ai Hoài')
r = lexicon.hasWord('ai Hoài')
self.assertFalse(r)

r = self.lexicon.hasWord('tiếng việt')
r = lexicon.hasWord('tiếng việt')
self.assertFalse(r)

def test_has_word_insensitive(self):
    """Lookups on a case-insensitive lexicon succeed regardless of letter case."""
    lexicon = Lexicon.Lexicon(case_sensitive=False)

    # Entries expected to be found, including mixed-case spellings.
    expected_present = (
        'ai hoài',
        'ngoại bang',
        'ngoại giao nhân dân',
        'dự án',
        'Dự án',
        'phiên bản',
        'ai hoài',
        'ai Hoài',
    )
    for word in expected_present:
        self.assertTrue(lexicon.hasWord(word))

    # This entry is expected to be absent from the lexicon.
    self.assertFalse(lexicon.hasWord('tiếng việt'))
42 changes: 42 additions & 0 deletions pyVitk/tool/LexiconTools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import sys
import logging
from datetime import datetime
from ..Lexicon import Lexicon
from ..helper import setup_logging


setup_logging()
logger = logging.getLogger(__name__)

# Command-line dispatch: the first argument selects the maintenance task.
#   flat <outfile>          - call flattenToFile on a default-constructed Lexicon
#   loadflat <infile>       - time inserting every non-blank line of a word list
#   create_insensitive_xml  - build lexicon-insensitive.xml from lexicon.txt
cmd = sys.argv[1]
if cmd == 'flat':
    l = Lexicon()
    l.flattenToFile(sys.argv[2])
elif cmd == 'loadflat':
    n = datetime.now()
    with open(sys.argv[2], encoding='utf8') as f:
        # NOTE(review): Lexicon() also loads the default lexicon file before the
        # inserts below, which is included in the measured time - confirm intended.
        l = Lexicon()
        for line in f:
            # Lines read from a file keep their trailing newline, so the raw
            # line is never empty. Strip first so blank lines are skipped and
            # the newline does not become part of the stored word.
            word = line.strip()
            if len(word) > 0:
                l.insertWord(word)

    d = datetime.now() - n
    logger.debug('timedelta to run: {}'.format(d))
elif cmd == "create_insensitive_xml":  # create lexicon-insensitive.xml
    import os

    # Both the input word list and the output XML live next to this script.
    this_dir, this_filename = os.path.split(__file__)
    lexicon_src = os.path.join(this_dir, 'lexicon.txt')
    lexicon_xml = os.path.join(this_dir, 'lexicon-insensitive.xml')

    l = Lexicon(default=False, case_sensitive=False)
    # Context manager guarantees the word-list file handle is closed.
    with open(lexicon_src, mode='r', encoding='utf-8') as f_lex:
        logger.info('Building trie tree from lexicon.txt')
        lexes = [lex.strip() for lex in f_lex]
    for lex in lexes:
        if len(lex) > 0:
            l.insertWord(lex)

    logger.info('Write to lexicon-insensitive.xml file')
    l.serialize_to_xml(lexicon_xml)
Loading

0 comments on commit c216292

Please sign in to comment.