-
Notifications
You must be signed in to change notification settings - Fork 5
/
ftrie.py
123 lines (104 loc) · 4.26 KB
/
ftrie.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# Copyright 2007-2013 Facundo Batista
# Code adapted from original after post
# http://www.taniquetil.com.ar/plog/post/1/310
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License version 3, as published
# by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranties of
# MERCHANTABILITY, SATISFACTORY QUALITY, or FITNESS FOR A PARTICULAR
# PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program. If not, see <http://www.gnu.org/licenses/>.
#
# For further info, check https://launchpad.net/ftrie
"""A Fucked Trie, or Fluorescent Trie as it's more polite. A Word Tree."""
class WordTree(object):
"""A "trie" of words that carry a payload.
It's loaded at init, must receive a dict where the 'words' are
keys, and the 'payloads' the values.
The words must be unicode (this is validated at building time).
"""
def __init__(self, wordsdata):
self.data = self._fill(sorted(wordsdata.iteritems()))
def _fill(self, wordsdata):
"""Fill the tree."""
data = {}
# first filter the one that ends here (if any)
for i, (word, payload) in enumerate(wordsdata):
if word == u"":
data[None] = payload
del wordsdata[i] # change what iterating, don't care
break # because we exit the loop
firstword, firstpayload = wordsdata[0]
assert isinstance(firstword, unicode), "All words must be unicode"
prvprim = firstword[0]
all_rests = [(firstword[1:], firstpayload)]
morethanone = False
for word, payload in wordsdata[1:]:
assert isinstance(word, unicode), "All words must be unicode"
prim, rest = word[0], word[1:]
if prim == prvprim:
all_rests.append((rest, payload))
morethanone = True
continue
if morethanone:
data[prvprim] = self._fill(all_rests)
else:
data[prvprim] = all_rests[0] # end of branch, a tuple
prvprim = prim
all_rests = [(rest, payload)]
morethanone = False
if morethanone:
data[prvprim] = self._fill(all_rests)
else:
data[prvprim] = all_rests[0] # end of branch, a tuple
return data
def search(self, tosearch):
"""Search the tree.
It returns a list of (word, payload) where is word has as prefix the
searched token.
"""
partial = []
dic = self.data
for char in tosearch:
# dic is string in some "raw branches at the end"
if isinstance(dic, tuple):
wordend, payload = dic
r = u"".join(partial) + wordend
if r == tosearch:
return [(r, payload)]
else:
return []
try:
dic = dic[char]
except KeyError:
# failed to find the next character in the search, so the
# search is not prefix of anything stored
return []
# go on! store the found character
partial.append(char)
if isinstance(dic, tuple):
# the remaining stuff is a raw branch, we're done
wordend, payload = dic
r = u"".join(partial) + wordend
return [(r, payload)]
# at this point we know that the searched token is a prefix of
# several stuff, let's build the result
result = []
def build(dic, base):
"""Build the different result alternatives."""
for char, nextdic in dic.items():
if char is None:
result.append((base, nextdic))
continue
if isinstance(nextdic, tuple):
wordend, payload = nextdic
result.append((base + char + wordend, payload))
else:
build(nextdic, base + char)
build(dic, tosearch)
return result