-
Notifications
You must be signed in to change notification settings - Fork 1
/
index.py
115 lines (94 loc) · 3.27 KB
/
index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import logging
import os
import pickle
from query import Query
from time import time
from collections import defaultdict
logging.basicConfig(level="INFO")
class Index:
def __init__(self, root=None):
self.index_file = "index.pkl"
if os.path.exists(self.index_file) and root is None:
with open(self.index_file, "rb") as fp:
self.index, self.files = pickle.load(fp)
self.fileid = len(self.files)
else:
self.index = defaultdict(set)
self.files = []
self.fileid = 0
if root is None:
logging.error('Error give file name')
quit()
else:
self.build(root)
def build(self, root):
logging.info("index creation starting")
start_time = time()
for root, _, files in os.walk(root):
for file in files:
file = os.path.join(root, file)
self.add_file(file)
logging.info("index created %s", str(time() - start_time))
with open(self.index_file, "wb") as fp:
pickle.dump((self.index, self.files), fp)
logging.info("index written to disk %s", str(time() - start_time))
def add_file(self, file):
try:
with open(file, "r") as fp:
doc = fp.read()
prev = doc[:3]
if len(prev) == 3:
self.index[prev].add(self.fileid)
for char in doc[3:]:
prev = prev[1:] + char
self.index[prev].add(self.fileid)
self.fileid += 1
self.files.append(file)
except UnicodeDecodeError:
# logging.info("Couldn't read", file)
pass
# returns the index
def read(self):
return self.index
# returns intersection of posting lists
@staticmethod
def list_and(lst1, lst2):
return lst1 & lst2
# returns union of posting lists
@staticmethod
def list_or(lst1, lst2):
return lst1 | lst2
# returns candidate file ids
def get_candidate_fileids(self, q):
candid = None
if q.op == Query.QNone:
return set()
if q.op == Query.QAll:
return set(range(len(self.files)))
if q.op == Query.QAnd:
candid = None
for t in q.trigram:
candid = self.index[t]
break
for t in q.trigram:
candid = self.list_and(candid, self.index[t])
if candid is None:
for s in q.sub:
candid = self.get_candidate_fileids(s)
for s in q.sub:
candid = self.list_and(candid, self.get_candidate_fileids(s))
if q.op == Query.QOr:
candid = set()
for t in q.trigram:
candid = self.list_or(candid, self.index[t])
for s in q.sub:
candid = self.list_or(candid, self.get_candidate_fileids(s))
return set() if candid is None else candid
def get_filenames(self, fileids):
return list(map(lambda x: self.files[x], fileids))
def get_filecount(self):
return len(self.files)
if __name__ == "__main__":
index = Index()
index.build("gcs")
logging.info(index.read())