-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsearch_storage.py
executable file
·199 lines (161 loc) · 5.45 KB
/
search_storage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
#!/usr/bin/env python
"""
Index files in subfolder
"""
import os, stat
import subprocess
import socket
import logging
from crc32 import crc32
from storage import Storage
# Time-of-day layout for log records.  There is no %f (fractional seconds)
# in it; the milliseconds are appended through %(msecs)03d in FORMAT.
TIMESTMP_FMT = "%H:%M:%S"
FORMAT = '%(asctime)s.%(msecs)03d %(name)-5s %(message)s'
logging.basicConfig(format=FORMAT, datefmt=TIMESTMP_FMT, level=logging.INFO)
# Trace file that FindFiles.search() appends one path per line to.
# Line buffering (1) flushes every newline-terminated entry as it is written;
# buffering=0 is rejected with ValueError for text-mode files on Python 3.
VISIT_LOG = open('visit.log', 'w', buffering=1)
def sha1(name):
    """
    Return the SHA-1 digest of file `name` as a hexadecimal string.

    The digest is computed by the external /usr/bin/sha1sum tool
    (original note: does not belong here; 1.4GB in 3.8 sec).

    Raises subprocess.CalledProcessError when sha1sum exits with a
    non-zero status, e.g. for a file that does not exist.
    """
    # "--" ends option parsing, so a name starting with "-" is still
    # treated as a file name.
    out = subprocess.check_output(["/usr/bin/sha1sum", "--", name],
                                  stderr=subprocess.STDOUT)
    # check_output() returns bytes on Python 3; str() on bytes would
    # produce "b'...'" instead of the text.
    if not isinstance(out, str):
        out = out.decode("utf-8", "replace")
    fields = out.split(None, 1)
    if not fields:
        return ""
    # The line reads "<digest>  <name>".  GNU sha1sum starts it with a
    # backslash when the name contains a backslash or newline.
    return fields[0].lstrip("\\")
class FindFiles:
    """
    Walk a directory tree and register its regular files in a storage
    backend.

    `dbs` is the storage object; this class calls its lookup_inode(),
    add_inode() and add_file() methods.  Paths that could not be
    processed are collected in the `problem_files` list.
    """

    # Version-control directories that search() does not descend into.
    _PRUNED_DIRS = ('.git', '.svn')

    # (mode predicate, description) pairs for special files visit() skips.
    _IGNORED_KINDS = (
        (stat.S_ISSOCK, "socket"),
        (stat.S_ISCHR, "character special device file"),
        (stat.S_ISBLK, "block special device file"),
        (stat.S_ISFIFO, "FIFO (named pipe)"),
    )

    def __init__(self, dbs):
        self.problem_files = []
        self.log = logging.getLogger("FindFiles")
        self.dbs = dbs

    def _record_problem(self, full_name):
        """
        Report `full_name` on stdout and remember it in problem_files
        """
        # Single-argument call form: prints the same on Python 2 and 3.
        print("problem file %s" % full_name)
        self.problem_files.append(full_name)

    def visit_inode(self, _stat, full_name):
        """
        Check if inode is already indexed, or mtime has changed
        otherwise calculate the checksum and update

        `_stat` is the stat result of `full_name`.  Nothing is written
        when the stored mtime of the inode equals `_stat.st_mtime`.
        """
        log = self.log
        dbs = self.dbs
        known = dbs.lookup_inode(_stat.st_dev, _stat.st_ino)
        if known:
            mtime = known['st_mtime']
            if mtime == _stat.st_mtime:
                log.debug("hard link %s mtime %r valid, skip...", full_name,
                          mtime)
                return
            log.info("hard link %s mtime %r changed, reindex...", full_name,
                     mtime)
        try:
            # sha1(full_name) is disabled; a placeholder is stored instead.
            _sha1 = "no-sha1"
            _crc32 = crc32(full_name)
            dbs.add_inode(_stat.st_dev, _stat.st_ino, _stat.st_mtime,
                          _stat.st_size, _crc32, _sha1)
        except subprocess.CalledProcessError:
            self._record_problem(full_name)

    def visit(self, full_name):
        """
        Check regular file and add to database

        Symbolic links, sockets, device files, FIFOs and any other
        non-regular file are logged and skipped.  A path that cannot be
        stat'ed (e.g. it vanished during the walk) is recorded as a
        problem file instead of aborting the scan.
        """
        log = self.log
        try:
            # lstat() does not follow symbolic links, so a dangling link
            # is reported as a link rather than raising.
            _stat = os.lstat(full_name)
        except OSError as err:
            log.warning("cannot stat %s: %s", full_name, err)
            self._record_problem(full_name)
            return
        mode = _stat.st_mode
        if stat.S_ISLNK(mode):
            log.info("ignore symbolic link: %s", full_name)
            return
        for is_kind, label in self._IGNORED_KINDS:
            if is_kind(mode):
                log.info("ignore %s: %s", label, full_name)
                return
        if not stat.S_ISREG(mode):
            # Explicit check instead of assert, which is stripped under -O.
            log.info("ignore non-regular file: %s", full_name)
            return
        try:
            self.visit_inode(_stat, full_name)
            self.dbs.add_file(full_name, _stat.st_dev, _stat.st_ino)
        except subprocess.CalledProcessError:
            self._record_problem(full_name)

    def search(self, path):
        """
        Search folder

        Walks `path` recursively without entering .git/.svn directories,
        writes every candidate path to VISIT_LOG and hands it to visit().

        Returns the list of problem files collected so far (the same
        list object as self.problem_files).
        """
        for root, dirs, files in os.walk(path):
            # Prune in place so os.walk() does not descend into them.
            dirs[:] = [name for name in dirs if name not in self._PRUNED_DIRS]
            for fna in files:
                VISIT_LOG.write("%s/%s\n" % (root, fna))
                self.visit(os.path.join(root, fna))
        return self.problem_files
def print_duplicates(dbs):
    """
    Print duplicates with filename

    Writes every row yielded by dbs.duplicates() to stdout, one row per
    line.
    """
    for row in dbs.duplicates():
        # Single-argument call form: prints the same on Python 2 and 3.
        print(row)
def unit_test():
    """
    Common errors, special files

    Manual smoke test: runs sha1() on an existing and on a missing file,
    indexes ./test-files while a unix socket exists in it, creates hard
    links and copies there, indexes again, prints the duplicates and
    finally prints the crc32 of a small file.

    Works on the test-files directory relative to the current working
    directory.
    """
    sha1('/etc/passwd')
    try:
        sha1('000_unit_test_not_exist')
    except subprocess.CalledProcessError as err:
        print(err)
    dbm = Storage(memory=False)
    #dbm.recreate()
    _iter = FindFiles(dbm)
    socket_path = 'test-files/test-socket'
    # Remove a socket file left over from an earlier run, if any.
    try:
        os.unlink(socket_path)
    except OSError:
        pass
    sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    try:
        sock.bind(socket_path)
        _iter.search('./test-files')
    finally:
        # Release the socket and its file even when the search fails.
        sock.close()
        if os.path.exists(socket_path):
            os.unlink(socket_path)
    # Three names for one inode plus three independent copies.
    os.system("echo a > test-files/hardlink1.txt")
    os.system("ln -f test-files/hardlink1.txt test-files/hardlink2.txt")
    os.system("ln -f test-files/hardlink1.txt test-files/hardlink3.txt")
    os.system("cp -f test-files/hardlink1.txt test-files/copy1.txt")
    os.system("cp -f test-files/hardlink1.txt test-files/copy2.txt")
    os.system("cp -f test-files/hardlink1.txt test-files/copy3.txt")
    _iter.search('./test-files')
    print("\nduplicate keys:")
    print_duplicates(dbm)
    os.system("echo a > test-files/crc32-test.txt")
    _crc32 = crc32("test-files/crc32-test.txt")
    os.unlink("test-files/crc32-test.txt")
    print("crc32 %d" % _crc32)
if __name__ == "__main__":
    # Script entry: run the smoke test, then index test-files and report.
    unit_test()
    #os.abort()
    DB2 = Storage(memory=False)
    #DB2.recreate() # TODO
    __ITER__ = FindFiles(DB2)
    DB2.begin_adding_files()
    #PROBLEM_FILES = __ITER__.search('/home/afenkart')
    # search() may return nothing; the collected paths are always available
    # on the problem_files attribute, so fall back to it.
    PROBLEM_FILES = __ITER__.search('test-files') or __ITER__.problem_files
    # TODO, do not commit every insertion, but do not rollback everything
    DB2.done_adding_files()
    print("\nduplicate keys:")
    print_duplicates(DB2)
    if PROBLEM_FILES:
        print("\nproblem files:")
        print("\n".join(PROBLEM_FILES))