find_duplicates.py
#!/usr/bin/env python3
import argparse
import os
import hashlib
import logging
import math
import pickle
import json
import time
from datetime import timedelta
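# Example invocation (paths below are illustrative, not part of the repository):
#   ./find_duplicates.py /data/originals /data/backup             # dry-run: only reports matches
#   ./find_duplicates.py /data/originals /data/backup --hardlink  # replace duplicates in /data/backup with hardlinks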
def migrate_database():
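    """Migrate a legacy pickle hash cache (hashes.p) to the JSON format, after asking for confirmation.

    The legacy cache used (basename, filesize) tuples as keys and contained only sha1 hashes;
    the JSON cache uses 'filesize basename' string keys grouped by hash function name.
    """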
if not os.path.exists(hash_file_path):
if os.path.exists(hash_file_path_legacy):
migrate = ""
while migrate != "y" and migrate != "n":
migrate = input("Unmigrated pickle file found. Do you wish to migrate it? (y/n): ").lower()
if migrate == "n":
print("Support for the old cache format has been dropped. Migration is necessary.")
exit()
legacy_data = load_hashes_legacy(hash_file_path_legacy)
changed_data = dict()
# json cannot use tuples as keys so we use a string
for key in legacy_data:
basename, filesize = key
new_key = f"{filesize} {basename}"
changed_data[new_key] = legacy_data[key]
# previously, only sha1 was supported, so it follows that all hashes in the pickle file are sha1 hashes
data = {"sha1": changed_data}
save_hashes(hash_file_path, data)
print("renaming legacy pickle file")
os.rename(hash_file_path_legacy, os.path.join(directory, "hashes_legacy.p"))
def main():
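    """Parse arguments, hash all source and destination files, report duplicates, and optionally replace destination duplicates with links."""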
    parser = argparse.ArgumentParser(description='Find files in the destination that duplicate files in the source, '
                                     'delete each destination duplicate and replace it with a link to the corresponding source file.')
parser.add_argument('source', nargs='+', help='One or more paths to the source file or folder (the original location(s) of the files to be linked).')
parser.add_argument('destination', help='Path to the destination file or folder (where symbolic links will be created, pointing to the source).')
parser.add_argument('--softlink', dest='softlink', action='store_true',
help='Create softlinks. Without specifying soft- or hardlinks the script does just a dry-run.')
parser.add_argument('--hardlink', dest='hardlink', action='store_true',
help='Create hardlinks. Without specifying soft- or hardlinks the script does just a dry-run.')
parser.add_argument('--follow-symlinks', dest="follow_symlinks", action='store_true',
help='Set this to follow symlinks, can result in redundant work or problems. Default: false')
parser.add_argument('--no-cache', dest="no_cache", action="store_true",
                        help="Deactivate caching based on filename-filesize combination. Caching can cause problems, but improves speed immensely for repeated executions.")
parser.add_argument('--no-source-cache', dest="no_source_cache", action="store_true",
help="Deactivate caching based on filename-filesize combination for the source directory.")
parser.add_argument('--no-destination-cache', dest="no_destination_cache", action="store_true",
help="Deactivate caching based on filename-filesize combination for the destination directory.")
parser.add_argument('--print-hashes', dest="print_hashes", action="store_true",
help="Prints list of all files for debugging purposes.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--sha1', dest='hash_function', action='store_const', const=hashlib.sha1, help='Use sha1 hashes (default)')
    group.add_argument('--sha256', dest='hash_function', action='store_const', const=hashlib.sha256, help='Use sha256 hashes')
    group.add_argument('--sha512', dest='hash_function', action='store_const', const=hashlib.sha512, help='Use sha512 hashes')
    group.add_argument('--md5', dest='hash_function', action='store_const', const=hashlib.md5, help='Use md5 hashes')
parser.add_argument(
'-d', '--debug',
help="Print lots of debugging statements",
action="store_const", dest="loglevel", const=logging.DEBUG,
default=logging.WARNING,
)
parser.add_argument(
'-v', '--verbose',
help="Be verbose",
action="store_const", dest="loglevel", const=logging.INFO,
)
args = parser.parse_args()
logging.basicConfig(level=args.loglevel)
follow_symlinks = args.follow_symlinks
softlink = args.softlink
hardlink = args.hardlink
    hash_function = args.hash_function
    if hash_function is None:
        hash_function = hashlib.sha1
    # hashlib constructors are typically named like 'openssl_sha1'; [8:] strips that prefix
    hash_function_name = hash_function.__name__[8:]
    if hash_function_name not in hashes:
hashes[hash_function_name] = dict()
if softlink and hardlink:
print("Cannot create soft- and hardlinks at the same time. Choose one.")
exit()
use_source_cache = (not args.no_cache) and (not args.no_source_cache)
use_destination_cache = (not args.no_cache) and (not args.no_destination_cache)
print("-"*40)
print("Use Cache: "+str(not args.no_cache))
print("Use source cache: "+str(use_source_cache))
print("Use destination cache: "+str(use_destination_cache))
print("-"*40)
source_files = get_all_files(args.source)
destination_files = get_all_files(args.destination)
source_hashes = dict() # hash: abspath
destination_hashes = dict() # hash: abspath
    source_files_unhashed_size = sum(
        safe_file_size(f) for f in source_files
        if not has_file_hash(f, hash_function_name) or not use_source_cache)
    source_files_hashed_size = 0
    destination_files_unhashed_size = sum(
        safe_file_size(f) for f in destination_files
        if not has_file_hash(f, hash_function_name) or not use_destination_cache)
    destination_files_hashed_size = 0
start_time = time.time()
for sf in source_files:
if not os.path.islink(sf) or follow_symlinks:
add = (not has_file_hash(sf, hash_function_name)) or (not use_source_cache)
hash = get_file_hash(sf, hash_function, use_source_cache)
if add:
source_files_hashed_size += safe_file_size(sf)
time_used = (time.time() - start_time)
time_left = (time_used / (source_files_hashed_size / source_files_unhashed_size)) - time_used
                print("Source files hashed so far: "
                      + ("%.2f" % (source_files_hashed_size / float(10**9))) + "/"
                      + ("%.2f" % (source_files_unhashed_size / float(10**9))) + " GB ("
                      + ("%.2f" % (source_files_hashed_size * 100 / source_files_unhashed_size)) + "%)"
                      + " avg. speed: " + ("%.2f" % (source_files_hashed_size / time_used / float(10**6))) + " MB/s"
                      + " ETA: " + calculate_elapsed_time(time_left))
if hash is not None:
source_hashes[hash] = os.path.abspath(sf)
start_time = time.time()
for df in destination_files:
if not os.path.islink(df) or follow_symlinks:
add = (not has_file_hash(df, hash_function_name)) or (not use_destination_cache)
hash = get_file_hash(df, hash_function, use_destination_cache)
if add:
                destination_files_hashed_size += safe_file_size(df)
time_used = (time.time() - start_time)
time_left = (time_used / (destination_files_hashed_size / destination_files_unhashed_size)) - time_used
                print("Destination files hashed so far: "
                      + ("%.2f" % (destination_files_hashed_size / float(10**9))) + "/"
                      + ("%.2f" % (destination_files_unhashed_size / float(10**9))) + " GB ("
                      + ("%.2f" % (destination_files_hashed_size * 100 / destination_files_unhashed_size)) + "%)"
                      + " avg. speed: " + ("%.2f" % (destination_files_hashed_size / time_used / float(10**6))) + " MB/s"
                      + " ETA: " + calculate_elapsed_time(time_left))
if hash is not None:
destination_hashes[hash] = os.path.abspath(df)
matches = []
comm_match_filesize = 0
    if args.print_hashes:
print("-------------------")
print("Source files: ")
for sh in source_hashes:
print(" "+str(source_hashes[sh])+":\n "+str(sh)+"\n "+str(os.path.getsize(source_hashes[sh])))
print("-------------------")
print("Destination files: ")
for dh in destination_hashes:
print(" "+str(destination_hashes[dh])+":\n "+str(dh)+"\n "+str(os.path.getsize(destination_hashes[dh])))
timestamp_before_compare = time.time()
    for sh in source_hashes:
        # dictionary membership test instead of a nested scan over all destination hashes
        if sh in destination_hashes:
            file_size = os.path.getsize(source_hashes[sh])
            comm_match_filesize += file_size
            print("Match found: " + ("%.2f" % (file_size / float(10**9))) + " GB")
            print(" '" + source_hashes[sh] + "'")
            print("--->'" + destination_hashes[sh] + "'")
            matches.append((source_hashes[sh], destination_hashes[sh]))
    print("Compared " + str(len(source_hashes)) + " source files with " + str(len(destination_hashes)) + " destination files in " + ("%.1f" % (time.time() - timestamp_before_compare)) + "s")
    print("In total " + str(len(matches)) + " matches found with a total size of " + ("%.2f" % (comm_match_filesize / float(10**9))) + " GB")
if softlink or hardlink:
if softlink:
print("Creating softlinks...")
else:
print("Creating hardlinks...")
for source, destination in matches:
os.unlink(destination)
if softlink:
os.symlink(source, destination)
else:
os.link(source, destination)
print("Done.")
else:
print("Dry-run done.")
print("If you are happy with the dry-run use")
print(" --softlink to create softlinks")
print(" --hardlink to create hardlinks")
def save_hashes(file, data):
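    """Save the hash cache as JSON, backing up any existing cache file to hashes_backup.json first."""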
if os.path.exists(file):
logging.debug("Backing up old hashes")
backup_path = os.path.join(directory, "hashes_backup.json")
if os.path.exists(backup_path):
os.remove(backup_path)
os.rename(file, backup_path)
    logging.info("Saving hashes json to " + file)
    with open(file, 'w') as fp:
        json.dump(data, fp)
    logging.debug("Saving complete.")
def save_hashes_legacy(file, data):
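    """Save the hash cache in the legacy pickle format."""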
logging.info("Saving hashes pickle to "+file)
with open(file, 'wb') as fp:
pickle.dump(data, fp, protocol=pickle.HIGHEST_PROTOCOL)
logging.debug("Saving complete.")
def load_hashes(file):
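    """Load the JSON hash cache from disk."""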
    logging.info("Loading hash cache from json file.")
    with open(file, 'r') as fp:
        return json.load(fp)
def load_hashes_legacy(file):
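    """Load a hash cache from the legacy pickle format (used during migration)."""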
logging.info("Loading hashes from pickle file "+file)
with open(file, 'rb') as fp:
data = pickle.load(fp)
return data
def progress_bar(current, total, text, bar_length=20):
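    """Print a single-line progress bar, overwriting the line until current == total."""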
fraction = current / total
arrow = int(fraction * bar_length - 1) * '-' + '>'
padding = int(bar_length - len(arrow)) * ' '
ending = '\n' if current == total else '\r'
print(f'{text}: [{arrow}{padding}] {int(fraction*100)}% {current}/{total}', end=ending)
def calculate_elapsed_time(time_left):
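    """Format a number of seconds as a human-readable duration string (days, hours, minutes, seconds)."""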
    formatted_time = timedelta(seconds=time_left)
    days = formatted_time.days
    hours, remainder = divmod(formatted_time.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    # Collect only the non-zero components
    time_components = []
    if days != 0:
        time_components.append(f"{days} days")
    if hours != 0:
        time_components.append(f"{hours} hours")
    if minutes != 0:
        time_components.append(f"{minutes} minutes")
    # Fall back to seconds so durations under a minute do not produce an empty string
    if seconds != 0 or not time_components:
        time_components.append(f"{seconds} seconds")
    # Join the components into a human-readable string
    return ", ".join(time_components)
def hash_file(file, hash_function=hashlib.sha1):
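    """Hash a file in BUF_SIZE chunks with a progress bar; return the hex digest, or None if the file is empty or unreadable."""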
    try:
        filesize = os.path.getsize(file)
    except OSError:
        return None
    if filesize <= 0:
        return None
    BUF_SIZE = 65536  # read the file in 64 KiB chunks
parts = math.ceil(filesize / BUF_SIZE)
logging.info("Reading file '"+file+"' in "+str(parts)+" parts.")
hash_object = hash_function()
counter = 0
text = "'"+os.path.basename(file)+"' hash progress"
progress_bar(counter, parts, text, bar_length=20)
start_time = time.time()
try:
with open(file, 'rb') as f:
while True:
data = f.read(BUF_SIZE)
if not data:
break
hash_object.update(data)
                # redraw the bar only when the integer percentage changes
                if (counter * 100 // parts) != ((counter + 1) * 100 // parts):
                    progress_bar(counter, parts, text, bar_length=20)
                counter += 1
        # finish the progress bar so the summary line starts on a fresh line
        progress_bar(parts, parts, text, bar_length=20)
        end_time = time.time()
        print("Calculating " + hash_function.__name__[8:] + " hash of '" + os.path.basename(file) + "' took "
              + ("%.1f" % (end_time - start_time)) + " s with avg. speed of "
              + ("%.2f" % (os.path.getsize(file) / (end_time - start_time) / float(1 << 20))) + " MB/s")
except OSError as error:
print("")
print(error)
print("")
return None
return hash_object.hexdigest()
def _file_size(file):
return os.path.getsize(file)
def safe_file_size(file):
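    """Return the size of a file in bytes, or 0 if it cannot be determined."""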
try:
return _file_size(file)
    except OSError:
return 0
def has_file_hash(file, hash_function_name):
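    """Return True if the cache already holds a hash for this file's 'filesize basename' key."""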
filesize = None
try:
filesize = _file_size(file)
    except OSError as e:
        logging.warning(e)
if filesize is not None:
basename = os.path.basename(file)
key = f"{filesize} {basename}"
return key in hashes[hash_function_name]
return False
def get_file_hash(file, hash_function, use_cache):
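    """Return the hash for a file, using the cache when use_cache is set; newly computed hashes are stored in the cache."""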
filesize = safe_file_size(file)
basename = os.path.basename(file)
key = f"{filesize} {basename}"
hash_function_name = hash_function.__name__[8:]
if use_cache and key in hashes[hash_function_name]:
if hashes[hash_function_name][key] is not None:
logging.info("Found file hash for "+str(key)+": "+hashes[hash_function_name][key])
else:
logging.info("Found None-hash for "+str(key))
return hashes[hash_function_name][key]
    hash = hash_file(file, hash_function)
    # newly computed hashes (including failed ones, stored as None) are always cached and persisted
    hashes[hash_function_name][key] = hash
    save_hashes(hash_file_path, hashes)
if hash is not None:
logging.info("Calculated file hash for "+str(key)+": "+hash)
else:
logging.info("Could not calculate hash.")
return hash
def get_all_files(folder):
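    """Recursively list all files below a folder; accepts a single path or a list of paths."""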
    if isinstance(folder, list):
        return [file for f in folder for file in get_all_files(f)]
    found = []
    for path, subdirs, files in os.walk(folder):
        for name in files:
            found.append(os.path.join(path, name))
    logging.info("Found " + str(len(found)) + " files in '" + folder + "'")
    return found
directory = os.path.dirname(os.path.realpath(__file__))
hash_file_path = os.path.join(directory, "hashes.json")
hash_file_path_legacy = os.path.join(directory, "hashes.p")
migrate_database()
if os.path.exists(hash_file_path):
    # cache layout: dict(hash_function_name: dict("filesize basename": hash))
    hashes = load_hashes(hash_file_path)
else:
    hashes = dict()
logging.info("Loaded " + str(sum([len(hashes[name]) for name in hashes])) + " previously calculated hashes.")
if __name__ == "__main__":
main()