directorydeduplicator.py
#!/usr/bin/env python3
from more_itertools import flatten
from directory_node import DirectoryNode, MiNode
from duplicate_directory_set import DuplicateDirectorySet
from execution_result import ExecutionResult
from progress_tracker import track_file_hash_progress, track_fs_scan_progress
from util import bytes2human, format_elapsed_time, print_message
from multiprocessing import Process, Queue
from collections import defaultdict
from termcolor import colored
from time import time
from typing import Dict, List
import click
import os
import pickle
import sys
import xxhash
BUFFER_SIZE = 1024 * 1024 * 10 # 10MB
EMPTY_FILE_DIGEST = "EMPTY"
file_name_queue = Queue()
bytes_processed_queue = Queue()
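# These queues are shared with the progress-tracker subprocesses started in run()
# (track_fs_scan_progress / track_file_hash_progress): file_name_queue receives one
# entry per file seen, and bytes_processed_queue receives the size of each hashed chunk.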
def build_metadata_tree(directory_path: str, follow_symlinks: bool = False) -> DirectoryNode:
node = DirectoryNode(directory_path)
try:
entries = os.scandir(directory_path)
except PermissionError:
print(f"Could not open directory {directory_path}: permission denied",
file=sys.stderr)
return node
for entry in sorted(entries, key=lambda x: x.name):
try:
if entry.is_symlink() and not follow_symlinks:
continue
elif entry.is_file():
file_size = entry.stat().st_size
node.files[entry.path] = MiNode(entry.path, file_size)
node.disk_space += file_size
node.num_files += 1
file_name_queue.put(entry.name) # for filesystem scanning, the specific values in the queue are unused. Only the number of files is tracked.
elif entry.is_dir():
subdir_node = build_metadata_tree(entry.path, follow_symlinks)
node.disk_space += subdir_node.disk_space
node.num_files += subdir_node.num_files
node.num_subdirectories += subdir_node.num_subdirectories + 1
node.subdir_nodes[entry.path] = subdir_node
except Exception:
print(f"Could not read metadata for path {entry.path}", file=sys.stderr)
return node
def get_summary_str(file_or_folder_name: str, digest: str, match_names: bool) -> str:
return f"{file_or_folder_name}_{digest}" if match_names else digest
def get_file_summary(file_node: MiNode) -> str:
if file_node.disk_space == 0:
# TODO improve handling of empty files (exclude from results by default?)
return EMPTY_FILE_DIGEST
hash_builder = xxhash.xxh3_128()
digest: str = ""
try:
# hash the contents of the file
with open(file_node.path, "rb") as current_file:
file_name_queue.put(file_node.name)
while file_chunk := current_file.read(BUFFER_SIZE):
hash_builder.update(file_chunk)
bytes_processed_queue.put(len(file_chunk))
digest = hash_builder.hexdigest()
except (PermissionError, OSError):
print(f"Could not open file {file_node.path}",
file=sys.stderr)
# Hashing the full file path should ensure a unique hash,
        # preventing this directory (and its parents) from being considered duplicates.
# Is this desirable behavior?
content = f"Couldn't Read: {file_node.path}".encode()
digest = xxhash.xxh3_128_hexdigest(content)
return digest
def build_hash_map(node: DirectoryNode,
working_hash_map: Dict[str, List[DirectoryNode]],
match_names: bool = False) -> str:
file_hashes: List[str] = []
for file_node in node.files.values():
file_summary: str = get_summary_str(file_node.name, get_file_summary(file_node), match_names)
file_hashes.append(file_summary)
subdir_hashes: List[str] = []
for child_node in node.subdir_nodes.values():
subdir_digest = build_hash_map(child_node, working_hash_map, match_names)
subdir_summary = get_summary_str(child_node.name, subdir_digest, match_names)
subdir_hashes.append(subdir_summary)
node_summary: str = ",".join(sorted(file_hashes) + sorted(subdir_hashes))
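    # e.g. with match_names=False the summary is a comma-joined list of sorted child
    # digests; with match_names=True each piece is "<name>_<digest>" (see
    # get_summary_str), so identically structured trees with different names hash differently.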
node.hash = xxhash.xxh3_128_hexdigest(node_summary.encode())
working_hash_map[node.hash].append(node)
return node.hash
def simplify_duplicates(hash_map: Dict[str, List[DirectoryNode]]) -> None:
# Since dictionaries are ordered (as of python 3.7), we know that later hashes/entries/nodes
    # will be parents of earlier nodes. We can use the reversed() iterator (supported as of 3.8)
# to start from the root node's hash, and work our way "down" the tree.
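    # Example: if /a contains /a/b, build_hash_map appends hash(/a/b) before hash(/a),
    # so reversed(hash_map.items()) yields the root's hash first.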
for hash, nodes in reversed(hash_map.items()):
if len(list(filter(lambda n: not n.exclude, nodes))) > 1: # there are multiple directories with this hash (duplicates)
simplify_duplicates_helper(hash, hash_map)
def simplify_duplicates_helper(hash: str, hash_map: Dict[str, List[DirectoryNode]]) -> None:
curr_nodes = hash_map[hash]
dirs_with_hash = len(curr_nodes)
for child_nodes_for_hash in map(lambda n: hash_map[n.hash], curr_nodes[0].subdir_nodes.values()):
if len(child_nodes_for_hash) == dirs_with_hash: # we don't want to report subdirectories of duplicate dirs...
for n in child_nodes_for_hash:
n.exclude = True
else: # ... unless there is at least one additional directory that is a duplicate of this subdirectory
# In this case, we want to exclude all but one of the original set of duplicates,
# so the "space saved" calculation can remain accurate.
# You are not expected to understand this
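            # Example: /a and /b are duplicates, each containing a subdirectory "sub"
            # with hash S, and an unrelated /c/other also hashes to S. One child
            # (/a/sub) is kept so the set for S is reported as {/a/sub, /c/other};
            # /b/sub is excluded because the /a vs. /b set already accounts for it.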
child_nodes_of_curr_nodes = list(flatten(map(lambda n: filter(lambda n2: n2.hash == child_nodes_for_hash[0].hash, n.subdir_nodes.values()), curr_nodes)))
for n in child_nodes_of_curr_nodes[1:]: # don't exclude one of the children of the original nodes
n.exclude = True
simplify_duplicates_helper(child_nodes_for_hash[0].hash, hash_map)
def find_duplicate_directory_sets(
directory_hash_map: Dict[str, List[DirectoryNode]]
) -> List[DuplicateDirectorySet]:
duplicate_directory_sets = []
for directory_set in directory_hash_map.values():
filtered_dir_set = list(filter(lambda n: not n.exclude, directory_set))
if len(filtered_dir_set) > 1:
num_files = directory_set[0].num_files
num_subdirs = directory_set[0].num_subdirectories
disk_space = directory_set[0].disk_space
duplicate_directory_sets.append(
DuplicateDirectorySet(disk_space, num_files, num_subdirs,
filtered_dir_set))
return duplicate_directory_sets
@click.command()
@click.argument("directory-path",
type=click.Path(exists=True, file_okay=False))
@click.option("--follow-symlinks",
is_flag=True,
default=False,
help="Follow symbolic links")
@click.option("--match-names",
is_flag=True,
default=False,
help="Require file & subdirectory names to match")
@click.option("--import-file",
# type=click.Path(allow_dash=False, dir_okay=False, exists=True, file_okay=True),
multiple=True,
default=[],
help="""Import data from a previous scan and compare with current scan.
Can take the form of a path or <tag>=<path>, where <tag> is used to annotate results from this import.
Can be used multiple times to import multiple files.""")
@click.option("--export-file",
# type=click.Path(writable=True, dir_okay=False, exists=True),
help="""Export scan data for future import.
Can take the form of a path or <tag>=<path>, where <tag> is used to annotate results from this export.
If no tag is provided, a default tag will be generated based on the export file name.""")
@click.option("--report-path",
type=click.Path(writable=True, exists=False, allow_dash=True),
default="-",
help="Path to write final report for deduplication (Default: \"-\")")
def run(directory_path: str, follow_symlinks: bool, match_names: bool, import_file: List[str], export_file: str, report_path: str) -> None:
# import_file is a list but because of limitations with Click, the name is singular. Renaming for clarity.
import_files: List[str] = import_file
print("Scanning file metadata...")
fs_scan_start_time: float = time()
fs_scan_tracker = Process(target=track_fs_scan_progress, name="fs_scan_tracker", args=(file_name_queue,), daemon=True)
fs_scan_tracker.start()
root_node: DirectoryNode = build_metadata_tree(directory_path=directory_path, follow_symlinks=follow_symlinks)
fs_scan_tracker.terminate()
fs_scan_tracker.join() # block/wait until the process is actually killed
fs_scan_elapsed_time: str = format_elapsed_time(time() - fs_scan_start_time)
num_dirs = root_node.num_subdirectories + 1 # add one to num_subdirectories for root node
print_message(f"Found {root_node.num_files} files ({bytes2human(root_node.disk_space)}), {num_dirs} folders in {fs_scan_elapsed_time}",
line_width=80, file=sys.stderr)
print()
directory_hash_map = defaultdict(list)
# load/import results from files, if provided
for import_path in import_files:
import_tag: str = ""
if "=" in import_path: # pull out tag name if provided
import_tag: str = import_path[:import_path.index("=")]
import_path: str = import_path[import_path.index("=") + 1:]
with open(import_path, 'rb') as f:
print(f"Importing results from {import_path}...", end="")
imported_results: ExecutionResult = pickle.load(f)
if not import_tag:
import_tag = imported_results.tag # if no tag is provided, use the tag saved with the results to import
for hash, nodes in imported_results.hashes.items():
for n in nodes:
if not n.tag: # keep original tags when nodes are imported, exported, then re-imported
n.tag = import_tag
directory_hash_map[hash].extend(nodes)
print("done")
file_hash_start_time: float = time()
file_hash_tracker = Process(target=track_file_hash_progress, name="file_hash_tracker", args=(file_name_queue, bytes_processed_queue), daemon=True)
file_hash_tracker.start()
build_hash_map(root_node, directory_hash_map, match_names)
file_hash_tracker.terminate()
file_hash_tracker.join() # block/wait until the process is actually killed
file_name_queue.close()
file_name_queue.join_thread()
bytes_processed_queue.close()
bytes_processed_queue.join_thread()
file_hash_elapsed_time: str = format_elapsed_time(time() - file_hash_start_time)
num_dirs = root_node.num_subdirectories + 1 # add one to num_subdirectories for root node
print_message(f"Scanned {root_node.num_files} files ({bytes2human(root_node.disk_space)}), " +
f"{num_dirs} folders in {file_hash_elapsed_time}",
file=sys.stderr)
print()
if export_file:
export_tag = os.path.splitext(export_file)[0]
if "=" in export_file: # pull out tag name if provided
export_tag: str = export_file[:export_file.index("=")]
export_file = export_file[export_file.index("=") + 1:]
print(f"Exporting results to {export_file}...", end="")
with open(export_file, 'wb') as f:
run_result: ExecutionResult = ExecutionResult(root_node, directory_hash_map, export_tag)
pickle.dump(run_result, f)
print(" done")
simplify_duplicates(directory_hash_map)
duplicate_directory_sets = find_duplicate_directory_sets(
directory_hash_map)
potential_space_savings = 0
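    # Only report sets that include at least one node from the current scan (untagged);
    # sets made up entirely of imported nodes are skipped.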
filtered_results = filter(lambda directory_set: any(map(lambda n : not n.tag, directory_set.directory_nodes)), duplicate_directory_sets)
write_report_to_file = not report_path == "-"
report_destination = open(report_path, 'w') if write_report_to_file else sys.stdout
try:
for dir_set in sorted(filtered_results,
key=lambda x: x.disk_space,
reverse=True):
summary = ", ".join([
f"{dir_set.num_files} files",
f"{dir_set.num_subdirectories} folders",
f"{bytes2human(dir_set.disk_space)}"
])
print(f"Duplicate directory set ({summary}):", file=report_destination)
for node in dir_set.directory_nodes:
node_tag_str = ""
if node.tag:
# don't colorize output when writing to disk
node_tag_str = f"({node.tag if write_report_to_file else colored(node.tag, 'green')})"
print(f"\t{node_tag_str} {node.path}", file=report_destination)
potential_space_savings += dir_set.disk_space * (
len(dir_set.directory_nodes) - 1)
if write_report_to_file:
print(f"Wrote output report to {report_path}")
finally:
if write_report_to_file:
report_destination.close()
print(f"Potential space savings: {bytes2human(potential_space_savings)}")