-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathduper.js
executable file
·249 lines (216 loc) · 7.96 KB
/
duper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
var crypto = require('crypto');
var fs = require('fs');
var path = require('path');

var fs_extra = require('fs-extra');
var _ = require('lodash');
// Percentage of a large file's blocks that calc_hash() reads when hashing;
// 100 disables sampling so that whole files are hashed.
const hash_sampling_percent = 1;
// While true, remove_files() only logs the moves it would make.
const dry_run = true;
// Number of bytes calc_hash() reads for each sampled block.
const blocksize = 4 * 1024;
/**
 * Walk a directory tree depth-first and pass every regular file to addfile().
 *
 * Entries that lstat reports as neither a regular file nor a directory are
 * ignored. Before each entry is examined, progress_callback is invoked with
 * the progress stack: one { current, count } object per directory level that
 * is currently being walked, outermost first.
 *
 * @param {string} path - directory to walk
 * @param {function(Array)} progress_callback - called once per directory entry
 * @param {Array} [progress] - progress stack shared between recursion levels;
 *   callers normally omit it
 */
function adddir( path, progress_callback, progress ) {
    const levels = progress === undefined ? [] : progress;
    const entries = fs.readdirSync( path );
    const level = { current: 0, count: entries.length };

    levels.push( level );
    for ( const name of entries ) {
        level.current += 1;
        progress_callback( levels );

        const child = path + '/' + name;
        const info = fs.lstatSync( child );
        if ( info.isFile() ) {
            addfile( child );
        } else if ( info.isDirectory() ) {
            adddir( child, progress_callback, levels );
        }
    }
    // this directory is finished: drop its level from the shared stack
    levels.pop();
}
// Files seen so far, grouped by size: "size_<bytes>" -> array of file paths.
var by_size = {};

/**
 * Register a file in by_size under the key derived from its size in bytes.
 *
 * @param {string} file - path of the file to register
 */
function addfile( file ) {
    const key = "size_" + fs.lstatSync( file ).size;
    if ( by_size[key] === undefined ) {
        by_size[key] = [ file ];
    } else {
        by_size[key].push( file );
    }
}
/**
 * Compute a SHA-256 hex digest used to compare file contents.
 *
 * Files smaller than 1 MiB, and every file when sampling_percent is 100 or
 * more, are hashed completely. Larger files are only sampled: one block of
 * block_size bytes is read every (100 / sampling_percent) blocks, starting at
 * offset 0. Equal digests of sampled files therefore do not prove that the
 * files are identical.
 *
 * @param {string} file - path of the file to hash
 * @param {number} [sampling_percent=hash_sampling_percent] - share of blocks
 *   to read from large files; must be greater than 0
 * @param {number} [block_size=blocksize] - bytes per sampled block; must be a
 *   positive integer
 * @returns {string} hex-encoded SHA-256 digest
 * @throws {RangeError} if sampling_percent or block_size is out of range
 */
function calc_hash( file, sampling_percent = hash_sampling_percent, block_size = blocksize ) {
    // A non-positive percentage would hash no data at all and make every
    // large file of the same size look like a duplicate.
    if ( !( sampling_percent > 0 ) ) {
        throw new RangeError( 'sampling_percent must be greater than 0, got ' + sampling_percent );
    }
    if ( !Number.isInteger( block_size ) || block_size <= 0 ) {
        throw new RangeError( 'block_size must be a positive integer, got ' + block_size );
    }

    const shasum = crypto.createHash( 'sha256' );
    const size = fs.lstatSync( file ).size;
    if ( sampling_percent >= 100 || size < 1024 * 1024 ) {
        shasum.update( fs.readFileSync( file ) );
        return shasum.digest( 'hex' );
    }

    // Distance between the starts of two sampled blocks. It is computed as a
    // whole number of bytes so that every read position is a valid integer
    // offset, and it is never smaller than one block.
    const stride = Math.max( block_size, Math.floor( block_size * 100 / sampling_percent ) );
    const buffer = Buffer.alloc( block_size );
    const fd = fs.openSync( file, 'r' );
    try {
        for ( let position = 0; position < size; position += stride ) {
            const bytes_read = fs.readSync( fd, buffer, 0, block_size, position );
            if ( bytes_read === 0 ) break;
            // hash only what was actually read; the last block may be short
            shasum.update( buffer.subarray( 0, bytes_read ) );
        }
    } finally {
        // release the descriptor even when a read fails
        fs.closeSync( fd );
    }
    return shasum.digest( 'hex' );
}
// Files grouped by the digest calc_hash() returned for them: digest -> array of paths.
var by_hash = {};
// Number of files registered in by_hash so far.
var by_hash_size = 0;

/**
 * Hash a file, register it in by_hash and report the running total.
 *
 * @param {string} file - path of the file to hash and register
 * @param {function(number)} progress_callback - receives the number of files
 *   registered so far, including this one
 */
function addfile_by_hash( file, progress_callback ) {
    const digest = calc_hash( file );
    if ( by_hash[digest] === undefined ) {
        by_hash[digest] = [ file ];
    } else {
        by_hash[digest].push( file );
    }
    by_hash_size += 1;
    progress_callback( by_hash_size );
}
/**
 * Move files into `destination`, keeping the path each file had below
 * `base_path`. In dry-run mode nothing is created or moved; every planned
 * move is only logged.
 *
 * @param {string} base_path - the first occurrence of this string in each file
 *   path is replaced by `destination + '/'` to build the target path
 * @param {string} destination - directory that receives the moved files
 * @param {string[]} files - paths of the files to move
 * @param {boolean} [is_dry_run=dry_run] - when true, only log the moves
 */
function remove_files( base_path, destination, files, is_dry_run = dry_run ) {
    files.forEach( (file) => {
        // A replacer function inserts the destination literally; a replacement
        // string would expand sequences such as "$&" found in the path.
        const dest_file = file.replace( base_path, () => destination + '/' );
        if ( is_dry_run ) {
            console.log("dry_run", file, " => ", dest_file);
            return;
        }
        const dest_parent = dest_file.split('/').slice(0, -1).join('/');
        fs_extra.mkdirpSync( dest_parent );
        fs.renameSync( file, dest_file );
    });
    // report the destination this call was given, not a variable of the caller
    console.log('moved', files.length, 'files to "' + destination + '" including directory structure')
}
// ---
console.log("welcome to duper.js: find and delete duplicate (by content) files\n");
console.log("\
usage:\n\tduper.js <directory> [<substring>:<preference>]*\n\n\
with\n\
<directory> \tthe folder to scan for duplicates\n\
<substring> \tstring to use for matching filename (incl. path)\n\
<preference> \t[keep|delete|protect] the action to preferrably take if filename (incl. path) matches <substring>\n\
how it works:\n\n\
see README.md\n\
");

/**
 * Extract the trailing "<substring>:<preference>" arguments from a command line.
 *
 * Only arguments after the directory (index 3 and up) are considered, so a
 * ':' inside the node path, the script path or the directory is never
 * mistaken for a preference. Arguments are consumed from the right until the
 * first one without a ':'. The last ':' separates substring and preference,
 * which allows the substring itself to contain ':'.
 *
 * @param {string[]} argv - full argument vector in process.argv layout
 * @returns {{substring: string, preference: string}[]} preferences in command line order
 * @throws {Error} if a preference is not one of keep, delete or protect
 */
function parse_prefs( argv ) {
    const known_preferences = [ 'keep', 'delete', 'protect' ];
    const candidates = argv.slice( 3 );
    const parsed = [];
    for ( let i = candidates.length - 1; i >= 0; i -= 1 ) {
        const arg = candidates[i];
        const colon = arg.lastIndexOf( ':' );
        if ( colon < 0 ) break;
        const preference = arg.slice( colon + 1 );
        // Fail early: a mistyped "protect" must not be silently ignored by a
        // tool that goes on to select files for removal.
        if ( known_preferences.indexOf( preference ) === -1 ) {
            throw new Error( 'unknown preference "' + preference + '" in argument "' + arg + '" (expected keep, delete or protect)' );
        }
        parsed.unshift({
            substring: arg.slice( 0, colon ),
            preference: preference
        });
    }
    return parsed;
}

// parse cmdline args
var dir = process.argv[2];
if ( dir === undefined ) dir = '.';
var prefs = parse_prefs( process.argv );
// scan filesystem recursively, clustering files by size into global by_size
console.log( '\nscanning filesystem...' );
var progress_count = 0;
adddir( dir, (progress) => {
    progress_count += 1;
    // repaint the progress line only on every 50th directory entry
    if ( progress_count % 50 === 0 ) {
        var pstr = _.map( progress, (p) => {
            return "" + p.current + '/' + p.count;
        }).join( ' ' );
        process.stdout.write( 'scanning progress: ' + pstr + ' \r');
    }
});
// keep only the size clusters with more than 1 file; _.filter turns the
// by_size object into a plain array of clusters (each an array of paths).
// Declared with var so that no implicit global is created.
var dupes_by_size = _.filter( by_size, (entry) => {
    return entry.length > 1;
});
console.log("dupes_by_size", dupes_by_size);
console.log('found', dupes_by_size.length, 'clusters by size match' );
// Hash every file that shares its size with at least one other file and
// cluster the files by digest in the global by_hash.
console.log( '\ncalculating file hashes of files in equally-sized clusters... (using ' + hash_sampling_percent + '% content sampling)' );
var flat_dupes = _.flattenDeep( dupes_by_size );
for ( const candidate of flat_dupes ) {
    addfile_by_hash( candidate, (done) => {
        process.stdout.write( 'hashing progress: ' + done + ' of ' + flat_dupes.length + '\r');
    });
}
process.stdout.write('\n\n');

// Only digests shared by more than one file describe duplicates.
var dupes_by_hash = _.filter( by_hash, (files) => files.length > 1 );
console.log('found', _.keys( dupes_by_hash ).length, 'clusters by hash match' );
console.log("dupes_by_hash", dupes_by_hash);
// select all except one file per cluster to delete
var deletion_list = _.map( dupes_by_hash, ( cluster ) => {
    // map files in each cluster to actions by matching preference substring to filename
    var actions = {
        'delete': [],
        'keep': [],
        'protect': []
    };
    _.forEach( cluster, (file) => {
        // find first matching preference
        var matching_pref = _.filter( prefs, (pref) => {
            return file.indexOf( pref.substring ) >= 0;
        })[0];
        if ( !matching_pref ) return;
        // Report an unknown preference by name instead of failing on .push()
        if ( !Object.prototype.hasOwnProperty.call( actions, matching_pref.preference ) ) {
            throw new Error( 'unknown preference "' + matching_pref.preference + '" for substring "' + matching_pref.substring + '" (expected keep, delete or protect)' );
        }
        actions[matching_pref.preference].push( file );
    });

    // add more items to 'delete' action until all except one are added
    var quantity_to_add = cluster.length - actions['delete'].length - 1;
    if ( quantity_to_add > 0 ) {
        // determine array of files neither in 'delete' nor 'keep' nor 'protect' (those can be added to 'delete')
        var unmatched = _.filter( cluster, (file) => {
            return (
                actions['delete'].indexOf( file ) === -1 &&
                actions['keep'].indexOf( file ) === -1 &&
                actions['protect'].indexOf( file ) === -1
            );
        });
        // also add 'keep' files as a last resort; 'protect' files are never added
        unmatched = _.concat( unmatched, actions['keep'] );
        var additional_delete = unmatched.slice( 0, quantity_to_add );
        actions['delete'] = actions['delete'].concat( additional_delete );
        // add the remainder of the unmatched files to 'keep' (if not already in 'keep')
        actions['keep'] = actions['keep'].concat(
            _.difference( unmatched.slice( quantity_to_add ), actions['keep'] )
        );

        // since we made a decision (without using a rule) about which file to delete,
        // we try to achieve some consistency regarding this selection in the future
        // by adding preferences generated from differences between the tokenized names of
        // the selected (for deletion) and the non-selected files
        var tokens_of = (files) => {
            return _.reduce( files, (result, file) => {
                return result.concat( file.split(/[^\w]/) );
            }, []);
        };
        var keep_tokens = tokens_of( actions['keep'] );
        var protect_tokens = tokens_of( actions['protect'] );
        var selected_file_tokens = tokens_of( additional_delete );
        var delete_tokens = _.difference( selected_file_tokens, _.concat( keep_tokens, protect_tokens ) );

        // add the identifying tokens as substrings for delete preferences
        delete_tokens.forEach( (token) => {
            // Two adjacent separators (as in "file (1).jpg") produce an empty
            // token. The empty string is a substring of every path, so it
            // must never become a delete preference.
            if ( token === '' ) return;
            var already_known = prefs.some( (pref) => {
                return pref.substring === token;
            });
            if ( !already_known ) {
                prefs.push({
                    substring: token,
                    preference: 'delete'
                });
            }
        });
    }

    // return files mapped to 'delete' action (but not too many: one file of
    // the cluster always stays)
    return actions['delete'].slice( 0, cluster.length - 1 );
});
deletion_list = _.flatten( deletion_list );
console.log("final prefs: ", _.reduce( prefs, (a, p) => { return a + " " + p.substring + ':' + p.preference; }, ""));
console.log("(", deletion_list.length, "files )");

// The removed files go to a sibling of the scanned directory. The sibling is
// derived from the absolute path: for a relative directory without any '/'
// (including the default '.') the textual parent is empty, which would place
// the folder at the filesystem root.
var scanned_dir = path.resolve( dir );
// the scanned file paths start with the directory as given on the command
// line, so that form (minus trailing slashes) stays the prefix to replace
dir = dir.replace(/\/+$/, "");
var dest_dir = path.join( path.dirname( scanned_dir ), 'removed_by_duper(' + path.basename( scanned_dir ) + ')' );
remove_files( dir, dest_dir, deletion_list );
console.log("", deletion_list.length, "files", (dry_run?"would":""), "have been moved to", dest_dir);