-
Notifications
You must be signed in to change notification settings - Fork 2
/
dups.go
173 lines (164 loc) · 3.95 KB
/
dups.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
// dups.go - a simple command line tool to find duplicate files in a
// directory tree.
//
package main
import (
"bufio"
"crypto/sha1"
"encoding/hex"
"flag"
"fmt"
"io"
"os"
"path/filepath"
"runtime"
"sort"
"sync"
"syscall"
)
// Command-line flags. Each variable is a pointer filled in by flag.Parse,
// which main calls before reading any of them.
var (
	// root is the directory whose tree is walked looking for duplicates.
	root = flag.String("root", "test", "root dir for dup check")
	// delete turns the dry run into a real deletion of the reported dups.
	delete = flag.Bool("delete", false, "do delete the longest dups")
	// emptydir additionally attempts to remove the parent directory of every
	// deleted file.
	emptydir = flag.Bool("emptydir", false, "do delete empty directories as well")
	// ncpu is handed to runtime.GOMAXPROCS and sets the number of hashing
	// goroutines; it defaults to the number of CPUs of the machine.
	ncpu = flag.Int("ncpu", runtime.NumCPU(), "number of cpu's to use")
)
// DEBUG is the compile-time switch for the diagnostic output written by debug.
const DEBUG = true

// debug formats its arguments like fmt.Printf and writes the result to
// standard output. It prints nothing when DEBUG is false.
func debug(format string, a ...interface{}) {
	if !DEBUG {
		return
	}
	fmt.Printf(format, a...)
}
// StringLenSorter implements sort.Interface for a slice of strings, ordering
// them by byte length with the longest string first.
type StringLenSorter []string

// Len returns the number of strings in the slice.
func (p StringLenSorter) Len() int { return len(p) }

// Less reports whether p[i] is strictly longer than p[j]. The comparison has
// to be strict: with ">=" an element would sort before itself and two strings
// of equal length would each sort before the other, which sort.Sort does not
// define a result for.
func (p StringLenSorter) Less(i, j int) bool { return len(p[i]) > len(p[j]) }

// Swap exchanges the strings at positions i and j.
func (p StringLenSorter) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
// HashResult carries one file through the hashing pipeline in main: it is
// sent to a worker with only FileName set and comes back with either Err or
// Hash filled in.
type HashResult struct {
// FileName is the path handed to os.Open.
FileName string
// Err is set when the file could not be opened; Hash is left unset then.
Err error
// Hash is the SHA-1 digest of the file content as returned by hash.Sum(nil).
Hash []byte
}
// main walks the directory tree below -root, groups the regular files by
// size, hashes every group that holds at least two files and prints each set
// of files with identical SHA-1 digest. All members of such a set except the
// one sorted last by StringLenSorter are reported as dups; they are removed
// only when -delete is given, and with -emptydir their parent directories are
// removed as well if that leaves them empty.
//
// A failing walk or a failing removal is fatal and panics. Files that cannot
// be opened or read are reported on standard error and left out.
func main() {
	flag.Parse()
	debug("ncpu %v\n", *ncpu)
	runtime.GOMAXPROCS(*ncpu)

	// Files of different size cannot be duplicates, so size is the cheap
	// first-level key that avoids hashing most of the tree.
	tree := make(map[int64][]string)
	err := filepath.Walk(*root, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			debug("walkFn path %v, err %v\n", path, err)
			return err
		}
		// skip over non-regular files
		if !info.Mode().IsRegular() {
			return nil
		}
		debug("walkFn path %v\n", path)
		sz := info.Size()
		tree[sz] = append(tree[sz], path)
		return nil
	})
	if err != nil {
		panic(err.Error())
	}
	debug("tree %#v\n", tree)

	for sz, sameSize := range tree {
		if len(sameSize) < 2 {
			continue
		}
		debug("File with size %v:\n", sz)
		// We have a list of files with the same size, possibly candidates
		// with equal content.
		for sum, dups := range hashGroup(sameSize, *ncpu) {
			if len(dups) < 2 {
				continue
			}
			sort.Sort(StringLenSorter(dups))
			fmt.Printf("files with hash %v:\n%v\n", hex.EncodeToString(sum[:]), dups)
			// The last entry after sorting is the copy that is kept.
			removeDups(dups[:len(dups)-1])
		}
	}
}

// hashFile returns the SHA-1 digest of the content of the file at path, or
// the error that prevented opening or reading it.
func hashFile(path string) ([]byte, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	// Deferred so the descriptor is released on the read-error path too.
	defer f.Close()

	hash := sha1.New()
	if _, err := io.Copy(hash, bufio.NewReader(f)); err != nil {
		return nil, err
	}
	return hash.Sum(nil), nil
}

// hashGroup hashes the files in paths with the given number of concurrent
// workers and returns the paths grouped by SHA-1 digest. Files that fail to
// hash are reported on standard error and are absent from the result.
func hashGroup(paths []string, workers int) map[[sha1.Size]byte][]string {
	// With no worker nobody would ever receive from req and the feeder would
	// block forever; a negative count makes WaitGroup.Add panic.
	if workers < 1 {
		workers = 1
	}

	req := make(chan *HashResult)
	done := make(chan *HashResult)

	var wg sync.WaitGroup
	wg.Add(workers)
	for i := 0; i < workers; i++ {
		go func() {
			defer wg.Done()
			for r := range req {
				debug("req path %v\n", r.FileName)
				r.Hash, r.Err = hashFile(r.FileName)
				if r.Err == nil {
					debug("done %#v\n", r)
				}
				done <- r
			}
		}()
	}

	// Feed the workers from a separate goroutine so that this goroutine can
	// collect results; done is closed once every worker has returned, which
	// ends the collecting loop below.
	go func() {
		for _, path := range paths {
			req <- &HashResult{FileName: path}
		}
		close(req)
		wg.Wait()
		close(done)
	}()

	// Only this goroutine touches hashtree, so the map needs no locking.
	hashtree := make(map[[sha1.Size]byte][]string)
	for res := range done {
		if res.Err != nil {
			// continue if file can not be opened or read
			fmt.Fprintf(os.Stderr, "%v: %v\n", res.FileName, res.Err)
			continue
		}
		var sum [sha1.Size]byte
		copy(sum[:], res.Hash)
		debug("path %v, hash %v\n", res.FileName, hex.EncodeToString(sum[:]))
		hashtree[sum] = append(hashtree[sum], res.FileName)
	}
	return hashtree
}

// removeDups announces every file in files as a dup. With -delete it removes
// the file, and with -emptydir it then tries to remove the file's parent
// directory, treating "directory not empty" as a normal outcome. Any other
// removal failure panics.
func removeDups(files []string) {
	for _, file := range files {
		fmt.Printf("Deleting dup %v\n", file)
		if !*delete {
			continue
		}
		debug("really del %v\n", file)
		if err := os.Remove(file); err != nil {
			panic(err.Error())
		}
		if !*emptydir {
			continue
		}

		parent := filepath.Dir(file)
		debug("attempt del dir %v\n", parent)
		err := os.Remove(parent)
		if err == nil {
			continue
		}
		// POSIX lets rmdir report a non-empty directory as either ENOTEMPTY
		// or EEXIST, so both mean "still in use, leave it alone".
		if e, ok := err.(*os.PathError); ok && (e.Err == syscall.ENOTEMPTY || e.Err == syscall.EEXIST) {
			debug("%v is not empty\n", parent)
			continue
		}
		panic(err.Error())
	}
}