-
Notifications
You must be signed in to change notification settings - Fork 1
/
filehelper.go
212 lines (176 loc) · 4.5 KB
/
filehelper.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
package main
import (
"bytes"
"crypto/sha1"
"encoding/hex"
"errors"
"io"
"io/ioutil"
"log"
"net/http"
"os"
"path/filepath"
"regexp"
"strings"
"time"
"github.com/k3a/html2text"
"github.com/ledongthuc/pdf"
)
// RetrivesFilesFromUserPath returns the files available at the given path.
//
// If path is a regular file the result contains just that path. If path is a
// directory and recursive is false, every non-directory entry directly inside
// it is returned; includeFileExtensions is not consulted in this mode. If
// recursive is true the tree is walked and only non-empty files that have an
// extension are returned, further restricted to includeFileExtensions when
// that list is non-empty (entries are compared against filepath.Ext, so they
// are expected to include the leading dot).
//
// An error is returned when path cannot be stat'ed or the directory cannot be
// listed. Errors met while walking are logged and the offending entries are
// skipped.
func RetrivesFilesFromUserPath(path string, includeFileExtensions []string, recursive bool) ([]string, error) {
	info, err := os.Stat(path)
	if err != nil {
		if os.IsNotExist(err) {
			return []string{}, errors.New("Input file not found")
		}
		// Any other failure (permissions, malformed path, ...) leaves info
		// nil; report it instead of dereferencing a nil FileInfo below.
		return []string{}, err
	}

	if !info.IsDir() {
		return []string{path}, nil
	}

	var p []string

	if !recursive {
		files, err := ioutil.ReadDir(path)
		if err != nil {
			return []string{}, err
		}
		for _, f := range files {
			if !f.IsDir() {
				p = append(p, filepath.Join(path, f.Name()))
			}
		}
		return p, nil
	}

	err = filepath.Walk(path, func(walk string, fi os.FileInfo, err error) error {
		if err != nil {
			// fi may be nil here; log and keep walking the rest of the tree.
			log.Println(err)
			return nil
		}
		if fi.IsDir() || fi.Size() <= 0 {
			return nil
		}
		ext := filepath.Ext(walk)
		if len(ext) == 0 {
			return nil
		}
		if len(includeFileExtensions) == 0 || StringInSlice(ext, includeFileExtensions) {
			p = append(p, walk)
		}
		return nil
	})
	if err != nil {
		log.Println(err)
	}
	return p, nil
}
// ExtractHashFromFile return an array of hash parsed from a given file
func ExtractHashFromFile(path string) ([]string, error) {
var fileContent string
var buffer []byte
var err error
buffer, err = ioutil.ReadFile(path)
if err != nil {
log.Fatal(err)
}
contentType := http.DetectContentType(buffer)
switch strings.Split(contentType, ";")[0] {
case "application/pdf":
fileContent, err = ReadPlainTextFromPDF(path)
if err != nil {
return []string{}, err
}
default:
fileContent = string(buffer)
}
return extractHashFromString(fileContent), nil
}
// ExtractHashFromURL issues a GET request for url, converts the returned HTML
// body to plain text and returns the hash-like strings found in it.
//
// The request is sent with the USERAGENT user agent, caching disabled and a
// client timeout of HTTPTIMEOUT seconds. Any failure while building the
// request, performing it or reading the body is returned together with an
// empty slice.
func ExtractHashFromURL(url string) ([]string, error) {
	request, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return []string{}, err
	}
	request.Header.Set("User-Agent", USERAGENT)
	request.Header.Set("Cache-Control", "no-cache")

	httpClient := &http.Client{Timeout: time.Second * HTTPTIMEOUT}
	response, err := httpClient.Do(request)
	if err != nil {
		return []string{}, err
	}
	defer response.Body.Close()

	raw, err := ioutil.ReadAll(response.Body)
	if err != nil {
		return []string{}, err
	}

	// Strip the markup first so hashes are matched against visible text only.
	text := html2text.HTML2Text(string(raw))
	return extractHashFromString(text), nil
}
// hashCandidatePattern matches runs of 32 to 64 hexadecimal characters,
// case-insensitively. It is compiled once at package initialisation so that
// callers do not pay for the compilation on every call.
var hashCandidatePattern = regexp.MustCompile(`(?i)([0-9a-f]{32,64})`)

// extractHashFromString returns, in order of appearance, every run of 32 to 64
// hexadecimal characters found in content. Runs longer than 64 characters are
// split into successive matches. It returns nil when nothing matches.
func extractHashFromString(content string) []string {
	return hashCandidatePattern.FindAllString(content, -1)
}
// ReadPlainTextFromPDF extracts the plain text of the PDF file at pdfpath.
//
// It returns the extracted text, or a non-nil error when the file cannot be
// opened, its text cannot be extracted, or the extracted stream cannot be read
// in full.
func ReadPlainTextFromPDF(pdfpath string) (text string, err error) {
	f, r, err := pdf.Open(pdfpath)
	// Schedule the close before looking at err so that a handle handed back
	// together with an error is still released.
	// NOTE(review): confirm whether pdf.Open can return both a file and an error.
	if f != nil {
		defer f.Close()
	}
	if err != nil {
		return "", err
	}

	plain, err := r.GetPlainText()
	if err != nil {
		return "", err
	}

	var buf bytes.Buffer
	// A failed read must not be reported as a successful, truncated extraction.
	if _, err = buf.ReadFrom(plain); err != nil {
		return "", err
	}
	return buf.String(), nil
}
// UniqueSliceMembers returns the entries of in converted to lower case with
// case-insensitive duplicates removed. The first occurrence of each value
// determines its position in the result. It returns nil when in is empty.
func UniqueSliceMembers(in []string) []string {
	// Track already-emitted values in a set so each lookup is constant time
	// instead of rescanning the output slice for every element.
	seen := make(map[string]struct{}, len(in))

	var unique []string
	for _, o := range in {
		lower := strings.ToLower(o)
		if _, dup := seen[lower]; dup {
			continue
		}
		seen[lower] = struct{}{}
		unique = append(unique, lower)
	}
	return unique
}
// StringInSlice reports whether a is present in list. The comparison is exact
// (case-sensitive); a nil or empty list never contains anything.
func StringInSlice(a string, list []string) bool {
	for i := 0; i < len(list); i++ {
		if list[i] == a {
			return true
		}
	}
	return false
}
// HashContentFromFile opens filePath and returns the hex-encoded SHA-1 digest
// of its content as computed by sha1FromStream. It returns an empty string and
// the error when the file cannot be opened.
func HashContentFromFile(filePath string) (string, error) {
	f, openErr := os.Open(filePath)
	if openErr != nil {
		return "", openErr
	}
	// The handle is only needed while the stream is being hashed.
	defer f.Close()

	digest, err := sha1FromStream(f)
	return digest, err
}
// HashContentFromURL issues a GET request for url and returns the hex-encoded
// SHA-1 digest of the response body as computed by sha1FromStream.
//
// The request is sent with the USERAGENT user agent, caching disabled and a
// client timeout of HTTPTIMEOUT seconds. An empty string and the error are
// returned when the request cannot be built or performed.
func HashContentFromURL(url string) (string, error) {
	request, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return "", err
	}
	request.Header.Set("User-Agent", USERAGENT)
	request.Header.Set("Cache-Control", "no-cache")

	httpClient := &http.Client{Timeout: time.Second * HTTPTIMEOUT}
	response, err := httpClient.Do(request)
	if err != nil {
		return "", err
	}
	defer response.Body.Close()

	// Hash the body as it streams in rather than buffering it first.
	digest, err := sha1FromStream(response.Body)
	return digest, err
}
// sha1FromStream reads stream until EOF and returns the SHA-1 digest of the
// bytes read as a lowercase hexadecimal string. If reading fails, an empty
// string and the read error are returned.
func sha1FromStream(stream io.Reader) (string, error) {
	hasher := sha1.New()

	_, copyErr := io.Copy(hasher, stream)
	if copyErr != nil {
		return "", copyErr
	}

	// Sum(nil) already yields exactly the 20-byte SHA-1 digest.
	return hex.EncodeToString(hasher.Sum(nil)), nil
}