-
Notifications
You must be signed in to change notification settings - Fork 4
/
processors.go
103 lines (86 loc) · 2.6 KB
/
processors.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
package scrape
import "net/url"
// processor consumes a minionDump on behalf of a gru. The returned proceed
// flag tells the processing pipeline whether subsequent processors should
// run for this dump (false short-circuits the chain).
type processor interface {
	process(g *gru, md *minionDump) (proceed bool)
}
// processorFunc adapts an ordinary function to the processor interface,
// in the style of http.HandlerFunc.
type processorFunc func(g *gru, md *minionDump) (proceed bool)

// process satisfies processor by calling the function itself.
func (pf processorFunc) process(g *gru, md *minionDump) (proceed bool) {
	return pf(g, md)
}
// uniqueURLProcessor records the source url as crawled and removes from the
// minion dump every url that has already been crawled. It also deduplicates
// urls within the dump itself, so a link appearing twice on one page is only
// scheduled once (the original forwarded both copies because new urls were
// not tracked while iterating).
func uniqueURLProcessor() processor {
	return processorFunc(func(g *gru, md *minionDump) (proceed bool) {
		// Count the source page itself as scraped.
		g.scrappedUnique[md.sourceURL.String()]++
		var unique []*url.URL
		// Urls kept from this dump so far; guards against intra-dump
		// duplicates without touching scrappedUnique (whose counts are
		// incremented only when a url is actually crawled or re-seen).
		seen := make(map[string]struct{}, len(md.urls))
		for _, u := range md.urls {
			s := u.String()
			if _, ok := g.scrappedUnique[s]; ok {
				// Already crawled earlier; just bump its hit count.
				g.scrappedUnique[s]++
				continue
			}
			if _, ok := seen[s]; ok {
				continue
			}
			seen[s] = struct{}{}
			unique = append(unique, u)
		}
		md.urls = unique
		return true
	})
}
// errorCheckProcessor halts processing of a dump whose scrape failed,
// recording the failure against its source url. Dumps without an error
// pass straight through.
func errorCheckProcessor() processor {
	return processorFunc(func(g *gru, md *minionDump) (proceed bool) {
		if md.err != nil {
			g.errorURLs[md.sourceURL.String()] = md.err
			return false
		}
		return true
	})
}
// skippedURLProcessor files the dump's invalid (unknown-scheme) urls under
// the source url in the skipped map. When there is nothing to skip the map
// is left untouched — the original unconditionally assigned, creating an
// empty entry for every crawled page.
func skippedURLProcessor() processor {
	return processorFunc(func(g *gru, md *minionDump) (proceed bool) {
		if len(md.invalidURLs) > 0 {
			src := md.sourceURL.String()
			g.skippedURLs[src] = append(g.skippedURLs[src], md.invalidURLs...)
		}
		return true
	})
}
// maxDepthCheckProcessor stops the crawl for dumps that have reached the
// configured maximum depth (maxDepth == -1 disables the limit). The urls
// that would have been crawled next are recorded under their depth, and
// those within the allowed domain are counted as unique.
func maxDepthCheckProcessor() processor {
	return processorFunc(func(g *gru, md *minionDump) (proceed bool) {
		if g.maxDepth == -1 || md.depth < g.maxDepth {
			return true
		}
		if len(md.urls) < 1 {
			return false
		}
		// Depth limit reached: record the urls instead of scheduling them.
		g.scrapped[md.depth] = append(g.scrapped[md.depth], md.urls...)
		for _, u := range md.urls {
			// Tolerate a nil regex (no domain restriction) the same way
			// domainFilterProcessor does; the original dereferenced
			// g.domainRegex unconditionally and would panic when nil.
			if g.domainRegex == nil || g.domainRegex.MatchString(u.Hostname()) {
				g.scrappedUnique[u.String()]++
			}
		}
		return false
	})
}
// domainFilterProcessor drops urls whose hostname does not match the
// configured domain regex and records the rejected ones as skipped under
// the source url. A nil regex means no filtering. Unlike the original,
// skippedURLs only gains an entry when something was actually rejected,
// avoiding empty entries for every dump.
func domainFilterProcessor() processor {
	return processorFunc(func(g *gru, md *minionDump) (proceed bool) {
		if g.domainRegex == nil {
			return true
		}
		matched := make([]*url.URL, 0, len(md.urls))
		var rejected []string
		for _, u := range md.urls {
			if g.domainRegex.MatchString(u.Hostname()) {
				matched = append(matched, u)
			} else {
				rejected = append(rejected, u.String())
			}
		}
		md.urls = matched
		if len(rejected) > 0 {
			src := md.sourceURL.String()
			g.skippedURLs[src] = append(g.skippedURLs[src], rejected...)
		}
		return true
	})
}