filterscrapelinks.go
package walker

import (
	"net/url"
	"strings"

	"github.com/foomo/walker/vo"
	"github.com/temoto/robotstxt"
)
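
// linkLimitations bundles the constraints applied when deciding whether a
// scraped link should be followed.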
type linkLimitations struct {
	depth               int      // maximum path depth; 0 disables the check
	paging              bool     // when false, rel=next/prev pager links are skipped
	ignoreAllQueries    bool     // drop every link that carries a query string
	ignorePathPrefixes  []string // path prefixes that disqualify a link
	includePathPrefixes []string // at least one prefix must match, or the link is dropped
	ignoreQueriesWith   []string // query parameter names that disqualify a link
}

// NormalizeLink strips the fragment from linkURL and inherits host, scheme,
// and user info from baseURL where they are missing. Note that relative
// paths are not resolved against the base path.
func NormalizeLink(baseURL *url.URL, linkURL string) (normalizedLink *url.URL, err error) {
	// drop the fragment / anchor part
	anchorParts := strings.Split(linkURL, "#")
	linkURL = anchorParts[0]
	link, errParseLink := url.Parse(linkURL)
	if errParseLink != nil {
		err = errParseLink
		return
	}
	// inherit the host
	if link.Host == "" {
		link.Host = baseURL.Host
	}
	// inherit the scheme (protocol-relative "//host/path" links parse with an
	// empty scheme, so they are covered here too)
	if link.Scheme == "" {
		link.Scheme = baseURL.Scheme
	}
	// inherit user info
	if baseURL.User != nil {
		link.User = baseURL.User
	}
	normalizedLink = link
	return
}
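
// Example (illustrative, not part of the original file):
//
//	base, _ := url.Parse("https://example.com/docs/")
//	u, _ := NormalizeLink(base, "/guide?x=1#intro")
//	// u.String() == "https://example.com/guide?x=1"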

// filterScrapeLinks normalizes every link in linkList and counts the ones
// that survive the configured limitations: same host and scheme as baseURL,
// within the allowed depth, not a pager link (unless paging is enabled), not
// excluded by path prefix, query rules, or robots.txt, and matching at least
// one include path prefix.
func filterScrapeLinks(
	linkList vo.LinkList,
	baseURL *url.URL,
	linkNextNormalized string,
	linkPrevNormalized string,
	ll linkLimitations,
	robotsGroup *robotstxt.Group,
) (links map[string]int) {
	links = map[string]int{}
LinkLoop:
	for linkURL := range linkList {
		// normalize the link before inspecting it
		linkU, errParseLinkU := NormalizeLink(baseURL, linkURL)
		if errParseLinkU == nil {
			// skip pager (rel=next/prev) links unless paging is enabled
			if !ll.paging {
				if linkNextNormalized == linkU.String() || linkPrevNormalized == linkU.String() {
					continue LinkLoop
				}
			}
			if linkU.Host != baseURL.Host || linkU.Scheme != baseURL.Scheme {
				// ignore external links
				continue LinkLoop
			}
			if ll.depth > 0 {
				// deeper than the configured path depth?
				if len(strings.Split(linkU.Path, "/"))-1 > ll.depth {
					continue LinkLoop
				}
			}
			// skip ignored path prefixes
			for _, ignorePrefix := range ll.ignorePathPrefixes {
				if strings.HasPrefix(linkU.Path, ignorePrefix) {
					continue LinkLoop
				}
			}
			// skip links disallowed by robots.txt
			if robotsGroup != nil && !robotsGroup.Test(linkU.Path) {
				continue LinkLoop
			}
			// query based filtering
			if query := linkU.Query(); len(query) > 0 {
				if ll.ignoreAllQueries {
					// no links with queries allowed at all
					continue LinkLoop
				}
				// drop the link if it carries an ignored query parameter
				for _, ignoreP := range ll.ignoreQueriesWith {
					for pName := range query {
						if pName == ignoreP {
							continue LinkLoop
						}
					}
				}
			}
			// is the path covered by one of the include prefixes?
			// note: an empty includePathPrefixes list rejects every link
			foundPath := false
			for _, p := range ll.includePathPrefixes {
				if strings.HasPrefix(linkU.Path, p) {
					foundPath = true
					break
				}
			}
			if !foundPath {
				// not within the scrape paths
				continue LinkLoop
			}
			links[linkU.String()]++
		}
	}
	return links
}
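
// Usage sketch (not part of the original file; the names and values below
// are assumptions for illustration):
//
//	var linkList vo.LinkList // link set collected elsewhere by the scraper
//	base, _ := url.Parse("https://example.com/")
//	limits := linkLimitations{
//		includePathPrefixes: []string{"/"}, // an empty list would reject every link
//		ignoreQueriesWith:   []string{"sessionid"},
//	}
//	counts := filterScrapeLinks(linkList, base, "", "", limits, nil)
//	// counts maps each surviving normalized URL to its occurrence count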