scrapelite_test.go

package scrapelite

import (
	"bytes"
	"io"
	"net/http"
	"net/url"
	"os"
	"strings"
	"testing"

	"github.com/PuerkitoBio/goquery"
	"github.com/stretchr/testify/assert"
)
func TestNewScraper(t *testing.T) {
	tests := []struct {
		name string
		want *Scraper
	}{
		{
			name: "Test scraper default values are fine",
			want: &Scraper{
				CapturedHrefLinkFilter:  nil,
				HrefLinks:               make(chan string),
				CapturedDomainDocuments: make(chan *goquery.Document),
			},
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// Channels compare by identity, so tt.want cannot be compared
			// against got as a whole; assert on the individual fields instead.
			got := New()
			assert.Nil(t, got.CapturedHrefLinkFilter)
			assert.NotNil(t, got.CapturedDomainDocuments)
			assert.NotNil(t, got.HrefLinks)
		})
	}
}
func TestScraper_InitWithValues(t *testing.T) {
	var allowedDomain allowedDomainCallBack = func(url string) bool {
		return true
	}

	// Construct a scraper with explicit values; New() leaves both filters
	// nil (see TestNewScraper above), so assert against this scraper.
	s := &Scraper{CapturedHrefLinkFilter: allowedDomain, HrefLinks: nil}
	s.CaptureDomainFilter = s.CapturedHrefLinkFilter

	assert.NotNil(t, s.CapturedHrefLinkFilter)
	assert.NotNil(t, s.CaptureDomainFilter)
}
var TestBaseUrl = "https://example.com"

// TestHtml is the fixture page served by MockHttpClient: four anchors
// carrying hrefs (absolute same-domain, root-relative, external, and
// parent-relative) plus one anchor without an href.
var TestHtml = `
<!DOCTYPE html>
<html>
<body>
	<a href="https://example.com">Home</a>
	<a href="/relative/path">Relative Link</a>
	<a href="https://google.com">Google</a>
	<div class="nested">
		<a href="../parent/path">Parent Path</a>
		<a>No Href</a>
	</div>
</body>
</html>
`
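
// TestHtmlFixtureShape is a self-check on the fixture itself: it parses
// TestHtml directly with goquery and pins down the anchor counts that the
// scraper tests below rely on. It exercises goquery only, not the scraper.
func TestHtmlFixtureShape(t *testing.T) {
	d, err := goquery.NewDocumentFromReader(strings.NewReader(TestHtml))
	assert.NoError(t, err)
	assert.Equal(t, 5, d.Find("a").Length())       // five anchors in total
	assert.Equal(t, 4, d.Find("a[href]").Length()) // four of them carry an href
}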
// MockHttpClient is a stand-in for *http.Client that returns the local
// TestHtml fixture instead of fetching HTML from an external source.
type MockHttpClient struct{}

// Get ignores the requested URL and always answers with a 200 response
// whose body is TestHtml.
func (m *MockHttpClient) Get(url string) (resp *http.Response, err error) {
	res := &http.Response{
		StatusCode: 200,
		Body:       io.NopCloser(bytes.NewBufferString(TestHtml)),
	}
	return res, nil
}
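
// TestMockHttpClientGet is a sanity check on the mock alone: whatever URL
// it is asked for, it should answer 200 with TestHtml as the body, since
// the scraper tests below depend on exactly that behaviour.
func TestMockHttpClientGet(t *testing.T) {
	m := &MockHttpClient{}
	res, err := m.Get("https://example.com/anything")
	assert.NoError(t, err)
	assert.Equal(t, 200, res.StatusCode)

	body, err := io.ReadAll(res.Body)
	assert.NoError(t, err)
	assert.Equal(t, TestHtml, string(body))
}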
func TestScraperGetHrefs(t *testing.T) {
	// Silence the scraper's logging for the duration of the test.
	os.Stdout, _ = os.OpenFile(os.DevNull, os.O_WRONLY, 0)

	s := New()
	s.HttpClient = &MockHttpClient{}
	s.Go(TestBaseUrl)

	// Wait for the first link only; crawling itself is not under test here.
	l := <-s.HrefLinks

	// The channel does not guarantee ordering, so any of the hrefs from
	// TestHtml (with relative paths resolved against TestBaseUrl) is an
	// acceptable first link. What matters is that the link exists; all of
	// them will be crawled eventually either way.
	candidates := []string{
		"https://example.com",
		"https://google.com",
		"https://example.com/relative/path",
		"https://example.com/parent/path",
	}
	var found bool
	for _, c := range candidates {
		if c == l {
			found = true
			break
		}
	}
	assert.True(t, found, "expected to find a matching link")
}
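
// TestRelativeHrefResolution documents how the relative hrefs in TestHtml
// map onto the absolute candidates asserted above. It applies
// url.ResolveReference from the standard library directly, mirroring (but
// not invoking) the resolution the scraper is expected to perform.
func TestRelativeHrefResolution(t *testing.T) {
	base, err := url.Parse(TestBaseUrl)
	assert.NoError(t, err)

	cases := map[string]string{
		"/relative/path": "https://example.com/relative/path",
		"../parent/path": "https://example.com/parent/path",
	}
	for href, want := range cases {
		ref, err := url.Parse(href)
		assert.NoError(t, err)
		assert.Equal(t, want, base.ResolveReference(ref).String())
	}
}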
func TestScraperGetDocuments(t *testing.T) {
	// Silence the scraper's logging for the duration of the test.
	os.Stdout, _ = os.OpenFile(os.DevNull, os.O_WRONLY, 0)

	s := New()
	s.HttpClient = &MockHttpClient{}
	s.RequestsPerSecond = 200
	s.RateLimit = true
	s.Go(TestBaseUrl)

	// Wait for the first document only; crawling itself is not under test here.
	l := <-s.CapturedDomainDocuments

	d, err := goquery.NewDocumentFromReader(strings.NewReader(TestHtml))
	assert.NoError(t, err)
	assert.Equal(t, d, l)
}
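
// TestScraperHrefLinkFilterSketch is a sketch of a filter test, assuming
// CapturedHrefLinkFilter is consulted before a link is emitted on
// HrefLinks. That behaviour is inferred from the field name and the
// InitWithValues test above, not confirmed against the package, so treat
// this as illustrative rather than normative.
func TestScraperHrefLinkFilterSketch(t *testing.T) {
	os.Stdout, _ = os.OpenFile(os.DevNull, os.O_WRONLY, 0)

	s := New()
	s.HttpClient = &MockHttpClient{}
	// Only accept links on the test domain; under the assumed semantics,
	// https://google.com from TestHtml should never reach the channel.
	s.CapturedHrefLinkFilter = func(url string) bool {
		return strings.HasPrefix(url, "https://example.com")
	}
	s.Go(TestBaseUrl)

	l := <-s.HrefLinks
	assert.True(t, strings.HasPrefix(l, "https://example.com"))
}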