// ex5.13 saves a local mirror of a website.
//
// Based on gopl.io/ch5/findlinks3
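//
// Usage (a typical invocation; the mirror is written to a directory named
// after the start host, e.g. ./example.com/):
//
//	go run main.go https://example.com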
package main

import (
	"fmt"
	"io"
	"log"
	"net/http"
	"net/url"
	"os"
	"path/filepath"

	"gopl.io/ch5/links"
)

// breadthFirst calls f for each item in the worklist.
// Any items returned by f are added to the worklist.
// f is called at most once for each item.
func breadthFirst(f func(item string) []string, worklist []string) {
	seen := make(map[string]bool)
	for len(worklist) > 0 {
		items := worklist
		worklist = nil
		for _, item := range items {
			if !seen[item] {
				seen[item] = true
				worklist = append(worklist, f(item)...)
			}
		}
	}
}

// origHost records the host of the first page saved; pages on other hosts
// are not mirrored.
var origHost string

// save writes the document at rawurl to a file under a directory named after
// the original host. URLs on other hosts are ignored.
func save(rawurl string) error {
	url, err := url.Parse(rawurl)
	if err != nil {
		return fmt.Errorf("bad url: %s", err)
	}
	if origHost == "" {
		origHost = url.Host
	}
	if origHost != url.Host {
		return nil
	}
	dir := url.Host
	var filename string
	if filepath.Ext(url.Path) == "" {
		// No extension: treat the path as a directory and save index.html inside it.
		dir = filepath.Join(dir, url.Path)
		filename = filepath.Join(dir, "index.html")
	} else {
		// The path names a file: mirror its parent directory and keep the base name.
		dir = filepath.Join(dir, filepath.Dir(url.Path))
		filename = filepath.Join(dir, filepath.Base(url.Path))
	}
	err = os.MkdirAll(dir, 0777)
	if err != nil {
		return err
	}
	resp, err := http.Get(rawurl)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	file, err := os.Create(filename)
	if err != nil {
		return err
	}
	_, err = io.Copy(file, resp.Body)
	if err != nil {
		file.Close()
		return err
	}
	// Check for delayed write errors, as mentioned at the end of section 5.8.
	return file.Close()
}

// crawl prints url, saves a local copy of the page, and returns the links
// found on it.
func crawl(url string) []string {
	fmt.Println(url)
	err := save(url)
	if err != nil {
		log.Printf(`can't cache "%s": %s`, url, err)
	}
	list, err := links.Extract(url)
	if err != nil {
		log.Printf(`can't extract links from "%s": %s`, url, err)
	}
	return list
}

func main() {
	// Crawl the web breadth-first,
	// starting from the command-line arguments.
	breadthFirst(crawl, os.Args[1:])
}