-
Notifications
You must be signed in to change notification settings - Fork 11
/
ceis-segment.go
71 lines (57 loc) · 1.33 KB
/
ceis-segment.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
package main
import (
"flag"
"fmt"
"strings"
"github.com/PuerkitoBio/goquery"
)
// ExtractThings fetches the HTML page at url and returns a map from the href
// of every element matching selector to a whitespace-normalized title.
//
// The title of a match is the text of the largest enclosing subtree that
// contains no other match of selector: starting at the matched element, the
// walk moves up for as long as the parent still holds at most one match.
//
// Elements without an href attribute are skipped, and when several matches
// share the same href the last one visited wins. If the page cannot be loaded,
// the error is printed to standard output and an empty map is returned.
func ExtractThings(url string, selector string) map[string]string {
	ret := make(map[string]string)

	// NOTE(review): goquery.NewDocument is marked deprecated upstream in favour
	// of net/http plus NewDocumentFromReader; it is kept here so the file's
	// imports do not have to change.
	doc, err := goquery.NewDocument(url)
	if err != nil {
		fmt.Printf("ERROR: %s\n", err.Error())
		return ret
	}

	// Drop <script> elements so inline JavaScript never ends up in a title.
	doc.Find("script").Remove()

	doc.Find(selector).Each(func(_ int, s *goquery.Selection) {
		link, ok := s.Attr("href")
		if !ok {
			return
		}

		// Climb while the parent still contains at most one match. The
		// q.Length() guard stops the walk at the top of the tree; without it a
		// page with a single match would loop forever, because Find on an
		// empty selection always reports zero matches.
		p := s
		for q := p.Parent(); q.Length() > 0 && q.Find(selector).Length() <= 1; q = q.Parent() {
			p = q
		}

		// Gather the text of each child node separately so adjacent nodes are
		// separated by a space, and collapse every run of whitespace
		// (including \r and other Unicode spaces) into a single space.
		var words []string
		p.Contents().Each(func(_ int, c *goquery.Selection) {
			words = append(words, strings.Fields(c.Text())...)
		})
		ret[link] = strings.Join(words, " ")
	})

	return ret
}
// main is the command-line entry point. It reads a page URL and a selector
// from the first two positional arguments, runs ExtractThings on them and
// prints every link followed by its title, with a blank line after each pair.
// The pairs come out of a map, so their order is not stable between runs.
//
// Example:
//
//	go run ceis-segment.go 'https://duckduckgo.com/html/?q=nihao' 'a'
func main() {
	flag.Parse()

	// Both arguments are required; without them there is nothing to fetch or
	// match, so explain the expected invocation instead of running silently.
	if flag.NArg() < 2 {
		fmt.Println("usage: go run ceis-segment.go <url> <selector>")
		return
	}

	for link, title := range ExtractThings(flag.Arg(0), flag.Arg(1)) {
		fmt.Printf("%v\n%v\n\n", link, title)
	}
}