package main

import (
	"encoding/json"
	"flag"
	"io/ioutil"
	"path/filepath"
	"strings"
	"sync"

	"crawler.club/crawler/rss"
	"crawler.club/et"
	"github.com/crawlerclub/ce"
)

var (
	conf = flag.String("conf", "./conf", "dir for parsers conf")
)
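
// As a sketch of the expected on-disk layout (the file names here are
// hypothetical; only the "parsers" subdirectory and the ".json" suffix are
// fixed by GetParser below):
//
//	./conf/parsers/news.json
//	./conf/parsers/forum.json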
// Parsers is a mutex-guarded cache of parser configurations keyed by name.
type Parsers struct {
	sync.Mutex
	items map[string]*et.Parser
}

// GetParser returns the parser named name. On a cache miss, or when refresh
// is true, it (re)loads the definition from <conf>/parsers/<name>.json.
func (p *Parsers) GetParser(name string, refresh bool) (*et.Parser, error) {
	p.Lock()
	defer p.Unlock()
	if !refresh && p.items[name] != nil {
		return p.items[name], nil
	}
	file := filepath.Join(*conf, "parsers", name+".json")
	content, err := ioutil.ReadFile(file)
	if err != nil {
		return nil, err
	}
	parser := new(et.Parser)
	if err := json.Unmarshal(content, parser); err != nil {
		return nil, err
	}
	p.items[name] = parser
	return parser, nil
}

// pool is the process-wide parser cache used by Parse.
var pool = &Parsers{items: make(map[string]*et.Parser)}
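
// A minimal usage sketch (the parser name "news" is hypothetical):
//
//	p, err := pool.GetParser("news", false) // reads ./conf/parsers/news.json once, then hits the cache
//	p, err = pool.GetParser("news", true)   // refresh=true forces a re-read from disk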
// Parse dispatches task to the parser named by task.ParserName. The built-in
// names "rss_", "content_" and "link_" are matched case-insensitively; any
// other name is resolved through the config-file pool. It returns follow-up
// crawl tasks and/or extracted items.
func Parse(task *et.UrlTask, page, ip string) (
	[]*et.UrlTask, []map[string]interface{}, error) {
	name := task.ParserName
	url := task.Url
	switch strings.ToLower(name) {
	case "rss_":
		// Treat the page as an RSS feed; every entry becomes an item.
		feeds, err := rss.Parse(url, page, task.Ext)
		return nil, feeds, err
	case "content_":
		// Extract the main article content from the page.
		doc := ce.ParsePro(url, page, ip, false)
		return nil, []map[string]interface{}{{"doc": doc, "ext": task.Ext}}, nil
	case "link_":
		// Harvest new links and queue each one as a content_ task.
		links, err := et.ParseNewLinks(page, url)
		if err != nil {
			return nil, nil, err
		}
		var tasks []*et.UrlTask
		for _, link := range links {
			tasks = append(tasks, &et.UrlTask{ParserName: "content_", Url: link, Ext: task.Ext})
		}
		return tasks, nil, nil
	default:
		// Custom parser: load its JSON definition (cached) and delegate.
		p, err := pool.GetParser(name, false)
		if err != nil {
			return nil, nil, err
		}
		return p.Parse(page, url)
	}
}
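
// A minimal calling sketch (the URL, page body, and client IP are
// hypothetical; the UrlTask fields are the ones used above):
//
//	task := &et.UrlTask{ParserName: "link_", Url: "https://example.com/"}
//	tasks, items, err := Parse(task, pageHTML, clientIP)
//	if err == nil {
//		// tasks: follow-up content_ tasks, one per discovered link
//		// items: nil for link_; non-nil for rss_, content_ and custom parsers
//	}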