-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract.go
263 lines (219 loc) · 6.75 KB
/
extract.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
package extract
import (
"bytes"
"encoding/json"
"fmt"
extractor "github.com/aafeher/go-microdata-extract/extractors"
"io"
"net/http"
"sync"
"time"
)
// Syntax names a metadata syntax that an Extractor can be asked to parse.
type Syntax string

// Processor pairs a syntax identifier with the function that extracts the
// metadata for that syntax.
type Processor struct {
	// Name is the syntax whose result Func produces.
	Name Syntax
	// Func runs the extraction and returns the result together with any
	// errors encountered along the way.
	Func func() (any, []error)
}

// config holds the settings of an Extractor.
type config struct {
	// syntaxes lists the syntaxes that Extract will run.
	syntaxes []Syntax
	// userAgent is sent as the User-Agent header when content is fetched.
	userAgent string
	// fetchTimeout is the HTTP client timeout, in seconds.
	fetchTimeout uint8
}

// Extractor extracts metadata from the content of a URL, or from content
// supplied by the caller, by running one processor per configured syntax.
type Extractor struct {
	// cfg is the active configuration.
	cfg config
	// url is the URL passed to the most recent Extract call.
	url string
	// content is the document text the processors operate on.
	content string
	// extracted maps each processed syntax to its extraction result.
	extracted map[Syntax]any
	// errs accumulates the errors recorded by Extract and GetExtractedJSON.
	errs []error
}
// Identifiers of the metadata syntaxes known to this package.
const (
	// SyntaxOpenGraph identifies the Open Graph metadata syntax.
	SyntaxOpenGraph = Syntax("opengraph")
	// SyntaxXCards identifies the X Cards metadata syntax.
	SyntaxXCards = Syntax("xcards")
	// SyntaxJSONLD identifies the JSON-LD metadata syntax.
	SyntaxJSONLD = Syntax("json-ld")
	// SyntaxMicrodata identifies the W3C Microdata metadata syntax.
	SyntaxMicrodata = Syntax("microdata")
)
// SYNTAXES lists every metadata syntax identifier this package supports.
// It is the default syntax set of a new Extractor and the allow-list that
// SetSyntaxes filters against.
var SYNTAXES = []Syntax{
	SyntaxOpenGraph,
	SyntaxXCards,
	SyntaxJSONLD,
	SyntaxMicrodata,
}
// New returns an Extractor that is ready to use: its result map is allocated
// (and empty) and its configuration is set to the package defaults.
func New() *Extractor {
	e := new(Extractor)
	e.extracted = map[Syntax]any{}
	e.setConfigDefaults()
	return e
}
// setConfigDefaults replaces the whole configuration with the package
// defaults: every supported syntax, the library's own User-Agent string and a
// fetch timeout of 3 seconds.
func (e *Extractor) setConfigDefaults() {
	var defaults config
	defaults.syntaxes = SYNTAXES
	defaults.userAgent = "go-microdata-extract (+https://github.com/aafeher/go-microdata-extract/blob/main/README.md)"
	defaults.fetchTimeout = 3
	e.cfg = defaults
}
// SetSyntaxes selects the syntaxes the Extractor will parse.
//
// Entries that are not listed in SYNTAXES are dropped; the order (and any
// duplicates) of the remaining entries is kept. If nothing supported remains,
// including when syntaxes is empty, the current selection is left untouched.
//
// It returns the receiver so calls can be chained.
func (e *Extractor) SetSyntaxes(syntaxes []Syntax) *Extractor {
	var supported []Syntax
	for i := range syntaxes {
		if candidate := syntaxes[i]; contains(SYNTAXES, candidate) {
			supported = append(supported, candidate)
		}
	}
	// Only overwrite the configuration when there is something valid to set.
	if len(supported) > 0 {
		e.cfg.syntaxes = supported
	}
	return e
}
// SetUserAgent stores the value sent as the User-Agent header when the
// Extractor fetches a URL. The value is stored as given, without validation.
//
// It returns the receiver so calls can be chained.
func (e *Extractor) SetUserAgent(userAgent string) *Extractor {
	settings := &e.cfg
	settings.userAgent = userAgent
	return e
}
// SetFetchTimeout stores the timeout, in whole seconds, that is applied to
// the HTTP client when the Extractor fetches a URL. The value is stored as
// given; it is later used as the http.Client Timeout.
//
// It returns the receiver so calls can be chained.
func (e *Extractor) SetFetchTimeout(fetchTimeout uint8) *Extractor {
	settings := &e.cfg
	settings.fetchTimeout = fetchTimeout
	return e
}
// Extract obtains the document for url and runs every configured syntax
// parser over it.
//
// url is recorded on the Extractor and handed to each parser. urlContent is
// optional: when it is non-nil its value is used as the document, otherwise
// the document is fetched from url.
//
// The parsers run concurrently, one goroutine per enabled syntax, and Extract
// waits for all of them. Each result is stored under its syntax and is
// available through GetExtracted. Errors reported by the parsers are appended
// to the Extractor's error list; they are not returned.
//
// The returned error is non-nil only when the document could not be obtained,
// in which case no parser is run. The receiver is always returned.
func (e *Extractor) Extract(url string, urlContent *string) (*Extractor, error) {
	e.url = url

	content, err := e.setContent(urlContent)
	e.content = content
	if err != nil {
		e.errs = append(e.errs, err)
		return e, err
	}

	// An Extractor that was not built with New has a nil result map, and
	// writing to a nil map panics. Allocate it on demand so the zero value
	// is usable as well.
	if e.extracted == nil {
		e.extracted = make(map[Syntax]any)
	}

	// Every known processor, in a fixed order; the configuration decides
	// which of them actually run.
	known := []Processor{
		{
			Name: SyntaxOpenGraph,
			Func: func() (any, []error) {
				return extractor.ParseOpenGraph(e.url, e.content)
			},
		},
		{
			Name: SyntaxXCards,
			Func: func() (any, []error) {
				return extractor.ParseXCards(e.url, e.content)
			},
		},
		{
			Name: SyntaxJSONLD,
			Func: func() (any, []error) {
				return extractor.JSONLD(e.url, e.content)
			},
		},
		{
			Name: SyntaxMicrodata,
			Func: func() (any, []error) {
				return extractor.W3CMicrodata(e.url, e.content)
			},
		},
	}

	var (
		mu sync.Mutex // guards e.errs and e.extracted while the parsers run
		wg sync.WaitGroup
	)
	for _, candidate := range known {
		if !contains(e.cfg.syntaxes, candidate.Name) {
			continue
		}
		wg.Add(1)
		// The processor is passed as an argument so each goroutine works on
		// its own copy regardless of the loop-variable semantics in use.
		go func(proc Processor) {
			defer wg.Done()
			// Parse outside the lock so the parsers really run in parallel.
			result, parseErrs := proc.Func()

			mu.Lock()
			defer mu.Unlock()
			e.errs = append(e.errs, parseErrs...)
			e.extracted[proc.Name] = result
		}(candidate)
	}
	wg.Wait()

	return e, nil
}
// setContent resolves the document text for the current extraction.
//
// When urlContent is non-nil its value is returned as is and nothing is
// fetched. Otherwise the content of e.url is fetched and returned as a
// string; a fetch failure is returned together with an empty string.
func (e *Extractor) setContent(urlContent *string) (string, error) {
	if urlContent == nil {
		fetched, err := e.fetch(e.url)
		if err != nil {
			return "", err
		}
		return string(fetched), nil
	}
	return *urlContent, nil
}
// fetch performs an HTTP GET on url and returns the response body.
//
// The request carries the configured User-Agent header and the client uses
// the configured fetch timeout (in seconds). An error is returned when the
// request cannot be built, the round trip fails, the response status is
// anything other than 200 OK, or the body cannot be read in full.
func (e *Extractor) fetch(url string) ([]byte, error) {
	client := &http.Client{
		Timeout: time.Duration(e.cfg.fetchTimeout) * time.Second,
	}

	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", e.cfg.userAgent)

	response, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	// Register the close before any early return: once Do succeeds the body
	// must be closed on every path, including the non-200 one, otherwise the
	// underlying connection is leaked.
	defer func(body io.ReadCloser) {
		// The close error is deliberately ignored: the body has either been
		// read already or is being discarded.
		_ = body.Close()
	}(response.Body)

	if response.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("received HTTP status %d", response.StatusCode)
	}

	var body bytes.Buffer
	if _, err = io.Copy(&body, response.Body); err != nil {
		return nil, err
	}
	return body.Bytes(), nil
}
// GetExtracted returns the extraction results keyed by syntax. The returned
// map is the Extractor's own map, not a copy.
func (e *Extractor) GetExtracted() map[Syntax]any {
	results := e.extracted
	return results
}
// GetExtractedJSON returns the extraction results encoded as indented JSON.
// If encoding fails, the error is appended to the Extractor's error list and
// the result of the failed encoding (no data) is returned.
func (e *Extractor) GetExtractedJSON() json.RawMessage {
	encoded, err := json.MarshalIndent(e.extracted, "", " ")
	if err == nil {
		return encoded
	}
	e.errs = append(e.errs, err)
	return encoded
}
// index reports the position of the first element of s that equals v.
// It returns -1 when no element matches, which includes a nil or empty s.
func index[S ~[]E, E comparable](s S, v E) int {
	for pos, elem := range s {
		if elem == v {
			return pos
		}
	}
	return -1
}
// contains reports whether at least one element of s equals v.
func contains[S ~[]E, E comparable](s S, v E) bool {
	if index(s, v) < 0 {
		return false
	}
	return true
}