// ExtractUrlsPlugin.ts
// Forked from get-set-fetch/scraper.
import { SchemaType } from '../../schema/SchemaHelper';
import Plugin from '../Plugin';
import Project from '../../storage/base/Project';
import Resource from '../../storage/base/Resource';
import { IDomClientConstructor, IDomNode } from '../../domclient/DomClient';
import NativeClient from '../../domclient/NativeClient';
/**
 * Extracts new URLs to be scraped based on CSS selectors. Runs in browser.
 *
 * For every configured selector pair the plugin collects the `href` / `src` of the
 * matching elements, resolves them against the URL of the resource being scraped,
 * strips the fragment identifier, discards anything that is not a valid http(s) URL
 * and returns the remaining, de-duplicated resources.
 */
export default class ExtractUrlsPlugin extends Plugin {
  static get schema() {
    return {
      type: 'object',
      title: 'Extract Urls Plugin',
      description: 'Extracts new (html or binary) resource urls using CSS selectors.',
      properties: {
        selectorPairs: {
          type: 'array',
          items: {
            type: 'object',
            properties: {
              urlSelector: {
                type: 'string',
              },
              titleSelector: {
                type: 'string',
              },
            },
            required: [ 'urlSelector' ],
          },
          default: [ { urlSelector: 'a[href$=".html"]' } ],
          description: 'CSS selectors to be applied. By defining an optional titleSelector, when exporting binary resources, the generated filename will be prefixed by the titleSelector value.',
        },
        maxDepth: {
          type: 'integer',
          default: -1,
          title: 'Max Depth',
          description: 'Maximum depth of resources to be scraped. The starting resource has depth 0. Resources discovered from it have depth 1 and so on. A value of -1 disables this check.',
        },
        domRead: {
          type: 'boolean',
          default: true,
        },
      },
    } as const;
  }

  opts: SchemaType<typeof ExtractUrlsPlugin.schema>;

  /** In case of a dynamic resource, the urls already returned by previous `apply` calls. */
  prevUrls: Set<string>;

  /** DOM wrapper the selectors are run against; (re)assigned on every `apply` call. */
  document: IDomNode;

  constructor(opts: SchemaType<typeof ExtractUrlsPlugin.schema> = {}) {
    super(opts);
    this.prevUrls = new Set<string>();
  }

  /**
   * Decides whether the plugin should run for the given resource.
   *
   * @returns `false` when there is no resource, when its content type does not mention
   * html, or when `maxDepth` is set (not -1) and the resource depth has reached it.
   */
  test(project: Project, resource: Resource) {
    if (!resource) return false;

    // don't extract new resources from non-parsable pages
    if (!(/html/i).test(resource.contentType)) return false;

    // don't extract new resources if max depth has been reached
    return this.opts.maxDepth === -1 || resource.depth < this.opts.maxDepth;
  }

  /**
   * Extracts the resources linked from the current one.
   *
   * @param DomClient - optional DOM client constructor, built from `resource.data`;
   * when omitted, a `NativeClient` over the global document body is used.
   * @returns `{ resourcesToAdd }` with the not yet seen resources, or `null` when there are none.
   */
  apply(project: Project, resource: Resource, DomClient?: IDomClientConstructor) {
    this.document = DomClient ? new DomClient(resource.data) : new NativeClient(document.querySelector('body'));

    const allResourcesToAdd: Partial<Resource>[] = this.extractResources(resource);
    const resourcesToAdd = this.diffAndMerge(allResourcesToAdd);

    return resourcesToAdd.length > 0 ? { resourcesToAdd } : null;
  }

  /**
   * Runs all selector pairs against `this.document` and normalizes the result.
   *
   * @param resource - the resource being scraped; its `url` is the base for relative links
   * @returns resources with absolute, fragment-less http(s) urls, unique by url
   * (first occurrence wins); links that can't be parsed or use another protocol are dropped.
   */
  extractResources(resource: Resource): Partial<Resource>[] {
    const currentUrl = new URL(resource.url);

    /*
    sometimes the link innerText or img alt text is not enough to uniquely differentiate between child urls..
    ex: extracting pdf files from a project where on each page there is a link with "Export" text
    if we are to rename the pdf files based on link innerText, all pdf files will result in the name 'export.pdf'
    to avoid this, an extra, optional title selector is added
    it is responsible for linking link(s) with some other elm innerText from the page, like, for ex, h2.page-title
    */
    const candidates: Partial<Resource>[] = [];
    this.opts.selectorPairs.forEach(({ urlSelector, titleSelector }) => {
      candidates.push(...this.extractSelectorResources(urlSelector, titleSelector));
    });

    const uniqueResources: Partial<Resource>[] = [];
    const seenUrls = new Set<string>();

    candidates.forEach(candidate => {
      // construct resource full URL without #html_fragment_identifiers
      let fullUrl: URL;
      try {
        fullUrl = new URL(candidate.url, currentUrl);
      }
      catch {
        // a single malformed href must not abort the extraction of the remaining links
        return;
      }
      fullUrl.hash = '';

      // mailto:, javascript:, tel:, ... links are not scrapable
      if (!this.isValidResourceUrl(fullUrl)) return;

      const normalizedUrl = fullUrl.toString();
      if (seenUrls.has(normalizedUrl)) return;
      seenUrls.add(normalizedUrl);

      // eslint-disable-next-line no-param-reassign
      candidate.url = normalizedUrl;
      uniqueResources.push(candidate);
    });

    return uniqueResources;
  }

  /**
   * Builds one resource per element matched by `urlSelector` that has a `src` or `href`.
   *
   * `src` takes precedence over `href` when both are present. When `titleSelector`
   * matches at least one node, each resource gets the trimmed text of the title node
   * at the same position, falling back to the last title when there are fewer titles than links.
   *
   * @returns resources carrying the raw (not yet resolved) url and a `parent` descriptor
   */
  extractSelectorResources(urlSelector: string, titleSelector: string): Partial<Resource>[] {
    const titles: string[] = titleSelector
      ? Array.from(this.document.querySelectorAll(titleSelector)).map(
        // a title node without text yields an empty title instead of throwing on trim()
        (titleNode: IDomNode) => (titleNode.getAttribute('innerText') || '').trim(),
      )
      : [];

    const resources: Partial<Resource>[] = [];
    Array.from(this.document.querySelectorAll(urlSelector)).forEach((elm: IDomNode, idx) => {
      const src = elm.getAttribute('src');
      const href = elm.getAttribute('href');

      let resource: Partial<Resource>;
      if (src) {
        resource = { url: src, parent: { imgAlt: elm.getAttribute('alt') } };
      }
      else if (href) {
        resource = { url: href, parent: { linkText: elm.getAttribute('innerText') } };
      }
      else {
        // element carries no url, nothing to extract
        return;
      }

      if (titles.length > 0) {
        resource.parent.title = titles[Math.min(idx, titles.length - 1)];
      }

      resources.push(resource);
    });

    return resources;
  }

  /**
   * Checks that a parsed url can be scraped.
   *
   * @returns `true` only for `http:` / `https:` urls with a non-null pathname
   */
  isValidResourceUrl(resourceUrl: Pick<URL, 'protocol' | 'pathname'>): boolean {
    // check valid protocol
    if (!(/^https?:$/).test(resourceUrl.protocol)) {
      return false;
    }

    // check valid pathname
    if (resourceUrl.pathname === null) {
      return false;
    }

    return true;
  }

  /**
   * Keeps only the resources whose url was not returned by a previous call
   * and records the kept urls in `prevUrls`.
   */
  diffAndMerge(resourcesToAdd: Partial<Resource>[]) {
    return resourcesToAdd.filter(resource => {
      if (this.prevUrls.has(resource.url)) return false;

      this.prevUrls.add(resource.url);
      return true;
    });
  }
}