forked from get-set-fetch/scraper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Resource.ts
108 lines (84 loc) · 2.7 KB
/
Resource.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import Entity, { IStaticEntity } from './Entity';
/**
 * Optional text metadata kept in a resource's `parent` field.
 * NOTE(review): the member names suggest they describe the link / image element
 * through which the resource was discovered - confirm against the plugin that populates them.
 */
export type IResourceParent = {
/** presumably the text of the anchor pointing to the resource - TODO confirm */
linkText?: string;
/** presumably the `alt` attribute of a referencing image - TODO confirm */
imgAlt?: string;
/** presumably a `title` attribute or page title - TODO confirm which */
title?: string;
}
/** Proxy endpoint, identified by its host and port. */
export type Proxy = {
/** proxy host; NOTE(review): hostname vs IP format is not constrained here */
host: string;
/** proxy port */
port: number;
}
/**
 * A Resource stands for a single url (web page, image, API endpoint, ...).
 * The class is abstract: it has to be subclassed before it can be instantiated.
 */
export default abstract class Resource extends Entity {
  id: number;

  projectId: number;

  url: string;

  actions: string[];

  /** defaults to 0 when the constructor receives no (or a falsy) depth */
  depth: number;

  scrapedAt: Date;

  scrapeInProgress: boolean;

  /** status code of the response */
  status: number;

  /** content-type of the response */
  contentType: string;

  /**
   * Text based content.
   * Organized as data rows, each row holding one or multiple entries.
   * An entry usually maps to the content matched by a CSS selector.
   */
  content: string[][];

  /** Binary content. */
  data: Uint8Array;

  parent: IResourceParent;

  /** Not stored. Filled in by the ExtractUrlsPlugin, persisted as new resources by the InsertResourcesPlugin. */
  resourcesToAdd: Partial<Resource>[];

  /**
   * Not stored. Filled in by ConcurrencyManager based on the available proxy pool.
   * When present, plugins should route their requests through this proxy.
   */
  proxy: Proxy;

  /**
   * Builds a resource from a (possibly partial) set of properties.
   * Besides delegating to the Entity constructor it normalizes a few fields:
   * a falsy depth becomes 0, scrapeInProgress is coerced to a boolean,
   * a non-Date scrapedAt is wrapped in a Date and string values of
   * content / parent / actions are JSON-parsed.
   *
   * @param kwArgs - initial property values, defaults to an empty object
   * @throws SyntaxError (from JSON.parse) when a string content / parent / actions value is not valid JSON
   */
  constructor(kwArgs: Partial<Resource> = {}) {
    // presumably Entity copies kwArgs onto the instance - see Entity for the exact behavior
    super(kwArgs);

    // no depth supplied (or depth 0) => depth 0
    if (!kwArgs.depth) {
      this.depth = 0;
    }

    // whatever value ended up on the instance, expose a strict boolean
    this.scrapeInProgress = Boolean(this.scrapeInProgress);

    // scrapedAt may arrive as something other than a Date, wrap it
    const rawScrapedAt = kwArgs.scrapedAt;
    if (rawScrapedAt && !(rawScrapedAt instanceof Date)) {
      this.scrapedAt = new Date(rawScrapedAt);
    }

    // the next three fields may arrive JSON-serialized as strings
    const rawContent: unknown = kwArgs.content;
    if (typeof rawContent === 'string') {
      this.content = JSON.parse(rawContent);
    }

    const rawParent: unknown = kwArgs.parent;
    if (typeof rawParent === 'string') {
      this.parent = JSON.parse(rawParent);
    }

    const rawActions: unknown = kwArgs.actions;
    if (typeof rawActions === 'string') {
      this.actions = JSON.parse(rawActions);
    }
  }

  /** Names of the stored properties; resourcesToAdd and proxy are intentionally absent. */
  get dbCols() {
    return [
      'id',
      'projectId',
      'url',
      'actions',
      'depth',
      'scrapedAt',
      'scrapeInProgress',
      'status',
      'contentType',
      'content',
      'data',
      'parent',
    ];
  }

  /**
   * JSON representation handed over to plugins running in DOM.
   * It is a shallow copy of toJSON() with the binary `data` entry removed:
   * DOM plugins don't need the binary content and moving it from node to DOM
   * can take lots of memory.
   *
   * @returns shallow copy of toJSON() without the `data` property
   */
  toExecJSON() {
    const execPayload = { ...this.toJSON() };
    delete execPayload.data;
    return execPayload;
  }
}
/**
 * Shape of a resource query; accepted (as a Partial) by IStaticResource.getPagedResources.
 * NOTE(review): how each field is translated into an actual storage query is not visible here.
 */
export type ResourceQuery = {
/** presumably the number of rows to skip - TODO confirm */
offset: number;
/** presumably the maximum number of rows to return - TODO confirm */
limit: number;
/** filter values: projectId is mandatory, any further property is a string or number */
where: {
projectId: number;
[prop: string]: string|number;
},
/** presumably names of columns required to be non-null - TODO confirm */
whereNotNull: string[],
/** presumably names of the columns to retrieve - TODO confirm */
cols: string[];
}
/**
 * Static side of a concrete Resource class: its constructor signature plus
 * the class-level retrieval methods, all of which are asynchronous.
 */
export interface IStaticResource extends IStaticEntity {
/** Constructs a resource from a partial set of properties. */
new(kwArgs: Partial<Resource>): Resource;
/** Resolves with the resource identified by project id and url. NOTE(review): result for a missing resource is not specified here. */
getResource(projectId:number, url: string):Promise<Resource>;
/** Resolves with partial resources matching the query; presumably paginated via offset / limit - TODO confirm. */
getPagedResources(query: Partial<ResourceQuery>):Promise<Partial<Resource>[]>;
/** Resolves with an untyped array for the given project; presumably all of its resources - TODO confirm. */
getAll(projectId: number):Promise<any[]>;
/** Resolves with a resource of the given project to be scraped; NOTE(review): selection criteria are not visible here. */
getResourceToScrape(projectId:number):Promise<Resource>;
}