Skip to content

Commit

Permalink
feat: Implement optional property on schema types
Browse files Browse the repository at this point in the history
  • Loading branch information
nzakas committed Sep 14, 2021
1 parent 404dc3c commit 6547ab7
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 25 deletions.
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ const extractor = new DataExtractor({
},
stars: {
type: "number",
selector: "svg.octicon-star + span"
selector: "svg.octicon-star + span",
optional: true
}
});

Expand Down Expand Up @@ -87,9 +88,10 @@ There are several different schema types you can use.

### Primitive Types

There are three primitive schema types: `string`, `number`, and `boolean`. Each type has two possible properties:
There are three primitive schema types: `string`, `number`, and `boolean`. Each type has three possible properties:

1. `selector` (**required**) - the CSS selector to find the element.
1. `optional` - a boolean indicating if the element can be safely skipped. When `true`, if the CSS selector evaluates to `null` then the field is set to `undefined`; when omitted or `false`, an error is thrown when the CSS selector evaluates to `null`.
1. `convert` - a function used to convert the value into some other form. This function is run after the text is extracted from the element and converted (for `number` and `boolean`), and before the value is inserted into the final data structure.

The primitive schema types all act the same except for how they convert the extracted value:
Expand Down
85 changes: 63 additions & 22 deletions src/schema-types.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,13 @@ import { stringToBoolean, stringToNumber, identity } from "./converters.js";
*
* @typedef {Object} SchemaDef
* @property {string} selector The CSS selector to locate the element.
* @property {boolean} [optional=false] Indicates if the selector may not exist.
* @property {Function?} convert A conversion function that will initially
* receive the extracted data before placing it in the data structure
*
* @typedef {Object} ArraySchemaDef
* @property {string} selector The CSS selector to locate the element.
* @property {boolean} [optional=false] Indicates if the selector may not exist.
* @property {Function?} convert A conversion function that will initially
* receive the extracted data before placing it in the data structure
* @property {Object<string,SchemaDef>} items The schema for each item
Expand All @@ -42,13 +44,15 @@ import { stringToBoolean, stringToNumber, identity } from "./converters.js";
*
* @typedef {Object} ObjectSchemaDef
* @property {string} selector The CSS selector to locate the element.
* @property {boolean} [optional=false] Indicates if the selector may not exist.
* @property {Function?} convert A conversion function that will initially
* receive the extracted data before placing it in the data structure
* @property {Object<string,SchemaDef>} properties The schema for each
* property in the object.
*
* @typedef {Object} TableSchemaDef
* @property {string} selector The CSS selector to locate the element.
* @property {boolean} [optional=false] Indicates if the selector may not exist.
* @property {Function?} convert A conversion function that will initially
* receive the extracted data before placing it in the data structure
* @property {Array<SchemaDef>} head An array of schema definitions for the
Expand Down Expand Up @@ -78,6 +82,11 @@ import { stringToBoolean, stringToNumber, identity } from "./converters.js";
* @returns {string|undefined} The text from the element, or `undefined` when no element is provided.
*/
function extractText(element) {

if (!element) {
return undefined;
}

switch (element.tagName) {
case "IMG":
return element.alt;
Expand All @@ -95,6 +104,16 @@ function extractText(element) {
}
}

/**
 * Reports a failed element lookup by raising an error that names the
 * CSS selector which produced no match.
 * @param {string} selector The CSS selector that matched nothing.
 * @returns {never} This function never returns normally.
 * @throws {Error} On every call, with the selector quoted in the message.
 */
function throwNotFound(selector) {
    const message = `Element matching "${selector}" could not be found.`;
    throw new Error(message);
}

//-----------------------------------------------------------------------------
// Functions
//-----------------------------------------------------------------------------
Expand All @@ -107,8 +126,17 @@ export const schemaTypes = {
* @param {ArraySchemaDef} def The schema definition for the array.
* @returns {Array} An array of data matching the definition.
*/
async array(root, { selector, items, convert = identity }) {
async array(root, { selector, optional, items, convert = identity }) {
const itemHandles = await root.$$(selector);

if (itemHandles.length === 0) {
if (optional) {
return undefined;
}

throwNotFound(selector);
}

const itemEntries = Object.entries(items);
const result = [];

Expand All @@ -131,20 +159,27 @@ export const schemaTypes = {
* @param {ArraySchemaDef} def The schema definition for the custom value.
* @returns {*} The value returned from Puppeteer.
*/
async custom(root, { selector, extract, convert = identity }) {
async custom(root, { selector, optional, extract, convert = identity }) {

if (typeof extract !== "function") {
throw new TypeError("Custom schema type must have extract() method.");
}

let value;
let handle = root;

if (selector) {
value = await root.$eval(selector, extract);
} else {
value = await root.evaluate(extract, root);
handle = await root.$(selector);

if (!handle) {
if (optional) {
return undefined;
}

throwNotFound(selector);
}
}

let value = await handle.evaluate(extract, handle);
return convert(value);
},

Expand All @@ -154,8 +189,8 @@ export const schemaTypes = {
* @param {SchemaDef} def The schema definition for the boolean.
* @returns {boolean} A boolean value representing the data.
*/
async boolean(root, { selector, convert = identity }) {
const value = await this.string(root, { selector });
async boolean(root, { selector, optional, convert = identity }) {
const value = await this.string(root, { selector, optional });
return convert(stringToBoolean(value));
},

Expand All @@ -165,8 +200,8 @@ export const schemaTypes = {
* @param {SchemaDef} def The schema definition for the number.
* @returns {number} A number value representing the data.
*/
async number(root, { selector, convert = identity }) {
const value = await this.string(root, { selector });
async number(root, { selector, optional, convert = identity }) {
const value = await this.string(root, { selector, optional });
return convert(stringToNumber(value));
},

Expand All @@ -176,8 +211,17 @@ export const schemaTypes = {
* @param {ObjectSchemaDef} def The schema definition for the object.
* @returns {Object<string,*>} An object of data matching the definition.
*/
async object(root, { selector, properties, convert = identity }) {
const handle = selector ? await root.$$(selector) : root;
async object(root, { selector, optional, properties, convert = identity }) {
const handle = selector ? await root.$(selector) : root;

if (!handle) {
if (optional) {
return undefined;
}

throwNotFound(selector);
}

const propertyEntries = Object.entries(properties);

const result = {};
Expand All @@ -195,17 +239,14 @@ export const schemaTypes = {
* @param {SchemaDef} def The schema definition for the string.
* @returns {string} A string value representing the data.
*/
async string(root, { selector, convert = identity } = {}) {
async string(root, { selector, optional, convert = identity } = {}) {

let value;

if (selector) {
value = await root.$eval(selector, extractText);
} else {
value = await root.evaluate(extractText, root);
}

return convert(value);
return this.custom(root, {
selector,
optional,
extract: extractText,
convert
});
},

/**
Expand Down
16 changes: 15 additions & 1 deletion tests/data-extractor.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,15 @@ function blockAllBut(page, url) {
});
}

/**
 * Round-trips a value through JSON serialization so that it contains only
 * JSON-representable data. Properties whose value is `undefined` are dropped
 * in the process, which lets results with skipped optional fields be
 * compared against plain expected data.
 * @param {Object} data The data to normalize.
 * @returns {Object} A fresh copy of the data as produced by parsing its JSON form.
 */
function normalizeToJson(data) {
    const serialized = JSON.stringify(data);
    return JSON.parse(serialized);
}

//-----------------------------------------------------------------------------
// Schemas
//-----------------------------------------------------------------------------
Expand All @@ -78,6 +87,11 @@ const salaryPost = {
type: "string",
selector: "[itemprop=datePublished]"
},
missing: {
type: "string",
selector: "[foo=bar]",
optional: true
},
salaries: {
type: "table",
selector: "table",
Expand Down Expand Up @@ -187,7 +201,7 @@ describe("DataExtractor", () => {

await page.goto(url);
await page.waitForSelector("body");
const result = await extractor.extractFrom(page);
const result = normalizeToJson(await extractor.extractFrom(page));
expect(result).to.deep.equal(expected);
});
});
Expand Down

0 comments on commit 6547ab7

Please sign in to comment.