From 6547ab7f4eb9a192bd4ba428ce42aedc4dafd7a9 Mon Sep 17 00:00:00 2001 From: "Nicholas C. Zakas" Date: Tue, 14 Sep 2021 11:13:35 -0700 Subject: [PATCH] feat: Implement optional property on schema types --- README.md | 6 ++- src/schema-types.js | 85 ++++++++++++++++++++++++++---------- tests/data-extractor.test.js | 16 ++++++- 3 files changed, 82 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 09018d3..08a4fa2 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,8 @@ const extractor = new DataExtractor({ }, stars: { type: "number", - selector: "svg.octicon-star + span" + selector: "svg.octicon-star + span", + optional: true } }); @@ -87,9 +88,10 @@ There are several different schema types you can use. ### Primitive Types -There are three primitive schema types: `string`, `number`, and `boolean`. Each type has two possible properties: +There are three primitive schema types: `string`, `number`, and `boolean`. Each type has three possible properties: 1. `selector` (**required**) - the CSS selector to find the element. +1. `optional` - a boolean indicating if the element can be safely skipped. When `true`, if the CSS selector evaluates to `null` then the field is set to `undefined`; when omitted or `false`, an error is thrown when the CSS selector evaluates to `null`. 1. `convert` - a function used to convert the value into some other form. This function is run after the text is extracted and from the element and converted (for `number` and `boolean`), and before that text is inserted into the final data structure. The primitive schema types all act the same except for how they convert the extracted value: diff --git a/src/schema-types.js b/src/schema-types.js index 3dab02d..b55afac 100644 --- a/src/schema-types.js +++ b/src/schema-types.js @@ -22,11 +22,13 @@ import { stringToBoolean, stringToNumber, identity } from "./converters.js"; * * @typedef {Object} SchemaDef * @property {string} selector The CSS selector to locate the element. 
+ * @property {boolean} [optional=false] Indicates if the selector is allowed to match no elements. * @property {Function?} convert A conversion function that will initially * receive the extracted data before placing it in the data structure * * @typedef {Object} ArraySchemaDef * @property {string} selector The CSS selector to locate the element. + * @property {boolean} [optional=false] Indicates if the selector is allowed to match no elements. * @property {Function?} convert A conversion function that will initially * receive the extracted data before placing it in the data structure * @property {Object} items The schema for each item @@ -42,6 +44,7 @@ import { stringToBoolean, stringToNumber, identity } from "./converters.js"; * * @typedef {Object} ObjectSchemaDef * @property {string} selector The CSS selector to locate the element. + * @property {boolean} [optional=false] Indicates if the selector is allowed to match no elements. * @property {Function?} convert A conversion function that will initially * receive the extracted data before placing it in the data structure * @property {Object} properties The schema for each @@ -49,6 +52,7 @@ import { stringToBoolean, stringToNumber, identity } from "./converters.js"; * * @typedef {Object} TableSchemaDef * @property {string} selector The CSS selector to locate the element. + * @property {boolean} [optional=false] Indicates if the selector is allowed to match no elements. * @property {Function?} convert A conversion function that will initially * receive the extracted data before placing it in the data structure * @property {Array} head An array of schema definitions for the @@ -78,6 +82,11 @@ import { stringToBoolean, stringToNumber, identity } from "./converters.js"; * @returns {string} The text from the element. */ function extractText(element) { + + if (!element) { + return undefined; + } + switch (element.tagName) { case "IMG": return element.alt; @@ -95,6 +104,16 @@ function extractText(element) { } } +/** + * Throws an error saying the selector wasn't found. 
+ * @param {string} selector The selector that couldn't be found. + * @returns {void} + * @throws {Error} Always. + */ +function throwNotFound(selector) { + throw new Error(`Element matching "${selector}" could not be found.`); +} + //----------------------------------------------------------------------------- // Functions //----------------------------------------------------------------------------- @@ -107,8 +126,17 @@ export const schemaTypes = { * @param {ArraySchemaDef} def The schema definition for the array. * @returns {Array} An array of data matching the definition. */ - async array(root, { selector, items, convert = identity }) { + async array(root, { selector, optional, items, convert = identity }) { const itemHandles = await root.$$(selector); + + if (itemHandles.length === 0) { + if (optional) { + return undefined; + } + + throwNotFound(selector); + } + const itemEntries = Object.entries(items); const result = []; @@ -131,20 +159,27 @@ export const schemaTypes = { * @param {ArraySchemaDef} def The schema definition for the custom value. * @returns {*} The value returned from Puppeteer. */ - async custom(root, { selector, extract, convert = identity }) { + async custom(root, { selector, optional, extract, convert = identity }) { if (typeof extract !== "function") { throw new TypeError("Custom schema type must have extract() method."); } - let value; + let handle = root; if (selector) { - value = await root.$eval(selector, extract); - } else { - value = await root.evaluate(extract, root); + handle = await root.$(selector); + + if (!handle) { + if (optional) { + return undefined; + } + + throwNotFound(selector); + } } + let value = await handle.evaluate(extract, handle); return convert(value); }, @@ -154,8 +189,8 @@ export const schemaTypes = { * @param {SchemaDef} def The schema definition for the array. * @returns {boolean} A boolean value representing the data. 
*/ - async boolean(root, { selector, convert = identity }) { - const value = await this.string(root, { selector }); + async boolean(root, { selector, optional, convert = identity }) { + const value = await this.string(root, { selector, optional }); return convert(stringToBoolean(value)); }, @@ -165,8 +200,8 @@ export const schemaTypes = { * @param {SchemaDef} def The schema definition for the array. * @returns {number} A number value representing the data. */ - async number(root, { selector, convert = identity }) { - const value = await this.string(root, { selector }); + async number(root, { selector, optional, convert = identity }) { + const value = await this.string(root, { selector, optional }); return convert(stringToNumber(value)); }, @@ -176,8 +211,17 @@ export const schemaTypes = { * @param {ObjectSchemaDef} def The schema definition for the array. * @returns {Object} An object of data matching the definition. */ - async object(root, { selector, properties, convert = identity }) { - const handle = selector ? await root.$$(selector) : root; + async object(root, { selector, optional, properties, convert = identity }) { + const handle = selector ? await root.$(selector) : root; + + if (!handle) { + if (optional) { + return undefined; + } + + throwNotFound(selector); + } + const propertyEntries = Object.entries(properties); const result = {}; @@ -195,17 +239,14 @@ export const schemaTypes = { * @param {SchemaDef} def The schema definition for the array. * @returns {string} A string value representing the data. 
*/ - async string(root, { selector, convert = identity } = {}) { + async string(root, { selector, optional, convert = identity } = {}) { - let value; - - if (selector) { - value = await root.$eval(selector, extractText); - } else { - value = await root.evaluate(extractText, root); - } - - return convert(value); + return this.custom(root, { + selector, + optional, + extract: extractText, + convert + }); }, /** diff --git a/tests/data-extractor.test.js b/tests/data-extractor.test.js index 55f77ef..0f3cda5 100644 --- a/tests/data-extractor.test.js +++ b/tests/data-extractor.test.js @@ -61,6 +61,15 @@ function blockAllBut(page, url) { }); } +/** + * Normalizes an object by passing it through JSON methods. + * @param {Object} data The data to normalize. + * @returns {Object} The normalized data. + */ +function normalizeToJson(data) { + return JSON.parse(JSON.stringify(data)); +} + //----------------------------------------------------------------------------- // Schemas //----------------------------------------------------------------------------- @@ -78,6 +87,11 @@ const salaryPost = { type: "string", selector: "[itemprop=datePublished]" }, + missing: { + type: "string", + selector: "[foo=bar]", + optional: true + }, salaries: { type: "table", selector: "table", @@ -187,7 +201,7 @@ describe("DataExtractor", () => { await page.goto(url); await page.waitForSelector("body"); - const result = await extractor.extractFrom(page); + const result = normalizeToJson(await extractor.extractFrom(page)); expect(result).to.deep.equal(expected); }); });