Skip to content

Commit

Permalink
Refactor breaking the code into multiple files (#2)
Browse files Browse the repository at this point in the history
* moved cli arg handling and screen diff code to their own files

* moving crawling code to its own file

* bumping version number
  • Loading branch information
ktabors authored Dec 7, 2021
1 parent 26635b3 commit f38c96d
Show file tree
Hide file tree
Showing 5 changed files with 329 additions and 284 deletions.
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "docs-differ",
"version": "1.0.0",
"version": "1.0.1",
"description": "diffs two websites, built for react-spectrum docs site",
"license": "Apache-2.0",
"main": "src/index.js",
Expand Down
96 changes: 96 additions & 0 deletions src/args.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
// Command line option flags recognized by this tool. The -b, -c, -s and -k
// flags take their value from the argument that immediately follows them;
// -m, -d and -q are plain switches.
const baselineCommandParam = '-b'; // url of the baseline site
const currentCommandParam = '-c'; // url of the current site to diff against the baseline
const screenshotMaxCommandParam = '-s'; // integer cap on the number of screenshots
const disableMobileCommandParam = '-m'; // turns off mobile screenshots
const disableDesktopCommandParam = '-d'; // turns off desktop screenshots
const clusterSizeCommandParam = '-k'; // integer max concurrency for the cluster
const quietLoggingCommandParam = '-q'; // turns off the verbose log messages

// Values used for every option that is not supplied on the command line.
// Exported below under the name defaultParamValues.
const defaultValues = {
// -1 means "no limit" on the number of screenshots
screenshotLimit: -1,
disableDesktopScreenshots: false,
disableMobileScreenshots: false,
clusterMaxConcurrency: 10,
verboseLogMessages: true
};

/**
 * Gets the urls to compare from the command line args.
 *
 * Looks for the baseline flag first and the current flag second, and takes
 * the argument that follows each one. A flag that is missing, or that is the
 * very last argument and so has no value after it, contributes nothing to
 * the result; callers can therefore detect a missing url from the length of
 * the returned array instead of receiving an `undefined` entry.
 *
 * @returns {string[]} the urls that were supplied, baseline before current
 */
function getUrls() {
  const cliArgs = process.argv.slice(2);
  const urls = [];

  for (const flag of [baselineCommandParam, currentCommandParam]) {
    const flagIndex = cliArgs.indexOf(flag);
    if (flagIndex === -1) {
      continue;
    }

    const value = cliArgs[flagIndex + 1];
    // a trailing flag has no value after it; skip it rather than push undefined
    if (value !== undefined) {
      urls.push(value);
    }
  }

  return urls;
}

// Prints the command line usage help, one console.log call per line of text.
function logUsage() {
  const usageLines = [
    `Usage: 'node src/index.js ${baselineCommandParam} <baseline site url> ${currentCommandParam} <current site to diff against baseline url>'`,
    `Other options include ${disableMobileCommandParam}, ${disableDesktopCommandParam} and ${screenshotMaxCommandParam} <integer>`,
    ` ${disableMobileCommandParam} disable mobile screenshots, default false`,
    ` ${disableDesktopCommandParam} disable desktop screenshots, default false`,
    ` ${screenshotMaxCommandParam} limit screenshots taken for all possible to this number`,
    ` ${clusterSizeCommandParam} max cluster size for concurrency, default 10`,
    ` ${quietLoggingCommandParam} quiets some log messages`
  ];

  for (const line of usageLines) {
    console.log(line);
  }
}

/**
 * Reads the optional settings from the command line, falling back to
 * defaultValues for anything that was not supplied:
 * - max number of screenshots
 * - max cluster concurrency
 * - disable mobile screenshots
 * - disable desktop screenshots
 * - quiet logging
 *
 * A numeric option whose value does not parse as an integer prints the usage
 * text and exits the process with code 1.
 */
function processArgs() {
  const cliArgs = process.argv.slice(2);

  // Parses the integer that follows `flag`. Gives back `current` when the
  // flag is absent and `fallback` when the parsed number is zero or negative.
  const readInteger = (flag, current, fallback) => {
    const position = cliArgs.indexOf(flag);
    if (position === -1) {
      return current;
    }

    const parsed = parseInt(cliArgs[position + 1], 10);
    if (isNaN(parsed)) {
      logUsage();
      process.exit(1);
    }
    return parsed <= 0 ? fallback : parsed;
  };

  // true when the given switch appears anywhere on the command line
  const hasSwitch = flag => cliArgs.indexOf(flag) !== -1;

  return {
    screenshotLimit: readInteger(screenshotMaxCommandParam, defaultValues.screenshotLimit, -1),
    clusterMaxConcurrency: readInteger(clusterSizeCommandParam, defaultValues.clusterMaxConcurrency, 10),
    disableMobileScreenshots: hasSwitch(disableMobileCommandParam) ? true : defaultValues.disableMobileScreenshots,
    disableDesktopScreenshots: hasSwitch(disableDesktopCommandParam) ? true : defaultValues.disableDesktopScreenshots,
    verboseLogMessages: hasSwitch(quietLoggingCommandParam) ? false : defaultValues.verboseLogMessages
  };
}

// Public API of the args module.
exports.getUrls = getUrls;
exports.logUsage = logUsage;
exports.processArgs = processArgs;
// note: the defaultValues object is exported under the name defaultParamValues
exports.defaultParamValues = defaultValues;
194 changes: 194 additions & 0 deletions src/crawl.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
const {Cluster} = require('puppeteer-cluster');
const {defaultParamValues} = require('./args');
const mkdirp = require('mkdirp');
const path = require('path');
const puppeteer = require('puppeteer');
const URL = require('url').URL;

// iPhone 11 device descriptor from puppeteer, passed to page.emulate() for the
// mobile screenshots
const puppeteeriPhone11 = puppeteer.devices['iPhone 11'];

// global cluster variable to avoid passing it around; assigned in
// setupClusterAndCrawl
let cluster;

// per-site record of visited pages: keys are url pathnames, values are the
// [desktop, mobile] screenshot file names for that page
let visitedBaseline = {};
let visitedCurrent = {};
// same-host links that were skipped instead of walked, stored as
// {parentUrl, badUrl} objects
let badUrls = [];

// global variables for args; they start at the defaults and are overwritten
// by setupClusterAndCrawl with the values it is given
let {
screenshotLimit,
disableDesktopScreenshots,
disableMobileScreenshots,
verboseLogMessages
} = defaultParamValues;

/**
 * Setup puppeteer cluster for parallelization and crawl sites.
 *
 * Copies the argument values into this module's globals, launches the
 * cluster, walks the baseline site and then the current site, waits for the
 * queue to drain and closes the cluster. Any urls collected in badUrls are
 * logged at the end.
 *
 * @param {object} options
 * @param {string} options.baselineDir directory the baseline screenshots are written to
 * @param {string} options.currentDir directory the current screenshots are written to
 * @param {string[]} options.urls baseline url at index 0, current url at index 1
 * @param {number} options.screenshotLimit max pages to visit per site, -1 for no limit
 * @param {boolean} options.disableDesktopScreenshots skip the desktop screenshot
 * @param {boolean} options.disableMobileScreenshots skip the mobile screenshot
 * @param {boolean} options.verboseLogMessages log each visit and retry
 * @param {number} options.clusterMaxConcurrency maxConcurrency handed to Cluster.launch
 * @throws rethrows any error raised while launching or crawling, after
 *   setting process.exitCode to 1
 */
async function setupClusterAndCrawl({baselineDir, currentDir, urls, ...argValues}) {
  screenshotLimit = argValues.screenshotLimit;
  disableDesktopScreenshots = argValues.disableDesktopScreenshots;
  disableMobileScreenshots = argValues.disableMobileScreenshots;
  verboseLogMessages = argValues.verboseLogMessages;

  // tracked locally so the cleanup below only touches a cluster launched by
  // this call and never closes it twice
  let launchedCluster = null;
  let clusterClosed = false;

  try {
    launchedCluster = await Cluster.launch({
      concurrency: Cluster.CONCURRENCY_CONTEXT,
      maxConcurrency: argValues.clusterMaxConcurrency
    });
    cluster = launchedCluster;

    // Event handler to catch and log cluster task errors
    cluster.on('taskerror', (err, data) => {
      // jobs are queued as objects carrying a url property, so log that url
      // instead of the object itself (which would print "[object Object]")
      const failedUrl = data && data.url ? data.url : data;
      console.log(`Error crawling ${failedUrl}: ${err.message}`);
    });

    // triggering the screenshot scraping of the two sites
    await walkUrl(urls[0], visitedBaseline, baselineDir);
    await walkUrl(urls[1], visitedCurrent, currentDir);

    await cluster.idle();
    await cluster.close();
    clusterClosed = true;
    console.log('queue empty and cluster closed');
    // NOTE(review): relies on a console.time('executionTime') timer started
    // outside this function — confirm in the caller
    console.timeLog('executionTime');

    // URLs that might be bad
    if (badUrls.length > 0) {
      console.log('Are these URLs bad?', badUrls);
    }
  } catch (e) {
    // mark the process as failed so the shell/CI sees a non-zero exit status
    // even if the caller handles the rethrown error
    process.exitCode = 1;
    throw e;
  } finally {
    // cluster cleanup in case there is an exception; skipped when the launch
    // itself failed or the cluster was already closed above
    if (launchedCluster && !clusterClosed) {
      try {
        await launchedCluster.close();
      } catch (closeError) {
        // log instead of throwing so the original error is not masked
        console.log('Error closing cluster: ', closeError.message);
      }
    }
  }
}

/**
 * Creates the storage directory, works out the root path of the site from
 * the starting url and begins walking the site from that url.
 */
async function walkUrl(url, visited, storageDirectory) {
  mkdirp.sync(storageDirectory);

  // everything in front of the final slash of the pathname is the root path;
  // it is non-empty for CI urls
  const {pathname} = new URL(url);
  const rootPath = pathname.substring(0, pathname.lastIndexOf('/'));

  await walk(url, rootPath, visited, storageDirectory);
}

/**
 * Checks if a URL has been visited; if it hasn't, records it as visited and
 * queues it on the cluster. The queued task (crawlPage) takes the screenshots,
 * collects the links of the page and calls this function, walk, again for
 * the links that belong to this site.
 *
 * Nothing is queued when the page was already visited or when the screenshot
 * limit has been reached.
 *
 * @param {string} href url of the page to visit; its hash is ignored
 * @param {string} rootPath leading part of the pathname to strip from file names
 * @param {object} visited map of pathname to [desktop, mobile] file names, updated in place
 * @param {string} storageDirectory directory the screenshots are written to
 */
async function walk(href, rootPath, visited, storageDirectory) {
  const url = new URL(href);
  url.hash = '';
  if (visited[url.pathname]) {
    return;
  }

  if (screenshotLimit !== -1 && Object.keys(visited).length >= screenshotLimit) {
    return;
  }

  /* two things
   * - for the CI builds we have extra pathname, this removes that
   * - the screenshot tool doesn't handle subdirectories so we include those in
   *   the file names with ~~ indicating where a directory slash was
   */
  const relativePath = url.pathname.replace(rootPath, '').slice(1);
  // split/join swaps every slash, so pages nested several directories deep
  // still end up with a flat file name
  const filename = relativePath.split('.')[0].split('/').join('~~');
  visited[url.pathname] = [`${filename}_desktop.png`, `${filename}_mobile.png`];

  // registers the task function that runs for each queued job
  await cluster.task(async ({page, data}) => {
    await crawlPage(page, data);
  });

  // passing an object instead of a url string for all the params like storage
  // location, already visited and filename
  cluster.queue({
    filename,
    rootPath,
    storageDirectory,
    url: url.toString(),
    visited
  });
}

/**
 * The logic of page.goto, screenshot, and href walking lives here rather
 * than in walk() because this is the cluster.task() logic, while walk() is
 * cluster management and checking if things are visited.
 *
 * Loads the page, reloading it up to 5 more times while the response is not
 * ok, takes the screenshots, then walks every link that stays on the same
 * host. Same-host links that fall outside rootPath or whose pathname does not
 * contain '.html' are recorded in badUrls instead of being walked.
 *
 * @param {object} page puppeteer page supplied by the cluster
 * @param {object} job the object queued by walk()
 * @param {string} job.filename base file name for the screenshots
 * @param {string} job.rootPath leading part of the pathname shared by the site
 * @param {string} job.storageDirectory directory the screenshots are written to
 * @param {string} job.url url of the page to load
 * @param {object} job.visited visited map handed on to walk()
 */
async function crawlPage(page, {filename, rootPath, storageDirectory, url, visited}) {
  const maxRetries = 5;

  try {
    if (verboseLogMessages) {
      console.log('visiting: ', url);
    }

    let response = await page.goto(url);
    let retries = 0;
    // page.goto can resolve to null, in which case there is no status to
    // check and no reason to retry
    while (response && !response.ok() && retries < maxRetries) {
      if (verboseLogMessages) {
        console.log('trying again', url, filename, rootPath, storageDirectory);
      }
      // reassign (not redeclare) so the loop condition sees the new response
      response = await page.goto(url);
      retries++;
    }

    await screenshot(page, `${storageDirectory}/${filename}`);
  } catch (e) {
    console.log('error with screenshot: ', url.toString(), e);
  }

  const hrefs = await page.$$eval('a[href]', as => as.map(a => a.href));

  const parentHost = new URL(url).host;
  for (const href of hrefs) {
    const linkUrl = new URL(href, page.url());
    // external links are ignored
    if (linkUrl.host !== parentHost) {
      continue;
    }

    // skip walking patterns of URLs that cause issues
    const outsideRoot = rootPath.length > 0 && !href.includes(rootPath);
    if (outsideRoot || !linkUrl.pathname.includes('.html')) {
      badUrls.push({
        parentUrl: url,
        badUrl: href
      });
    } else {
      await walk(href, rootPath, visited, storageDirectory);
    }
  }
}

/**
 * Takes full page screenshots of the page: one at a 1366x784 desktop viewport
 * and one emulating an iPhone 11 for the mobile rendering. Either screenshot
 * is skipped when it has been disabled. The files are named
 * `${filename}_desktop.png` and `${filename}_mobile.png`.
 */
async function screenshot(page, filename) {
  // captures the whole page into the file for the given variant
  const captureFullPage = variant => page.screenshot({
    path: `${filename}_${variant}.png`,
    fullPage: true
  });

  if (!disableDesktopScreenshots) {
    const desktopViewport = {width: 1366, height: 784};
    await page.setViewport(desktopViewport);
    // this short pause seems to handle screenshot issues, might need to
    // increase as we use this
    await page.waitForTimeout(100);
    await captureFullPage('desktop');
  }

  if (!disableMobileScreenshots) {
    await page.emulate(puppeteeriPhone11);
    await captureFullPage('mobile');
  }
}

// setupClusterAndCrawl is the only function this module exports
exports.setupClusterAndCrawl = setupClusterAndCrawl;
Loading

0 comments on commit f38c96d

Please sign in to comment.