Refactor breaking the code into multiple files (#2)

* moved cli arg handling and screen diff code to their own files * moving crawling code to its own file * bumping version number
ktabors · Dec 7, 2021 · f38c96d · f38c96d
1 parent 26635b3
commit f38c96d
Show file tree

Hide file tree

Showing 5 changed files with 329 additions and 284 deletions.
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "docs-differ",
-  "version": "1.0.0",
+  "version": "1.0.1",
   "description": "diffs two websites, built for react-spectrum docs site",
   "license": "Apache-2.0",
   "main": "src/index.js",

diff --git a/src/args.js b/src/args.js
@@ -0,0 +1,96 @@
+// command line options (flags looked up in process.argv)
+const baselineCommandParam = '-b'; // followed by the baseline site url
+const currentCommandParam = '-c'; // followed by the current site url
+const screenshotMaxCommandParam = '-s'; // followed by an integer screenshot limit
+const disableMobileCommandParam = '-m'; // turns mobile screenshots off
+const disableDesktopCommandParam = '-d'; // turns desktop screenshots off
+const clusterSizeCommandParam = '-k'; // followed by an integer max cluster concurrency
+const quietLoggingCommandParam = '-q'; // turns verbose log messages off
+
+const defaultValues = { // values used when the matching flag is absent
+  screenshotLimit: -1, // -1 means no limit
+  disableDesktopScreenshots: false,
+  disableMobileScreenshots: false,
+  clusterMaxConcurrency: 10,
+  verboseLogMessages: true
+};
+
+/**
+ * Gets the urls to compare from the command line args.
+ */
+function getUrls() {
+  let myArgs = process.argv.slice(2);
+
+  let urls = [];
+  if (myArgs.includes(baselineCommandParam)) {
+    urls.push(myArgs[myArgs.indexOf(baselineCommandParam) + 1]);
+  }
+  if (myArgs.includes(currentCommandParam)) {
+    urls.push(myArgs[myArgs.indexOf(currentCommandParam) + 1]);
+  }
+
+  return urls;
+}
+
+// used in two places
+function logUsage() {
+  console.log(`Usage: 'node src/index.js ${baselineCommandParam} <baseline site url> ${currentCommandParam} <current site to diff against baseline url>'`);
+  console.log(`Other options include ${disableMobileCommandParam}, ${disableDesktopCommandParam} and ${screenshotMaxCommandParam} <integer>`);
+  console.log(`  ${disableMobileCommandParam} disable mobile screenshots, default false`);
+  console.log(`  ${disableDesktopCommandParam} disable desktop screenshots, default false`);
+  console.log(`  ${screenshotMaxCommandParam} limit screenshots taken for all possible to this number`);
+  console.log(`  ${clusterSizeCommandParam} max cluster size for concurrency, default 10`);
+  console.log(`  ${quietLoggingCommandParam} quiets some log messages`);
+}
+
+/**
+ * Set the command line args:
+ *   - disable mobile screenshots
+ *   - disable desktop screenshots
+ *   - max number of screenshots
+ */
+function processArgs() {
+  let myArgs = process.argv.slice(2);
+  let {
+    screenshotLimit,
+    clusterMaxConcurrency,
+    disableMobileScreenshots,
+    disableDesktopScreenshots,
+    verboseLogMessages
+  } = defaultValues;
+
+  if (myArgs.includes(screenshotMaxCommandParam)) {
+    screenshotLimit = parseInt(myArgs[myArgs.indexOf(screenshotMaxCommandParam) + 1], 10);
+    if (isNaN(screenshotLimit)) {
+      logUsage();
+      process.exit(1);
+    } else if (screenshotLimit <= 0) {
+      screenshotLimit = -1;
+    }
+  }
+  if (myArgs.includes(clusterSizeCommandParam)) {
+    clusterMaxConcurrency = parseInt(myArgs[myArgs.indexOf(clusterSizeCommandParam) + 1], 10);
+    if (isNaN(clusterMaxConcurrency)) {
+      logUsage();
+      process.exit(1);
+    } else if (clusterMaxConcurrency <= 0) {
+      clusterMaxConcurrency = 10;
+    }
+  }
+  if (myArgs.includes(disableMobileCommandParam)) {
+    disableMobileScreenshots = true;
+  }
+  if (myArgs.includes(disableDesktopCommandParam)) {
+    disableDesktopScreenshots = true;
+  }
+  if (myArgs.includes(quietLoggingCommandParam)) {
+    verboseLogMessages = false;
+  }
+
+  return {screenshotLimit, clusterMaxConcurrency, disableMobileScreenshots, disableDesktopScreenshots, verboseLogMessages};
+}
+
+exports.getUrls = getUrls;
+exports.logUsage = logUsage;
+exports.processArgs = processArgs;
+exports.defaultParamValues = defaultValues; // note: exported as defaultParamValues, not under its local name
diff --git a/src/crawl.js b/src/crawl.js
@@ -0,0 +1,194 @@
+const {Cluster} = require('puppeteer-cluster');
+const {defaultParamValues} = require('./args');
+const mkdirp = require('mkdirp');
+const path = require('path');
+const puppeteer = require('puppeteer');
+const URL = require('url').URL;
+
+const puppeteeriPhone11 = puppeteer.devices['iPhone 11']; // device descriptor passed to page.emulate for the mobile screenshot
+
+// global cluster variable to avoid passing it around
+let cluster; // assigned in setupClusterAndCrawl, used by walk
+
+let visitedBaseline = {}; // url pathname -> [desktop, mobile] screenshot file names for the baseline site
+let visitedCurrent = {}; // url pathname -> [desktop, mobile] screenshot file names for the current site
+let badUrls = []; // {parentUrl, badUrl} entries for same-host links that were not walked
+
+// global variables for args; start at the defaults and are overwritten by setupClusterAndCrawl
+let {
+  screenshotLimit,
+  disableDesktopScreenshots,
+  disableMobileScreenshots,
+  verboseLogMessages
+} = defaultParamValues;
+
+/**
+ * Setup puppeteer cluster for parallelization and crawl sites.
+ */
+async function setupClusterAndCrawl({baselineDir, currentDir, urls, ...argValues}) {
+  screenshotLimit = argValues.screenshotLimit;
+  disableDesktopScreenshots = argValues.disableDesktopScreenshots;
+  disableMobileScreenshots = argValues.disableMobileScreenshots;
+  verboseLogMessages = argValues.verboseLogMessages;
+
+  try {
+    cluster = await Cluster.launch({
+      concurrency: Cluster.CONCURRENCY_CONTEXT,
+      maxConcurrency: argValues.clusterMaxConcurrency
+    });
+
+    // Event handler to catch and log cluster task errors
+    cluster.on('taskerror', (err, data) => {
+      console.log(`Error crawling ${data}: ${err.message}`);
+    });
+
+    // triggering the screenshot scrapping of the two sites
+    await walkUrl(urls[0], visitedBaseline, baselineDir);
+    await walkUrl(urls[1], visitedCurrent, currentDir);
+
+    await cluster.idle();
+    await cluster.close();
+    console.log('queue empty and cluster closed');
+    console.timeLog('executionTime');
+
+    // URLs that might be bad
+    if (badUrls.length > 0) {
+      console.log('Are these URLs bad?', badUrls);
+    }
+  } catch (e) {
+    exitCode = 1;
+    throw e;
+  } finally {
+    // cluster cleanup in case there is an exception
+    (await cluster).idle();
+    (await cluster).close();
+  }
+}
+
+/**
+ * Figures out the root URI and recursively walks the site.
+ */
+async function walkUrl(url, visited, storageDirectory) {
+  mkdirp.sync(storageDirectory);
+
+  // last index is to create the root path for CI urls
+  let lastIndex = new URL(url).pathname.lastIndexOf('/');
+  await walk(url, new URL(url).pathname.substring(0, lastIndex), visited, storageDirectory);
+}
+
+/**
+ * Checks is a URL has been visited, if it hasn't it visits it and takes a
+ * screenshot. Visiting also means getting all of its links and calling this
+ * function, walk, on all the links that are within this site. It ignores
+ * external links.
+ */
+async function walk(href, rootPath, visited, storageDirectory) {
+  let url = new URL(href);
+  url.hash = '';
+  if (visited[url.pathname]) {
+    return;
+  }
+
+  if (screenshotLimit !== -1 && Object.keys(visited).length >= screenshotLimit) {
+    return;
+  }
+
+  /* two things
+   * - for the CI builds we have extra pathname, this removes that
+   * - the screenshot tool doesn't handle subdirectories so we include those in
+   *   the file names with ~~ indicating where a directory slash was
+   */
+  let filename = url.pathname.replace(rootPath, '').slice(1).split('.')[0].replace('/', '~~');
+  visited[url.pathname] = [`${filename}_desktop.png`, `${filename}_mobile.png`];
+
+  await cluster.task(async ({ page, data: url }) => {
+    await crawlPage(page, url);
+  });
+
+  // passing an object instead of a url string for all the params like storage
+  // location, already visited and filename
+  cluster.queue({
+    filename: filename,
+    rootPath: rootPath,
+    storageDirectory: storageDirectory,
+    url: url.toString(),
+    visited: visited
+  });
+}
+
+/**
+ * moved the logic of page.goto, screenshot, and href walking out of walk()
+ * because this was is the claster.task() logic and walk is cluster management
+ * and checking if things are visited.
+ */
+async function crawlPage(page, {filename, rootPath, storageDirectory, url, visited}) {
+  let hrefs = [];
+  let i = 0;
+
+  try {
+    if (verboseLogMessages) {
+      console.log('visiting: ', url);
+    }
+    let response = await page.goto(url);
+    while (!response.ok() && i < 5) {
+      if (verboseLogMessages) {
+        console.log('trying again', url, filename, rootPath, storageDirectory);
+      }
+      let response = await page.goto(url);
+      i++;
+    }
+
+    await screenshot(page, `${storageDirectory}/${filename}`);
+  } catch (e) {
+    console.log('error with screenshot: ', url.toString(), e);
+  }
+
+  hrefs = await page.$$eval('a[href]', as => as.map(a => a.href));
+
+  let parentUrl = new URL(url).host;
+  for (let href of hrefs) {
+    let u = new URL(href, page.url());
+    if (u.host === parentUrl) {
+      // skip walking patterns of URLs that cause issues
+      if ((rootPath.length > 0 && href.indexOf(rootPath) === -1) || u.pathname.indexOf('.html') === -1) {
+        badUrls.push({
+          parentUrl: url,
+          badUrl: href
+        })
+      } else {
+        await walk(href, rootPath, visited, storageDirectory);
+      }
+    }
+  }
+}
+
+/**
+ * Talks a full page screenshot of the current most common browser viewport and
+ * a simulated iPhone 11 for the mobile rendering.
+ */
+async function screenshot(page, filename) {
+  if (!disableDesktopScreenshots) {
+    await page.setViewport({
+      width: 1366,
+      height: 784
+    });
+    // this seems to handle screenshot issues, might need to increase as we use this
+    await page.waitForTimeout(100);
+
+    await page.screenshot({
+      path: `${filename}_desktop.png`,
+      fullPage: true
+    });
+  }
+
+  if (!disableMobileScreenshots) {
+    await page.emulate(puppeteeriPhone11);
+
+    await page.screenshot({
+      path: `${filename}_mobile.png`,
+      fullPage: true
+    });
+  }
+}
+
+exports.setupClusterAndCrawl = setupClusterAndCrawl; // the only export; walk, crawlPage and screenshot stay module-private