From 86692c831bc591098dc7dcdf60491453b742da75 Mon Sep 17 00:00:00 2001
From: Andrei Paraschiv <paraschi@adobe.com>
Date: Wed, 14 Aug 2024 14:09:31 +0300
Subject: [PATCH] feat: canonical url audit (#326)

* feat: canonical url audit

* feat: canonical url audit

* feat: canonical url audit

* feat: canonical url audit

* feat: try catch for getTopPages

* feat: try catch for getTopPages

* feat: update main audit fct

* feat: update main audit fct

* feat: add check for top pages retrieval

* feat: get top pages for site

* feat: get top pages for site

* feat: get top pages for site

* feat: get top pages for site

* feat: check canonical tag

* feat: check canonical tag

* feat: check canonical tag

* feat: check canonical tag

* feat: href attribute check

* feat: use auditbuilder

* feat: check input url

* feat: small fix

* feat: jsdom validation

* feat: adding logs

* feat: adding logs

* feat: sanitize html jsdom

* feat: html content log

* feat: test 200pages

* feat: test 200pages

* feat: test 200pages

* feat: test 5pages without sitemap

* feat: test 1page

* feat: test 1page

* feat: testing with more pages

* feat: testing with more pages

* feat: testing with more pages

* feat: a bit of logs

* feat: log every check

* feat: log every check

* feat: log every check

* feat: log every check

* feat: log every check

* feat: log every check

* feat: rename status check

* feat: change output format

* feat: change output format

* feat: change output format

* feat: retrieve pages and add sitemap check

* feat: retrieve pages and add sitemap check

* feat: retrieve pages and add sitemap check

* feat: retrieve pages and add sitemap check

* feat: retrieve pages and add sitemap check

* feat: retrieve pages and add sitemap check

* feat: retrieve pages and add sitemap check

* feat: pr review

* feat: pr review

* feat: add explanation for failed checks

* feat: add explanation for failed checks

* feat: adding tests

* feat: adding tests

* feat: adding tests

* feat: adding tests

* feat: adding tests

* feat: adding tests

* feat: adding tests

* feat: adding tests

* feat: adding tests

* feat: adding tests

* feat: final test

* feat: final test

* feat: remove some of the comments

* feat: pr review update test

* feat: clean up

* fix: trivial log info update
---
 src/canonical/handler.js      | 535 ++++++++++++++++++++++++++++++
 src/index.js                  |   2 +
 test/audits/canonical.test.js | 595 ++++++++++++++++++++++++++++++++++
 3 files changed, 1132 insertions(+)
 create mode 100644 src/canonical/handler.js
 create mode 100644 test/audits/canonical.test.js

diff --git a/src/canonical/handler.js b/src/canonical/handler.js
new file mode 100644
index 00000000..d3308bfa
--- /dev/null
+++ b/src/canonical/handler.js
@@ -0,0 +1,535 @@
+/*
+ * Copyright 2024 Adobe. All rights reserved.
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License. You may obtain a copy
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
+ * OF ANY KIND, either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+
+import { JSDOM } from 'jsdom';
+import { fetch } from '../support/utils.js';
+import { AuditBuilder } from '../common/audit-builder.js';
+import { noopUrlResolver } from '../common/audit.js';
+
+export const CANONICAL_CHECKS = Object.freeze({ // check id + failure explanation per check
+  CANONICAL_TAG_EXISTS: Object.freeze({ // entries are frozen too: Object.freeze is shallow
+    check: 'canonical-tag-exists',
+    explanation: 'The canonical tag is missing, which can lead to duplicate content issues and negatively affect SEO rankings.',
+  }),
+  CANONICAL_TAG_ONCE: Object.freeze({
+    check: 'canonical-tag-once',
+    explanation: 'Multiple canonical tags detected, which confuses search engines and can dilute page authority.',
+  }),
+  CANONICAL_TAG_NONEMPTY: Object.freeze({
+    check: 'canonical-tag-nonempty',
+    explanation: 'The canonical tag is empty. It should point to the preferred version of the page to avoid content duplication.',
+  }),
+  CANONICAL_TAG_IN_HEAD: Object.freeze({
+    check: 'canonical-tag-in-head',
+    explanation: 'The canonical tag must be placed in the head section of the HTML document to ensure it is recognized by search engines.',
+  }),
+  CANONICAL_URL_STATUS_OK: Object.freeze({
+    check: 'canonical-url-status-ok',
+    explanation: 'The canonical URL should return a 200 status code to ensure it is accessible and indexable by search engines.',
+  }),
+  CANONICAL_URL_NO_REDIRECT: Object.freeze({
+    check: 'canonical-url-no-redirect',
+    explanation: 'The canonical URL should be a direct link without redirects to ensure search engines recognize the intended page.',
+  }),
+  CANONICAL_URL_4XX: Object.freeze({
+    check: 'canonical-url-4xx',
+    explanation: 'The canonical URL returns a 4xx error, indicating it is inaccessible, which can harm SEO visibility.',
+  }),
+  CANONICAL_URL_5XX: Object.freeze({
+    check: 'canonical-url-5xx',
+    explanation: 'The canonical URL returns a 5xx server error, indicating it is temporarily or permanently unavailable, affecting SEO performance.',
+  }),
+  CANONICAL_SELF_REFERENCED: Object.freeze({
+    check: 'canonical-self-referenced',
+    explanation: 'The canonical URL should point to itself to indicate that it is the preferred version of the content.',
+  }),
+  CANONICAL_URL_ABSOLUTE: Object.freeze({
+    check: 'canonical-url-absolute',
+    explanation: 'Canonical URLs must be absolute to avoid ambiguity in URL resolution and ensure proper indexing by search engines.',
+  }),
+  CANONICAL_URL_SAME_DOMAIN: Object.freeze({
+    check: 'canonical-url-same-domain',
+    explanation: 'The canonical URL should match the domain of the page to avoid signaling to search engines that the content is duplicated elsewhere.',
+  }),
+  CANONICAL_URL_SAME_PROTOCOL: Object.freeze({
+    check: 'canonical-url-same-protocol',
+    explanation: 'The canonical URL must use the same protocol (HTTP or HTTPS) as the page to maintain consistency and avoid indexing issues.',
+  }),
+  CANONICAL_URL_LOWERCASED: Object.freeze({
+    check: 'canonical-url-lowercased',
+    explanation: 'Canonical URLs should be in lowercase to prevent duplicate content issues since URLs are case-sensitive.',
+  }),
+  CANONICAL_URL_FETCH_ERROR: Object.freeze({
+    check: 'canonical-url-fetch-error',
+    explanation: 'There was an error fetching the canonical URL, which prevents validation of the canonical tag.',
+  }),
+  CANONICAL_URL_INVALID: Object.freeze({
+    check: 'canonical-url-invalid',
+    explanation: 'The canonical URL is malformed or invalid.',
+  }),
+  TOPPAGES: Object.freeze({
+    check: 'top-pages',
+    explanation: 'No top pages found',
+  }),
+  URL_UNDEFINED: Object.freeze({
+    check: 'url-defined',
+    explanation: 'The URL is undefined or null, which prevents the canonical tag validation process.',
+  }),
+  UNEXPECTED_STATUS_CODE: Object.freeze({
+    check: 'unexpected-status-code',
+    explanation: 'The response returned an unexpected status code, indicating an unforeseen issue with the canonical URL.',
+  }),
+});
+
+/**
+ * Loads the top pages recorded for a site and reduces each entry to its URL.
+ *
+ * @param {Object} dataAccess - Data access layer exposing `getTopPagesForSite`.
+ * @param {string} siteId - Identifier of the site whose top pages are requested.
+ * @param {Object} context - Audit context; not read by this function.
+ * @param {Object} log - Logger providing `info` and `error`.
+ * @returns {Promise<Array<{url: string}>>} One `{ url }` object per top page, or `[]` if none.
+ * @throws Rethrows, after logging, any error raised while loading or mapping the pages.
+ */
+export async function getTopPagesForSiteId(dataAccess, siteId, context, log) {
+  try {
+    const response = await dataAccess.getTopPagesForSite(siteId, 'ahrefs', 'global');
+    log.info('Received top pages response:', JSON.stringify(response, null, 2));
+
+    const pages = response || [];
+    if (pages.length > 0) {
+      const urls = pages.map((page) => ({ url: page.getURL() }));
+      log.info(`Found ${urls.length} top pages`);
+      return urls;
+    }
+
+    log.info('No top pages found');
+    return [];
+  } catch (error) {
+    log.error(`Error retrieving top pages for site ${siteId}: ${error.message}`);
+    throw error;
+  }
+}
+
+/**
+ * Fetches a page and checks the rel="canonical" link tag(s) found in its HTML.
+ *
+ * @param {string} url - Page URL to fetch; a falsy value yields one failed 'url-defined' check.
+ * @param {Object} log - Logger providing `info` and `error`.
+ * @returns {Promise<Object>} `{ canonicalUrl, checks }`; canonicalUrl is null if unresolved.
+ */
+export async function validateCanonicalTag(url, log) {
+  // a falsy URL (undefined, null, '') cannot be fetched: report it and stop
+  if (!url) {
+    const errorMessage = 'URL is undefined or null';
+    log.error(errorMessage);
+    return {
+      canonicalUrl: null,
+      checks: [{
+        check: CANONICAL_CHECKS.URL_UNDEFINED.check,
+        success: false,
+        explanation: CANONICAL_CHECKS.URL_UNDEFINED.explanation,
+      }],
+    };
+  }
+
+  try {
+    log.info(`Fetching URL: ${url}`);
+    const response = await fetch(url);
+    const html = await response.text();
+    const dom = new JSDOM(html);
+    const { document } = dom.window;
+
+    const canonicalLinks = document.querySelectorAll('link[rel="canonical"]');
+    const checks = [];
+    let canonicalUrl = null;
+
+    // Check if any canonical tag exists
+    if (canonicalLinks.length === 0) {
+      checks.push({
+        check: CANONICAL_CHECKS.CANONICAL_TAG_EXISTS.check,
+        success: false,
+        explanation: CANONICAL_CHECKS.CANONICAL_TAG_EXISTS.explanation,
+      });
+      log.info(`No canonical tag found for URL: ${url}`);
+    } else {
+      checks.push({
+        check: CANONICAL_CHECKS.CANONICAL_TAG_EXISTS.check,
+        success: true,
+      });
+      log.info(`Canonical tag exists for URL: ${url}`);
+    }
+
+    // With several tags only 'canonical-tag-once' fails; href and head checks need exactly one
+    if (canonicalLinks.length > 0) {
+      if (canonicalLinks.length > 1) {
+        checks.push({
+          check: CANONICAL_CHECKS.CANONICAL_TAG_ONCE.check,
+          success: false,
+          explanation: CANONICAL_CHECKS.CANONICAL_TAG_ONCE.explanation,
+        });
+        log.info(`Multiple canonical tags found for URL: ${url}`);
+      } else {
+        const canonicalLink = canonicalLinks[0];
+        const href = canonicalLink.getAttribute('href');
+        if (!href) {
+          checks.push({
+            check: CANONICAL_CHECKS.CANONICAL_TAG_NONEMPTY.check,
+            success: false,
+            explanation: CANONICAL_CHECKS.CANONICAL_TAG_NONEMPTY.explanation,
+          });
+          log.info(`Empty canonical tag found for URL: ${url}`);
+        } else {
+          try {
+            canonicalUrl = href.startsWith('/')
+              ? new URL(href, url).toString()
+              : new URL(href).toString();
+            // undo a trailing '/' that parsing added but the href did not contain
+            if (!href.endsWith('/') && canonicalUrl.endsWith('/')) {
+              canonicalUrl = canonicalUrl.substring(0, canonicalUrl.length - 1);
+            }
+
+            checks.push({
+              check: CANONICAL_CHECKS.CANONICAL_TAG_NONEMPTY.check,
+              success: true,
+            });
+            if (canonicalUrl === url) {
+              checks.push({
+                check: CANONICAL_CHECKS.CANONICAL_SELF_REFERENCED.check,
+                success: true,
+              });
+              log.info(`Canonical URL ${canonicalUrl} references itself`);
+            } else {
+              checks.push({
+                check: CANONICAL_CHECKS.CANONICAL_SELF_REFERENCED.check,
+                success: false,
+                explanation: CANONICAL_CHECKS.CANONICAL_SELF_REFERENCED.explanation,
+              });
+              log.info(`Canonical URL ${canonicalUrl} does not reference itself`);
+            }
+          } catch (error) {
+            checks.push({
+              check: CANONICAL_CHECKS.CANONICAL_URL_INVALID.check,
+              success: false,
+              explanation: CANONICAL_CHECKS.CANONICAL_URL_INVALID.explanation,
+            });
+            log.info(`Invalid canonical URL found for page ${url}`);
+          }
+        }
+
+        // Check if canonical link is in the head section
+        if (!canonicalLink.closest('head')) {
+          checks.push({
+            check: CANONICAL_CHECKS.CANONICAL_TAG_IN_HEAD.check,
+            success: false,
+            explanation: CANONICAL_CHECKS.CANONICAL_TAG_IN_HEAD.explanation,
+          });
+          log.info('Canonical tag is not in the head section');
+        } else {
+          checks.push({
+            check: CANONICAL_CHECKS.CANONICAL_TAG_IN_HEAD.check,
+            success: true,
+          });
+        }
+      }
+    }
+
+    log.info(`Checks: ${JSON.stringify(checks)}`);
+    return { canonicalUrl, checks };
+  } catch (error) {
+    const errorMessage = `Error validating canonical tag for ${url}: ${error.message}`;
+    log.error(errorMessage);
+    return {
+      canonicalUrl: null,
+      checks: [{
+        check: CANONICAL_CHECKS.CANONICAL_URL_FETCH_ERROR.check,
+        success: false,
+        explanation: CANONICAL_CHECKS.CANONICAL_URL_FETCH_ERROR.explanation,
+      }],
+    };
+  }
+}
+
+/**
+ * Rates the format of a canonical URL: lowercase, absolute, same protocol and host as baseUrl.
+ *
+ * @param {*} canonicalUrl - Canonical URL; any non-string yields a failed 'url-defined' check.
+ * @param {string} baseUrl - URL whose protocol and hostname the canonical URL must match.
+ * @param {Object} log - Logger providing `info` and `error`.
+ * @returns {Array<Object>} Check results in evaluation order; failures carry an `explanation`.
+ */
+export function validateCanonicalFormat(canonicalUrl, baseUrl, log) {
+  const checks = [];
+  let base;
+
+  try {
+    base = new URL(baseUrl);
+  } catch (error) {
+    log.error(`Invalid URL: ${baseUrl}`);
+    checks.push({
+      check: CANONICAL_CHECKS.URL_UNDEFINED.check,
+      success: false,
+      explanation: CANONICAL_CHECKS.URL_UNDEFINED.explanation,
+    });
+    return checks;
+  }
+
+  // non-strings (incl. null/undefined, which used to throw on .startsWith) cannot be checked
+  if (typeof canonicalUrl !== 'string') {
+    checks.push({
+      check: CANONICAL_CHECKS.URL_UNDEFINED.check,
+      success: false,
+      explanation: CANONICAL_CHECKS.URL_UNDEFINED.explanation,
+    });
+    return checks;
+  }
+
+  // Check if the canonical URL is in lowercase (an empty string is not rated)
+  if (canonicalUrl && canonicalUrl !== canonicalUrl.toLowerCase()) {
+    checks.push({
+      check: CANONICAL_CHECKS.CANONICAL_URL_LOWERCASED.check,
+      success: false,
+      explanation: CANONICAL_CHECKS.CANONICAL_URL_LOWERCASED.explanation,
+    });
+    log.info(`Canonical URL is not lowercased: ${canonicalUrl}`);
+  } else if (canonicalUrl) {
+    checks.push({
+      check: CANONICAL_CHECKS.CANONICAL_URL_LOWERCASED.check,
+      success: true,
+    });
+  }
+
+  // Check if the canonical URL is absolute; the remaining checks need an absolute URL
+  if (!canonicalUrl.startsWith('http://') && !canonicalUrl.startsWith('https://')) {
+    checks.push({
+      check: CANONICAL_CHECKS.CANONICAL_URL_ABSOLUTE.check,
+      success: false,
+      explanation: CANONICAL_CHECKS.CANONICAL_URL_ABSOLUTE.explanation,
+    });
+    log.info('Canonical URL is not absolute');
+    return checks;
+  }
+
+  checks.push({
+    check: CANONICAL_CHECKS.CANONICAL_URL_ABSOLUTE.check,
+    success: true,
+  });
+
+  let url;
+  try {
+    url = new URL(canonicalUrl);
+  } catch (error) {
+    log.error(`Invalid URL: ${canonicalUrl}`);
+    checks.push({
+      check: CANONICAL_CHECKS.URL_UNDEFINED.check,
+      success: false,
+      explanation: CANONICAL_CHECKS.URL_UNDEFINED.explanation,
+    });
+    return checks;
+  }
+
+  // Check if the canonical URL has the same protocol as the base URL
+  if (url.protocol !== base.protocol) {
+    checks.push({
+      check: CANONICAL_CHECKS.CANONICAL_URL_SAME_PROTOCOL.check,
+      success: false,
+      explanation: CANONICAL_CHECKS.CANONICAL_URL_SAME_PROTOCOL.explanation,
+    });
+    log.info(`Canonical URL  ${canonicalUrl} uses a different protocol than base URL ${baseUrl}`);
+  } else {
+    checks.push({
+      check: CANONICAL_CHECKS.CANONICAL_URL_SAME_PROTOCOL.check,
+      success: true,
+    });
+  }
+
+  // Check if the canonical URL has the same domain as the base URL
+  if (url.hostname !== base.hostname) {
+    checks.push({
+      check: CANONICAL_CHECKS.CANONICAL_URL_SAME_DOMAIN.check,
+      success: false,
+      explanation: CANONICAL_CHECKS.CANONICAL_URL_SAME_DOMAIN.explanation,
+    });
+    log.info(`Canonical URL ${canonicalUrl} does not have the same domain as base URL ${baseUrl}`);
+  } else {
+    checks.push({
+      check: CANONICAL_CHECKS.CANONICAL_URL_SAME_DOMAIN.check,
+      success: true,
+    });
+  }
+
+  return checks;
+}
+
+/**
+ * Fetches the canonical URL once and rates the status of the response.
+ *
+ * Nothing in this function calls itself: redirects are reported, not followed. The visited
+ * set only has an effect when a caller passes the same set to several calls.
+ *
+ * @param {string} canonicalUrl - The canonical URL to fetch.
+ * @param {Object} log - Logger providing `info` and `error`.
+ * @param {Set<string>} [visitedUrls=new Set()] - URLs already checked; canonicalUrl is added to
+ *   it, and a URL found in it is reported as a failed no-redirect check without being fetched.
+ * @returns {Promise<Array<Object>>} The check results; failed entries carry an `explanation`.
+ */
+export async function validateCanonicalRecursively(canonicalUrl, log, visitedUrls = new Set()) {
+  const results = [];
+
+  // a URL that is already in the visited set is reported as a redirect loop
+  if (visitedUrls.has(canonicalUrl)) {
+    log.info(`Detected a redirect loop for canonical URL ${canonicalUrl}`);
+    results.push({
+      check: CANONICAL_CHECKS.CANONICAL_URL_NO_REDIRECT.check,
+      success: false,
+      explanation: CANONICAL_CHECKS.CANONICAL_URL_NO_REDIRECT.explanation,
+    });
+    return results;
+  }
+
+  visitedUrls.add(canonicalUrl);
+
+  try {
+    // redirect: 'manual' - redirects are not to be followed, so a 3xx status reaches the checks
+    const response = await fetch(canonicalUrl, { redirect: 'manual' });
+    const { status } = response;
+
+    if (response.ok) {
+      log.info(`Canonical URL is accessible: ${canonicalUrl}, statusCode: ${status}`);
+      results.push({
+        check: CANONICAL_CHECKS.CANONICAL_URL_STATUS_OK.check,
+        success: true,
+      });
+      results.push({
+        check: CANONICAL_CHECKS.CANONICAL_URL_NO_REDIRECT.check,
+        success: true,
+      });
+      return results;
+    }
+
+    // every remaining status maps to exactly one failed check
+    let failure;
+    if ([301, 302, 303, 307, 308].includes(status)) {
+      log.info(`Canonical URL ${canonicalUrl} returned a 3xx status: ${status}`);
+      failure = CANONICAL_CHECKS.CANONICAL_URL_NO_REDIRECT;
+    } else if (status >= 400 && status < 500) {
+      log.info(`Canonical URL ${canonicalUrl} returned a 4xx error: ${status}`);
+      failure = CANONICAL_CHECKS.CANONICAL_URL_4XX;
+    } else if (status >= 500) {
+      log.info(`Canonical URL ${canonicalUrl} returned a 5xx error: ${status} `);
+      failure = CANONICAL_CHECKS.CANONICAL_URL_5XX;
+    } else {
+      log.info(`Unexpected status code ${status} for canonical URL: ${canonicalUrl}`);
+      failure = CANONICAL_CHECKS.UNEXPECTED_STATUS_CODE;
+    }
+    results.push({
+      check: failure.check,
+      success: false,
+      explanation: failure.explanation,
+    });
+  } catch (error) {
+    log.error(`Error fetching canonical URL ${canonicalUrl}: ${error.message}`);
+    results.push({
+      check: CANONICAL_CHECKS.CANONICAL_URL_FETCH_ERROR.check,
+      success: false,
+      explanation: CANONICAL_CHECKS.CANONICAL_URL_FETCH_ERROR.explanation,
+    });
+  }
+
+  return results;
+}
+
+/**
+ * Audits the canonical tags of a site's top pages and groups the check results by check id.
+ *
+ * @param {string} baseURL - Returned as `fullAuditRef`; also the base for the format checks.
+ * @param {Object} context - Audit context providing `log` and `dataAccess`.
+ * @param {Object} site - Site under audit; only `getId()` is called on it.
+ * @returns {Promise<Object>} `{ fullAuditRef, auditResult }`. `auditResult` maps each check id
+ *   to `{ success, explanation, url }` (`success` is false once any page failed, `url` lists
+ *   every page checked), or describes why the audit ended early or failed.
+ */
+export async function canonicalAuditRunner(baseURL, context, site) {
+  const siteId = site.getId();
+  const { log, dataAccess } = context;
+  log.info(`Starting Canonical Audit with siteId: ${JSON.stringify(siteId)}`);
+
+  try {
+    const topPages = await getTopPagesForSiteId(dataAccess, siteId, context, log);
+    log.info(`Top pages for baseURL ${baseURL}: ${JSON.stringify(topPages)}`);
+
+    if (topPages.length === 0) {
+      log.info('No top pages found, ending audit.');
+      return {
+        fullAuditRef: baseURL,
+        auditResult: {
+          check: CANONICAL_CHECKS.TOPPAGES.check,
+          success: false,
+          explanation: CANONICAL_CHECKS.TOPPAGES.explanation,
+        },
+      };
+    }
+
+    const auditPromises = topPages.map(async ({ url }) => {
+      const { canonicalUrl, checks } = await validateCanonicalTag(url, log);
+      if (!canonicalUrl) {
+        return { url, checks };
+      }
+      log.info(`Found Canonical URL: ${canonicalUrl}`);
+      const formatChecks = validateCanonicalFormat(canonicalUrl, baseURL, log);
+      const statusChecks = await validateCanonicalRecursively(canonicalUrl, log);
+      return { url, checks: [...checks, ...formatChecks, ...statusChecks] };
+    });
+
+    const auditResultsArray = await Promise.allSettled(auditPromises);
+    const aggregatedResults = auditResultsArray.reduce((acc, result) => {
+      if (result.status !== 'fulfilled') {
+        log.error(`Canonical checks rejected for a page: ${result.reason}`);
+        return acc;
+      }
+      const { url, checks } = result.value;
+      checks.forEach(({ check: checkType, success, explanation }) => {
+        if (!acc[checkType]) {
+          acc[checkType] = { success: true, url: [] };
+        }
+        // checks expose "explanation" (never "error"); one failing page fails the whole check
+        if (!success && acc[checkType].success) {
+          acc[checkType].success = false;
+          acc[checkType].explanation = explanation;
+        }
+        acc[checkType].url.push(url);
+      });
+      return acc;
+    }, {});
+
+    log.info(`Successfully completed Canonical Audit for site: ${baseURL}`);
+
+    return {
+      fullAuditRef: baseURL,
+      auditResult: aggregatedResults,
+    };
+  } catch (error) {
+    log.error(`Canonical Audit failed for site ${baseURL}: ${error.message}`);
+    return {
+      fullAuditRef: baseURL,
+      auditResult: {
+        error: `Audit failed with error: ${error.message}`,
+        success: false,
+      },
+    };
+  }
+}
+
+export default new AuditBuilder() // default export: the handler registered in src/index.js
+  .withUrlResolver(noopUrlResolver) // presumably leaves the site URL as-is; confirm in audit.js
+  .withRunner(canonicalAuditRunner) // canonicalAuditRunner performs the actual audit
+  .build();
diff --git a/src/index.js b/src/index.js
index 541782a3..a51ef9b0 100644
--- a/src/index.js
+++ b/src/index.js
@@ -23,6 +23,7 @@ import lhsDesktop from './lhs/handler-desktop.js';
 import lhsMobile from './lhs/handler-mobile.js';
 import notfound from './notfound/handler.js';
 import sitemap from './sitemap/handler.js';
+import canonical from './canonical/handler.js';
 import backlinks from './backlinks/handler.js';
 import experimentation from './experimentation/handler.js';
 import conversion from './conversion/handler.js';
@@ -36,6 +37,7 @@ const HANDLERS = {
   'lhs-desktop': lhsDesktop,
   404: notfound,
   sitemap,
+  canonical,
   'broken-backlinks': backlinks,
   experimentation,
   conversion,
diff --git a/test/audits/canonical.test.js b/test/audits/canonical.test.js
new file mode 100644
index 00000000..1583d11c
--- /dev/null
+++ b/test/audits/canonical.test.js
@@ -0,0 +1,595 @@
+/*
+ * Copyright 2024 Adobe. All rights reserved.
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License. You may obtain a copy
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
+ * OF ANY KIND, either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+
+/* eslint-env mocha */
+
+import chai from 'chai';
+import chaiAsPromised from 'chai-as-promised';
+import sinon from 'sinon';
+import sinonChai from 'sinon-chai';
+import nock from 'nock';
+import {
+  getTopPagesForSiteId, validateCanonicalTag, validateCanonicalFormat,
+  validateCanonicalRecursively, canonicalAuditRunner, CANONICAL_CHECKS,
+} from '../../src/canonical/handler.js';
+
+chai.use(sinonChai);
+chai.use(chaiAsPromised);
+const { expect } = chai;
+
+describe('Canonical URL Tests', () => {
+  let log;
+  beforeEach(() => {
+    log = {
+      info: sinon.stub(),
+      error: sinon.stub(),
+    };
+  });
+
+  afterEach(() => {
+    sinon.restore();
+  });
+
+  describe('getTopPagesForSiteId', () => {
+    it('should return top pages for a given site ID', async () => {
+      const dataAccess = {
+        getTopPagesForSite: sinon.stub().resolves([{ getURL: () => 'http://example.com/page1' }]),
+      };
+      const siteId = 'testSiteId';
+      const context = { log };
+
+      const result = await getTopPagesForSiteId(dataAccess, siteId, context, log);
+
+      expect(result).to.deep.equal([{ url: 'http://example.com/page1' }]);
+      expect(log.info).to.have.been.called;
+    });
+
+    it('should handle null result and return an empty array', async () => {
+      const dataAccess = {
+        getTopPagesForSite: sinon.stub().resolves(null),
+      };
+      const siteId = 'testSiteId';
+      const context = { log };
+
+      const result = await getTopPagesForSiteId(dataAccess, siteId, context, log);
+
+      expect(result).to.deep.equal([]);
+      expect(log.info).to.have.been.calledWith('No top pages found');
+    });
+
+    it('should log the error and propagate the exception when retrieving top pages fails', async () => {
+      const dataAccess = {
+        getTopPagesForSite: sinon.stub().rejects(new Error('Test error')),
+      };
+      const siteId = 'testSiteId';
+      const context = { log };
+
+      // rejectedWith fails the test when the call resolves; the former try/catch passed
+      // silently in that case because its only assertion lived in the catch branch
+      await expect(getTopPagesForSiteId(dataAccess, siteId, context, log))
+        .to.be.rejectedWith('Test error');
+
+      expect(dataAccess.getTopPagesForSite).to.have.been.calledWith(siteId, 'ahrefs', 'global');
+      expect(log.error).to.have.been.calledWith('Error retrieving top pages for site testSiteId: Test error');
+    });
+
+    it('should log and return an empty array if no top pages are found', async () => {
+      const dataAccess = {
+        getTopPagesForSite: sinon.stub().resolves([]),
+      };
+      const siteId = 'testSiteId';
+      const context = { log };
+
+      const result = await getTopPagesForSiteId(dataAccess, siteId, context, log);
+
+      expect(result).to.deep.equal([]);
+      expect(log.info).to.have.been.calledWith('No top pages found');
+    });
+  });
+
+  describe('validateCanonicalTag', () => {
+    it('should handle missing canonical tag', async () => {
+      const url = 'http://example.com';
+      const html = '<!DOCTYPE html><html><head></head><body></body></html>';
+      nock('http://example.com').get('/').reply(200, html);
+
+      const result = await validateCanonicalTag(url, log);
+
+      expect(result.canonicalUrl).to.be.null;
+      expect(result.checks).to.deep.include({
+        check: 'canonical-tag-exists',
+        success: false,
+        explanation: CANONICAL_CHECKS.CANONICAL_TAG_EXISTS.explanation,
+      });
+      expect(log.info).to.have.been.called;
+    });
+
+    it('should flag a self-referencing canonical tag as successful', async () => {
+      const url = 'http://example.com';
+      const html = `<html><head><link rel="canonical" href="${url}"></head><body></body></html>`;
+      nock(url).get('/').reply(200, html);
+
+      const result = await validateCanonicalTag(url, log);
+
+      // href has no trailing slash, so the slash added by URL serialisation is trimmed again
+      expect(result.canonicalUrl).to.equal(url);
+      expect(result.checks).to.deep.include({ check: 'canonical-self-referenced', success: true });
+      expect(log.info).to.have.been.calledWith(`Canonical URL ${url} references itself`);
+    });
+
+    it('should return an error when URL is undefined or null', async () => {
+      const result = await validateCanonicalTag(null, log);
+
+      expect(result.canonicalUrl).to.be.null;
+      expect(result.checks).to.deep.include({
+        check: 'url-defined',
+        success: false,
+        explanation: CANONICAL_CHECKS.URL_UNDEFINED.explanation,
+      });
+      expect(log.error).to.have.been.calledWith('URL is undefined or null');
+    });
+
+    it('should handle fetch error', async () => {
+      const url = 'http://example.com';
+      nock('http://example.com').get('/').replyWithError('Test error');
+
+      const result = await validateCanonicalTag(url, log);
+
+      expect(result.canonicalUrl).to.be.null;
+      expect(result.checks).to.deep.include({
+        check: 'canonical-url-fetch-error',
+        success: false,
+        explanation: CANONICAL_CHECKS.CANONICAL_URL_FETCH_ERROR.explanation,
+      });
+    });
+
+    it('should handle invalid canonical URL correctly', async () => {
+      const url = 'http://example.com';
+      const html = '<html><head><link rel="canonical" href="invalid-url"></head><body></body></html>';
+      nock(url).get('/').reply(200, html);
+
+      const result = await validateCanonicalTag(url, log);
+
+      expect(result.checks).to.deep.include({
+        check: 'canonical-url-invalid',
+        success: false,
+        explanation: CANONICAL_CHECKS.CANONICAL_URL_INVALID.explanation,
+      });
+      expect(log.info).to.have.been.calledWith('Invalid canonical URL found for page http://example.com');
+    });
+
+    it('should handle empty canonical tag', async () => {
+      const url = 'http://example.com';
+      const html = '<html><head><link rel="canonical" href=""></head><body></body></html>';
+      nock(url).get('/').reply(200, html);
+
+      const result = await validateCanonicalTag(url, log);
+
+      expect(result.canonicalUrl).to.be.null;
+      expect(result.checks).to.deep.include({
+        check: 'canonical-tag-nonempty',
+        success: false,
+        explanation: CANONICAL_CHECKS.CANONICAL_TAG_NONEMPTY.explanation,
+      });
+      expect(log.info).to.have.been.calledWith(`Empty canonical tag found for URL: ${url}`);
+    });
+
+    it('should handle multiple canonical tags', async () => {
+      const url = 'http://example.com';
+      const html = '<html><head><link rel="canonical" href="http://example.com/page1"><link rel="canonical" href="http://example.com/page2"></head><body></body></html>';
+      nock(url).get('/').reply(200, html);
+
+      const result = await validateCanonicalTag(url, log);
+
+      expect(result.checks).to.deep.include({
+        check: 'canonical-tag-once',
+        success: false,
+        explanation: CANONICAL_CHECKS.CANONICAL_TAG_ONCE.explanation,
+      });
+    });
+
+    it('should fail if the canonical tag is not in the head section', async () => {
+      const url = 'http://example.com';
+      const html = '<html><head></head><body><link rel="canonical" href="http://example.com"></body></html>';
+      nock(url).get('/').reply(200, html);
+
+      const result = await validateCanonicalTag(url, log);
+
+      expect(result.checks).to.deep.include({
+        check: 'canonical-tag-in-head',
+        success: false,
+        explanation: CANONICAL_CHECKS.CANONICAL_TAG_IN_HEAD.explanation,
+      });
+      expect(log.info).to.have.been.calledWith('Canonical tag is not in the head section');
+    });
+  });
+
+  describe('validateCanonicalUrlFormat', () => {
+    it('should validate canonical URL format successfully', () => {
+      const canonicalUrl = 'https://example.com/page';
+      const baseUrl = 'https://example.com';
+      // Canonical and base URL share the https scheme and the example.com host.
+      const result = validateCanonicalFormat(canonicalUrl, baseUrl, log);
+      // The three passing checks must be present; include.members tolerates extra entries.
+      expect(result).to.deep.include.members([
+        { check: 'canonical-url-absolute', success: true },
+        { check: 'canonical-url-same-protocol', success: true },
+        { check: 'canonical-url-same-domain', success: true },
+      ]);
+    });
+
+    it('should handle invalid canonical URL', () => {
+      const canonicalUrl = {};
+      const baseUrl = 'http://example.com';
+      const result = validateCanonicalFormat(canonicalUrl, baseUrl, log);
+      // An object passed as the canonical URL must yield a failed 'url-defined' check.
+      expect(result).to.deep.include.members([{
+        check: 'url-defined',
+        success: false,
+        explanation: CANONICAL_CHECKS.URL_UNDEFINED.explanation,
+      }]);
+    });
+
+    it('should handle invalid base URL', () => {
+      const canonicalUrl = 'https://example.com';
+      const baseUrl = 'invalid-url';
+      const result = validateCanonicalFormat(canonicalUrl, baseUrl, log);
+      // 'invalid-url' as base URL must yield a failed 'url-defined' check and an error log.
+      expect(result).to.deep.include({
+        check: 'url-defined',
+        success: false,
+        explanation: CANONICAL_CHECKS.URL_UNDEFINED.explanation,
+      });
+      expect(log.error).to.have.been.calledWith('Invalid URL: invalid-url');
+    });
+
+    it('should handle non-lowercase canonical URL', () => {
+      const canonicalUrl = 'https://example.com/UpperCase';
+      const baseUrl = 'https://example.com';
+      const result = validateCanonicalFormat(canonicalUrl, baseUrl, log);
+      // A path with uppercase letters must fail the lowercase check and be logged.
+      expect(result).to.deep.include({
+        check: 'canonical-url-lowercased',
+        success: false,
+        explanation: CANONICAL_CHECKS.CANONICAL_URL_LOWERCASED.explanation,
+      });
+      expect(log.info).to.have.been.calledWith('Canonical URL is not lowercased: https://example.com/UpperCase');
+    });
+
+    it('should pass if canonical URL is in lowercase', () => {
+      const canonicalUrl = 'https://example.com/lowercase';
+      const baseUrl = 'https://example.com';
+      // The canonical URL contains no uppercase characters.
+      const result = validateCanonicalFormat(canonicalUrl, baseUrl, log);
+      // deep.include matches whole entries, so the passing check has only these two keys.
+      expect(result).to.deep.include({
+        check: 'canonical-url-lowercased',
+        success: true,
+      });
+    });
+
+    it('should handle redirection scenario and stop at the first redirect', async () => {
+      const canonicalUrl = 'http://example.com/page1';
+      const redirectUrl = 'http://example.com/page2';
+      // page1 answers 301 with a Location header pointing at page2.
+      nock('http://example.com')
+        .get('/page1')
+        .reply(301, null, { Location: redirectUrl });
+      // page2 answers 200, in case the redirect target is requested as well.
+      nock('http://example.com')
+        .get('/page2')
+        .reply(200);
+      // NOTE(review): targets validateCanonicalRecursively despite the enclosing suite's name.
+      const result = await validateCanonicalRecursively(canonicalUrl, log, new Set());
+      // The 301 must surface as a failed 'canonical-url-no-redirect' check.
+      expect(result).to.deep.include.members([
+        {
+          check: 'canonical-url-no-redirect',
+          success: false,
+          explanation: CANONICAL_CHECKS.CANONICAL_URL_NO_REDIRECT.explanation,
+        },
+      ]);
+    });
+
+    it('should handle different domains', () => {
+      const canonicalUrl = 'https://another.com';
+      const baseUrl = 'https://example.com';
+      const result = validateCanonicalFormat(canonicalUrl, baseUrl, log);
+      // another.com vs example.com must fail the same-domain check and log both URLs.
+      expect(result).to.deep.include({
+        check: 'canonical-url-same-domain',
+        success: false,
+        explanation: CANONICAL_CHECKS.CANONICAL_URL_SAME_DOMAIN.explanation,
+      });
+      expect(log.info).to.have.been.calledWith('Canonical URL https://another.com does not have the same domain as base URL https://example.com');
+    });
+
+    it('should handle different protocols', () => {
+      const canonicalUrl = 'https://example.com';
+      const baseUrl = 'http://example.com';
+      const result = validateCanonicalFormat(canonicalUrl, baseUrl, log);
+      // https vs http must fail the check; note the double space in the asserted log text.
+      expect(result).to.deep.include({
+        check: 'canonical-url-same-protocol',
+        success: false,
+        explanation: CANONICAL_CHECKS.CANONICAL_URL_SAME_PROTOCOL.explanation,
+      });
+      expect(log.info).to.have.been.calledWith('Canonical URL  https://example.com uses a different protocol than base URL http://example.com');
+    });
+
+    it('should fail if the canonical URL is not absolute', () => {
+      const canonicalUrl = '/relative/url';
+      const baseUrl = 'http://example.com';
+      // '/relative/url' has neither scheme nor host.
+      const result = validateCanonicalFormat(canonicalUrl, baseUrl, log);
+      // It must be reported as a failed 'canonical-url-absolute' check.
+      expect(result).to.deep.include({
+        check: 'canonical-url-absolute',
+        success: false,
+        explanation: CANONICAL_CHECKS.CANONICAL_URL_ABSOLUTE.explanation,
+      });
+    });
+
+    it('should pass if the canonical URL points to itself', async () => {
+      const url = 'http://example.com';
+      const html = `<html><head><link rel="canonical" href="${url}"></head><body></body></html>`;
+      nock(url).get('/').reply(200, html);
+      // The served page's canonical href is the very URL being validated.
+      const result = await validateCanonicalTag(url, log);
+      // Tag-level checks must pass and the self-reference must be logged at info level.
+      expect(result.checks).to.deep.include.members([
+        {
+          check: 'canonical-tag-nonempty',
+          success: true,
+        },
+        {
+          check: 'canonical-tag-exists',
+          success: true,
+        }]);
+      expect(log.info).to.have.been.calledWith(`Canonical URL ${url} references itself`);
+    });
+
+    it('should handle try-catch for invalid canonical URL', () => {
+      const invalidCanonicalUrl = 'http://%';
+      const baseUrl = 'https://example.com';
+      // NOTE(review): 'http://%' presumably fails URL parsing in the validator - confirm.
+      const result = validateCanonicalFormat(invalidCanonicalUrl, baseUrl, log);
+      // The absolute-URL check is still expected to be recorded as passing.
+      expect(result).to.deep.include.members([{
+        check: CANONICAL_CHECKS.CANONICAL_URL_ABSOLUTE.check,
+        success: true,
+      }]);
+      // The failure itself is reported under the URL_UNDEFINED check.
+      expect(result).to.deep.include.members([{
+        check: CANONICAL_CHECKS.URL_UNDEFINED.check,
+        success: false,
+        explanation: CANONICAL_CHECKS.URL_UNDEFINED.explanation,
+      }]);
+      // The offending URL must be echoed in the error log.
+      expect(log.error).to.have.been.calledWith(`Invalid URL: ${invalidCanonicalUrl}`);
+    });
+
+    it('should fail if the canonical URL does not point to itself', async () => {
+      const url = 'http://example.com';
+      const canonicalUrl = 'http://example.com/other-page';
+      const html = `<html><head><link rel="canonical" href="${canonicalUrl}"></head><body></body></html>`;
+      nock(url).get('/').reply(200, html);
+      // The page served at url declares /other-page as its canonical.
+      const result = await validateCanonicalTag(url, log);
+      // The tag is non-empty, yet the self-reference check must fail and be logged.
+      expect(result.checks).to.deep.include.members([{
+        check: 'canonical-tag-nonempty',
+        success: true,
+      }]);
+      expect(result.checks).to.deep.include.members([{
+        check: 'canonical-self-referenced',
+        success: false,
+        explanation: CANONICAL_CHECKS.CANONICAL_SELF_REFERENCED.explanation,
+      }]);
+      expect(log.info).to.have.been.calledWith(`Canonical URL ${canonicalUrl} does not reference itself`);
+    });
+  });
+
+  describe('validateCanonicalRecursively', () => {
+    it('should validate canonical URL contents successfully', async () => {
+      const canonicalUrl = 'http://example.com/page';
+      nock('http://example.com').get('/page').reply(200);
+      // GET /page is mocked with a bare 200 response.
+      const result = await validateCanonicalRecursively(canonicalUrl, log);
+      // Both the status and the no-redirect checks must be recorded as passing.
+      expect(result).to.deep.include({ check: 'canonical-url-status-ok', success: true });
+      expect(result).to.deep.include({ check: 'canonical-url-no-redirect', success: true });
+    });
+
+    it('should handle a fetch error correctly', async () => {
+      const canonicalUrl = 'http://example.com/fetcherror';
+      nock('http://example.com').get('/fetcherror').replyWithError('Network error');
+      // replyWithError makes the mocked request fail with 'Network error'.
+      const result = await validateCanonicalRecursively(canonicalUrl, log);
+      // Expect the fetch-error check and an error log that carries the message.
+      expect(result).to.deep.include({
+        check: 'canonical-url-fetch-error',
+        success: false,
+        explanation: CANONICAL_CHECKS.CANONICAL_URL_FETCH_ERROR.explanation,
+      });
+      expect(log.error).to.have.been.calledWith(`Error fetching canonical URL ${canonicalUrl}: Network error`);
+    });
+
+    it('should detect and handle redirect loop correctly', async () => {
+      const canonicalUrl = 'http://example.com/redirect-loop';
+      const visitedUrls = new Set([canonicalUrl]);
+      // This test registers no nock interceptor; the URL is already in visitedUrls.
+      const result = await validateCanonicalRecursively(canonicalUrl, log, visitedUrls);
+      // Expect a failed no-redirect check and the redirect-loop info log.
+      expect(result).to.deep.include({
+        check: 'canonical-url-no-redirect',
+        success: false,
+        explanation: CANONICAL_CHECKS.CANONICAL_URL_NO_REDIRECT.explanation,
+      });
+      expect(log.info).to.have.been.calledWith(`Detected a redirect loop for canonical URL ${canonicalUrl}`);
+    });
+
+    it('should handle 4xx error response correctly', async () => {
+      const canonicalUrl = 'http://example.com/404';
+      nock('http://example.com').get('/404').reply(404);
+      // GET /404 is mocked with status 404.
+      const result = await validateCanonicalRecursively(canonicalUrl, log);
+      // Expect the 4xx check to fail and the status code to appear in the info log.
+      expect(result).to.deep.include({
+        check: 'canonical-url-4xx',
+        success: false,
+        explanation: CANONICAL_CHECKS.CANONICAL_URL_4XX.explanation,
+      });
+      expect(log.info).to.have.been.calledWith(`Canonical URL ${canonicalUrl} returned a 4xx error: 404`);
+    });
+
+    it('should handle 5xx error response correctly', async () => {
+      const canonicalUrl = 'http://example.com/500';
+      nock('http://example.com').get('/500').reply(500);
+      // GET /500 is mocked with status 500.
+      const result = await validateCanonicalRecursively(canonicalUrl, log);
+      // Only the failed 5xx check is asserted; no log expectation is made here.
+      expect(result).to.deep.include({
+        check: 'canonical-url-5xx',
+        success: false,
+        explanation: CANONICAL_CHECKS.CANONICAL_URL_5XX.explanation,
+      });
+    });
+
+    it('should correctly resolve relative canonical URL with base URL', async () => {
+      const url = 'https://example.com/some-page';
+      const href = '/canonical-page';
+      const expectedCanonicalUrl = 'https://example.com/canonical-page';
+      // The relative href is expected to resolve against the origin of the page URL.
+      const html = `
+    <html>
+      <head>
+        <link rel="canonical" href="${href}">
+      </head>
+      <body>
+        <h1>Test Page</h1>
+      </body>
+    </html>
+  `;
+      // Serve the markup above for GET /some-page.
+      nock('https://example.com')
+        .get('/some-page')
+        .reply(200, html);
+      // NOTE(review): exercises validateCanonicalTag, not the function this suite is named for.
+      const result = await validateCanonicalTag(url, log);
+
+      // ensure that the resolved canonical URL is correct
+      expect(result.canonicalUrl).to.equal(expectedCanonicalUrl);
+      expect(result.checks).to.deep.include({
+        check: CANONICAL_CHECKS.CANONICAL_TAG_NONEMPTY.check,
+        success: true,
+      });
+      expect(result.checks).to.deep.include({
+        check: CANONICAL_CHECKS.CANONICAL_SELF_REFERENCED.check,
+        success: false,
+        explanation: CANONICAL_CHECKS.CANONICAL_SELF_REFERENCED.explanation,
+      });
+      expect(log.info).to.have.been.calledWith(`Canonical URL ${expectedCanonicalUrl} does not reference itself`);
+    });
+
+    it('should handle unexpected status code response correctly', async () => {
+      const canonicalUrl = 'http://example.com/300';
+      nock('http://example.com').get('/300').reply(300);
+      // GET /300 is mocked with status 300 and no Location header.
+      const result = await validateCanonicalRecursively(canonicalUrl, log);
+      // Expect the 'unexpected-status-code' check to fail and the code to be logged.
+      expect(result).to.deep.include({
+        check: 'unexpected-status-code',
+        success: false,
+        explanation: CANONICAL_CHECKS.UNEXPECTED_STATUS_CODE.explanation,
+      });
+      expect(log.info).to.have.been.calledWith(`Unexpected status code 300 for canonical URL: ${canonicalUrl}`);
+    });
+  });
+
+  describe('canonicalAuditRunner', () => {
+    it('should run canonical audit successfully', async () => {
+      const baseURL = 'http://example.com';
+      const html = `<html><head><link rel="canonical" href="${baseURL}"></head><body></body></html>`;
+      // Both the top page and the site root serve markup whose canonical is baseURL.
+      nock('http://example.com').get('/page1').reply(200, html);
+      nock(baseURL).get('/').reply(200, html);
+      const getTopPagesForSiteStub = sinon.stub().resolves([{ getURL: () => 'http://example.com/page1' }]);
+      // The context carries the logger and a dataAccess stub returning the single top page.
+      const context = {
+        log,
+        dataAccess: {
+          getTopPagesForSite: getTopPagesForSiteStub,
+        },
+      };
+      const site = { getId: () => 'testSiteId' };
+
+      const result = await canonicalAuditRunner(baseURL, context, site);
+      // auditResult must be keyed by exactly these ten check names.
+      expect(result).to.be.an('object');
+      expect(result.auditResult).to.have.all.keys(
+        'canonical-self-referenced',
+        'canonical-tag-exists',
+        'canonical-tag-in-head',
+        'canonical-tag-nonempty',
+        'canonical-url-absolute',
+        'canonical-url-lowercased',
+        'canonical-url-same-domain',
+        'canonical-url-same-protocol',
+        'canonical-url-no-redirect',
+        'canonical-url-status-ok',
+      );
+      expect(getTopPagesForSiteStub).to.have.been.calledOnceWith('testSiteId', 'ahrefs', 'global');
+      expect(log.info).to.have.been.called;
+    });
+
+    it('should return early and log a message when no top pages are found', async () => {
+      const baseURL = 'http://example.com';
+      const context = {
+        log,
+        dataAccess: {
+          getTopPagesForSite: sinon.stub().resolves([]),
+        },
+      };
+      const site = { getId: () => 'testSiteId' };
+      // getTopPagesForSite resolves to an empty list in this scenario.
+      const result = await canonicalAuditRunner(baseURL, context, site);
+      // deep.equal pins the full return value, including the failed 'top-pages' result.
+      expect(result).to.deep.equal({
+        fullAuditRef: baseURL,
+        auditResult: {
+          check: 'top-pages',
+          success: false,
+          explanation: CANONICAL_CHECKS.TOPPAGES.explanation,
+        },
+      });
+      expect(log.info).to.have.been.calledWith('No top pages found, ending audit.');
+    });
+
+    it('should log a simplified error and return a failed audit result if an exception occurs', async () => {
+      const baseURL = 'http://example.com';
+      const context = { log, dataAccess: { getTopPagesForSite: sinon.stub().rejects(new Error('Test Error')) } };
+      const site = { getId: () => 'testSiteId' };
+      // getTopPagesForSite rejects with Error('Test Error'); the runner is awaited normally.
+      const result = await canonicalAuditRunner(baseURL, context, site);
+
+      // verify that the returned audit result indicates a failure with an error message
+      expect(result).to.deep.equal({
+        fullAuditRef: baseURL,
+        auditResult: {
+          error: 'Audit failed with error: Test Error',
+          success: false,
+        },
+      });
+    });
+  });
+});