From 86692c831bc591098dc7dcdf60491453b742da75 Mon Sep 17 00:00:00 2001 From: Andrei Paraschiv Date: Wed, 14 Aug 2024 14:09:31 +0300 Subject: [PATCH] feat: canonical url audit (#326) * feat: canonical url audit * feat: canonical url audit * feat: canonical url audit * feat: canonical url audit * feat: try catch for getTopPages * feat: try catch for getTopPages * feat: update main audit fct * feat: update main audit fct * feat: add check for top pages retrieval * feat: get top pages for site * feat: get top pages for site * feat: get top pages for site * feat: get top pages for site * feat: check canonical tag * feat: check canonical tag * feat: check canonical tag * feat: check canonical tag * feat: href attribute check * feat: use auditbuilder * feat: check input url * feat: small fix * feat: jsdom validation * feat: adding logs * feat: adding logs * feat: sanitize html jsdom * feat: html content log * feat: test 200pages * feat: test 200pages * feat: test 200pages * feat: test 5pages without sitemap * feat: test 1page * feat: test 1page * feat: testing with more pages * feat: testing with more pages * feat: testing with more pages * feat: a bit of logs * feat: log every check * feat: log every check * feat: log every check * feat: log every check * feat: log every check * feat: log every check * feat: rename status check * feat: change output format * feat: change output format * feat: change output format * feat: retrieve pages and add sitemap check * feat: retrieve pages and add sitemap check * feat: retrieve pages and add sitemap check * feat: retrieve pages and add sitemap check * feat: retrieve pages and add sitemap check * feat: retrieve pages and add sitemap check * feat: retrieve pages and add sitemap check * feat: pr review * feat: pr review * feat: add explanation for failed checks * feat: add explanation for failed checks * feat: adding tests * feat: adding tests * feat: adding tests * feat: adding tests * feat: adding tests * feat: adding tests * feat: 
adding tests * feat: adding tests * feat: adding tests * feat: adding tests * feat: final test * feat: final test * feat: remove some of the comments * feat: pr review update test * feat: clean up * fix: trivial log info update --- src/canonical/handler.js | 535 ++++++++++++++++++++++++++++++ src/index.js | 2 + test/audits/canonical.test.js | 595 ++++++++++++++++++++++++++++++++++ 3 files changed, 1132 insertions(+) create mode 100644 src/canonical/handler.js create mode 100644 test/audits/canonical.test.js diff --git a/src/canonical/handler.js b/src/canonical/handler.js new file mode 100644 index 00000000..d3308bfa --- /dev/null +++ b/src/canonical/handler.js @@ -0,0 +1,535 @@ +/* + * Copyright 2024 Adobe. All rights reserved. + * This file is licensed to you under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. You may obtain a copy + * of the License at http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS + * OF ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. 
+ */
+
+import { JSDOM } from 'jsdom';
+import { fetch } from '../support/utils.js';
+import { AuditBuilder } from '../common/audit-builder.js';
+import { noopUrlResolver } from '../common/audit.js';
+
+/**
+ * Catalogue of the checks the canonical audit can report.
+ *
+ * Each entry pairs a `check` identifier (the value emitted in audit results) with the
+ * `explanation` text that is attached to a result when that check fails.
+ * The outer object is frozen, so entries cannot be added, removed or replaced;
+ * `Object.freeze` is shallow, so the nested entry objects themselves are not frozen.
+ */
+export const CANONICAL_CHECKS = Object.freeze({
+  CANONICAL_TAG_EXISTS: {
+    check: 'canonical-tag-exists',
+    explanation: 'The canonical tag is missing, which can lead to duplicate content issues and negatively affect SEO rankings.',
+  },
+  CANONICAL_TAG_ONCE: {
+    check: 'canonical-tag-once',
+    explanation: 'Multiple canonical tags detected, which confuses search engines and can dilute page authority.',
+  },
+  CANONICAL_TAG_NONEMPTY: {
+    check: 'canonical-tag-nonempty',
+    explanation: 'The canonical tag is empty. It should point to the preferred version of the page to avoid content duplication.',
+  },
+  CANONICAL_TAG_IN_HEAD: {
+    check: 'canonical-tag-in-head',
+    explanation: 'The canonical tag must be placed in the head section of the HTML document to ensure it is recognized by search engines.',
+  },
+  CANONICAL_URL_STATUS_OK: {
+    check: 'canonical-url-status-ok',
+    explanation: 'The canonical URL should return a 200 status code to ensure it is accessible and indexable by search engines.',
+  },
+  CANONICAL_URL_NO_REDIRECT: {
+    check: 'canonical-url-no-redirect',
+    explanation: 'The canonical URL should be a direct link without redirects to ensure search engines recognize the intended page.',
+  },
+  CANONICAL_URL_4XX: {
+    check: 'canonical-url-4xx',
+    explanation: 'The canonical URL returns a 4xx error, indicating it is inaccessible, which can harm SEO visibility.',
+  },
+  CANONICAL_URL_5XX: {
+    check: 'canonical-url-5xx',
+    explanation: 'The canonical URL returns a 5xx server error, indicating it is temporarily or permanently unavailable, affecting SEO performance.',
+  },
+  CANONICAL_SELF_REFERENCED: {
+    check: 'canonical-self-referenced',
+    explanation: 'The canonical URL should point to itself to indicate that it is the preferred version of the content.',
+  },
+  CANONICAL_URL_ABSOLUTE: {
+    check: 'canonical-url-absolute',
+    explanation: 'Canonical URLs must be absolute to avoid ambiguity in URL resolution and ensure proper indexing by search engines.',
+  },
+  CANONICAL_URL_SAME_DOMAIN: {
+    check: 'canonical-url-same-domain',
+    explanation: 'The canonical URL should match the domain of the page to avoid signaling to search engines that the content is duplicated elsewhere.',
+  },
+  CANONICAL_URL_SAME_PROTOCOL: {
+    check: 'canonical-url-same-protocol',
+    explanation: 'The canonical URL must use the same protocol (HTTP or HTTPS) as the page to maintain consistency and avoid indexing issues.',
+  },
+  CANONICAL_URL_LOWERCASED: {
+    check: 'canonical-url-lowercased',
+    explanation: 'Canonical URLs should be in lowercase to prevent duplicate content issues since URLs are case-sensitive.',
+  },
+  CANONICAL_URL_FETCH_ERROR: {
+    check: 'canonical-url-fetch-error',
+    explanation: 'There was an error fetching the canonical URL, which prevents validation of the canonical tag.',
+  },
+  CANONICAL_URL_INVALID: {
+    check: 'canonical-url-invalid',
+    explanation: 'The canonical URL is malformed or invalid.',
+  },
+  TOPPAGES: {
+    check: 'top-pages',
+    explanation: 'No top pages found',
+  },
+  URL_UNDEFINED: {
+    check: 'url-defined',
+    explanation: 'The URL is undefined or null, which prevents the canonical tag validation process.',
+  },
+  UNEXPECTED_STATUS_CODE: {
+    check: 'unexpected-status-code',
+    explanation: 'The response returned an unexpected status code, indicating an unforeseen issue with the canonical URL.',
+  },
+});
+
+/**
+ * Retrieves the top pages for a given site.
+ *
+ * Queries `dataAccess.getTopPagesForSite(siteId, 'ahrefs', 'global')` and maps every
+ * returned page to a plain `{ url }` object built from `page.getURL()`.
+ *
+ * @param {Object} dataAccess - Data access object; must expose `getTopPagesForSite`.
+ * @param {string} siteId - The ID of the site to retrieve the top pages for.
+ * @param {Object} context - The context object (not read by this function).
+ * @param {Object} log - The logging object to log information.
+ * @returns {Promise<Array<{url: string}>>} A promise that resolves to one `{ url }`
+ * entry per top page, or to an empty array when no top pages are found.
+ * @throws Re-throws, after logging it, any error raised while retrieving the pages.
+ */
+export async function getTopPagesForSiteId(dataAccess, siteId, context, log) {
+  try {
+    const response = await dataAccess.getTopPagesForSite(siteId, 'ahrefs', 'global');
+    log.info('Received top pages response:', JSON.stringify(response, null, 2));
+
+    const pages = response || [];
+    // Written as a negated `>` so that a value without a numeric length also takes
+    // the "nothing found" path.
+    if (!(pages.length > 0)) {
+      log.info('No top pages found');
+      return [];
+    }
+
+    const urls = pages.map((entry) => ({ url: entry.getURL() }));
+    log.info(`Found ${urls.length} top pages`);
+    return urls;
+  } catch (error) {
+    log.error(`Error retrieving top pages for site ${siteId}: ${error.message}`);
+    throw error;
+  }
+}
+
+/**
+ * Fetches a page and inspects its `<link rel="canonical">` tag.
+ *
+ * Reports, in this order and only as far as each applies: tag exists, tag appears
+ * only once, href is non-empty, href parses as a URL, canonical URL equals the page
+ * URL, and the tag sits inside `<head>`. Any error thrown while fetching or parsing
+ * the page is converted into a single `canonical-url-fetch-error` result.
+ *
+ * @param {string} url - The URL of the page whose canonical tag is validated.
+ * @param {Object} log - The logging object to log information.
+ * @returns {Promise<Object>} `{ canonicalUrl, checks }`, where `canonicalUrl` is the
+ * resolved canonical URL (or `null` when none could be determined) and `checks` is
+ * the array of check results.
+ */
+export async function validateCanonicalTag(url, log) {
+  // Builders for the two result shapes; passing results carry no explanation.
+  const pass = ({ check }) => ({ check, success: true });
+  const fail = ({ check, explanation }) => ({ check, success: false, explanation });
+
+  // in case of undefined or null URL in the 200 top pages list
+  if (!url) {
+    log.error('URL is undefined or null');
+    return {
+      canonicalUrl: null,
+      checks: [fail(CANONICAL_CHECKS.URL_UNDEFINED)],
+    };
+  }
+
+  try {
+    log.info(`Fetching URL: ${url}`);
+    const response = await fetch(url);
+    const markup = await response.text();
+    const { document } = new JSDOM(markup).window;
+    const tags = document.querySelectorAll('link[rel="canonical"]');
+
+    const checks = [];
+    let canonicalUrl = null;
+
+    if (tags.length === 0) {
+      checks.push(fail(CANONICAL_CHECKS.CANONICAL_TAG_EXISTS));
+      log.info(`No canonical tag found for URL: ${url}`);
+    } else {
+      checks.push(pass(CANONICAL_CHECKS.CANONICAL_TAG_EXISTS));
+      log.info(`Canonical tag exists for URL: ${url}`);
+
+      if (tags.length > 1) {
+        // With several tags there is no single href to inspect, so stop here.
+        checks.push(fail(CANONICAL_CHECKS.CANONICAL_TAG_ONCE));
+        log.info(`Multiple canonical tags found for URL: ${url}`);
+      } else {
+        const tag = tags[0];
+        const href = tag.getAttribute('href');
+
+        if (href) {
+          try {
+            // Only root-relative hrefs are resolved against the page URL; anything
+            // else has to parse as a URL on its own.
+            const resolved = href.startsWith('/') ? new URL(href, url) : new URL(href);
+            canonicalUrl = resolved.toString();
+
+            // URL serialisation may append a "/" that the href did not contain.
+            if (canonicalUrl.endsWith('/') && !href.endsWith('/')) {
+              canonicalUrl = canonicalUrl.slice(0, -1);
+            }
+
+            checks.push(pass(CANONICAL_CHECKS.CANONICAL_TAG_NONEMPTY));
+
+            const selfReferenced = canonicalUrl === url;
+            checks.push(selfReferenced
+              ? pass(CANONICAL_CHECKS.CANONICAL_SELF_REFERENCED)
+              : fail(CANONICAL_CHECKS.CANONICAL_SELF_REFERENCED));
+            log.info(selfReferenced
+              ? `Canonical URL ${canonicalUrl} references itself`
+              : `Canonical URL ${canonicalUrl} does not reference itself`);
+          } catch (error) {
+            checks.push(fail(CANONICAL_CHECKS.CANONICAL_URL_INVALID));
+            log.info(`Invalid canonical URL found for page ${url}`);
+          }
+        } else {
+          checks.push(fail(CANONICAL_CHECKS.CANONICAL_TAG_NONEMPTY));
+          log.info(`Empty canonical tag found for URL: ${url}`);
+        }
+
+        // Check if canonical link is in the head section
+        if (tag.closest('head')) {
+          checks.push(pass(CANONICAL_CHECKS.CANONICAL_TAG_IN_HEAD));
+        } else {
+          checks.push(fail(CANONICAL_CHECKS.CANONICAL_TAG_IN_HEAD));
+          log.info('Canonical tag is not in the head section');
+        }
+      }
+    }
+
+    log.info(`Checks: ${JSON.stringify(checks)}`);
+    return { canonicalUrl, checks };
+  } catch (error) {
+    log.error(`Error validating canonical tag for ${url}: ${error.message}`);
+    return {
+      canonicalUrl: null,
+      checks: [fail(CANONICAL_CHECKS.CANONICAL_URL_FETCH_ERROR)],
+    };
+  }
+}
+
+/**
+ * Validates the format of a canonical URL against a base URL.
+ *
+ * @param {string} canonicalUrl - The canonical URL to validate.
+ * @param {string} baseUrl - The base URL to compare against.
+ * @param {Object} log - The logging object to log information.
+ * @returns {Array<Object>} Array of check results.
+ */
+export function validateCanonicalFormat(canonicalUrl, baseUrl, log) {
+  const checks = [];
+  let base;
+
+  try {
+    base = new URL(baseUrl);
+  } catch (error) {
+    log.error(`Invalid URL: ${baseUrl}`);
+    checks.push({
+      check: CANONICAL_CHECKS.URL_UNDEFINED.check,
+      success: false,
+      explanation: CANONICAL_CHECKS.URL_UNDEFINED.explanation,
+    });
+    return checks;
+  }
+
+  // Anything that is not a string (null, undefined, objects, numbers, ...) cannot be
+  // inspected with the string operations below, so it is reported as an undefined
+  // URL instead of being allowed to throw a TypeError.
+  if (typeof canonicalUrl !== 'string') {
+    log.error('Canonical URL is undefined or not a string');
+    checks.push({
+      check: CANONICAL_CHECKS.URL_UNDEFINED.check,
+      success: false,
+      explanation: CANONICAL_CHECKS.URL_UNDEFINED.explanation,
+    });
+    return checks;
+  }
+
+  // Check if the canonical URL is in lowercase. An empty string is skipped here; it
+  // is rejected by the absolute-URL check below.
+  if (canonicalUrl.length > 0) {
+    if (canonicalUrl !== canonicalUrl.toLowerCase()) {
+      checks.push({
+        check: CANONICAL_CHECKS.CANONICAL_URL_LOWERCASED.check,
+        success: false,
+        explanation: CANONICAL_CHECKS.CANONICAL_URL_LOWERCASED.explanation,
+      });
+      log.info(`Canonical URL is not lowercased: ${canonicalUrl}`);
+    } else {
+      checks.push({
+        check: CANONICAL_CHECKS.CANONICAL_URL_LOWERCASED.check,
+        success: true,
+      });
+    }
+  }
+
+  // Check if the canonical URL is absolute
+  if (!canonicalUrl.startsWith('http://') && !canonicalUrl.startsWith('https://')) {
+    checks.push({
+      check: CANONICAL_CHECKS.CANONICAL_URL_ABSOLUTE.check,
+      success: false,
+      explanation: CANONICAL_CHECKS.CANONICAL_URL_ABSOLUTE.explanation,
+    });
+    log.info('Canonical URL is not absolute');
+    return checks;
+  }
+
+  checks.push({
+    check: CANONICAL_CHECKS.CANONICAL_URL_ABSOLUTE.check,
+    success: true,
+  });
+
+  let url;
+  try {
+    url = new URL(canonicalUrl);
+  } catch (error) {
+    log.error(`Invalid URL: ${canonicalUrl}`);
+    checks.push({
+      check: CANONICAL_CHECKS.URL_UNDEFINED.check,
+      success: false,
+      explanation: CANONICAL_CHECKS.URL_UNDEFINED.explanation,
+    });
+    return checks;
+  }
+
+  // Check if the canonical URL has the same protocol as the base URL
+  if (url.protocol !== base.protocol) {
+    checks.push({
+      check: CANONICAL_CHECKS.CANONICAL_URL_SAME_PROTOCOL.check,
+      success: false,
+      explanation: CANONICAL_CHECKS.CANONICAL_URL_SAME_PROTOCOL.explanation,
+    });
+    log.info(`Canonical URL ${canonicalUrl} uses a different protocol than base URL ${baseUrl}`);
+  } else {
+    checks.push({
+      check: CANONICAL_CHECKS.CANONICAL_URL_SAME_PROTOCOL.check,
+      success: true,
+    });
+  }
+
+  // Check if the canonical URL has the same domain as the base URL
+  if (url.hostname !== base.hostname) {
+    checks.push({
+      check: CANONICAL_CHECKS.CANONICAL_URL_SAME_DOMAIN.check,
+      success: false,
+      explanation: CANONICAL_CHECKS.CANONICAL_URL_SAME_DOMAIN.explanation,
+    });
+    log.info(`Canonical URL ${canonicalUrl} does not have the same domain as base URL ${baseUrl}`);
+  } else {
+    checks.push({
+      check: CANONICAL_CHECKS.CANONICAL_URL_SAME_DOMAIN.check,
+      success: true,
+    });
+  }
+
+  return checks;
+}
+
+/**
+ * Validates that a canonical URL can be fetched directly.
+ *
+ * Despite its name, this function performs a single request and never calls itself:
+ * the URL is fetched with `redirect: 'manual'`, so a redirect response is reported
+ * as a failed check instead of being followed.
+ *
+ * @param {string} canonicalUrl - The canonical URL to validate.
+ * @param {Object} log - The logging object to log information.
+ * @param {Set} [visitedUrls=new Set()] - URLs that were already checked; a URL that
+ * is already in the set is reported as a redirect loop without being fetched.
+ * @returns {Promise<Array<Object>>} A promise that resolves to the array of check
+ * results; fetch errors are reported as a failed check rather than thrown.
+ */
+export async function validateCanonicalRecursively(canonicalUrl, log, visitedUrls = new Set()) {
+  const checks = [];
+
+  // Check for redirect loops
+  if (visitedUrls.has(canonicalUrl)) {
+    log.info(`Detected a redirect loop for canonical URL ${canonicalUrl}`);
+    checks.push({
+      check: CANONICAL_CHECKS.CANONICAL_URL_NO_REDIRECT.check,
+      success: false,
+      explanation: CANONICAL_CHECKS.CANONICAL_URL_NO_REDIRECT.explanation,
+    });
+    return checks;
+  }
+
+  // Add the current URL to the visited set
+  visitedUrls.add(canonicalUrl);
+
+  try {
+    // Redirects are not followed, so a 3xx answer is visible here as a status code.
+    const response = await fetch(canonicalUrl, { redirect: 'manual' });
+    if (response.ok) {
+      log.info(`Canonical URL is accessible: ${canonicalUrl}, statusCode: ${response.status}`);
+      checks.push({
+        check: CANONICAL_CHECKS.CANONICAL_URL_STATUS_OK.check,
+        success: true,
+      });
+      checks.push({
+        check: CANONICAL_CHECKS.CANONICAL_URL_NO_REDIRECT.check,
+        success: true,
+      });
+    } else if ([301, 302, 303, 307, 308].includes(response.status)) {
+      log.info(`Canonical URL ${canonicalUrl} returned a 3xx status: ${response.status}`);
+      checks.push({
+        check: CANONICAL_CHECKS.CANONICAL_URL_NO_REDIRECT.check,
+        success: false,
+        explanation: CANONICAL_CHECKS.CANONICAL_URL_NO_REDIRECT.explanation,
+      });
+    } else if (response.status >= 400 && response.status < 500) {
+      log.info(`Canonical URL ${canonicalUrl} returned a 4xx error: ${response.status}`);
+      checks.push({
+        check: CANONICAL_CHECKS.CANONICAL_URL_4XX.check,
+        success: false,
+        explanation: CANONICAL_CHECKS.CANONICAL_URL_4XX.explanation,
+      });
+    } else if (response.status >= 500) {
+      log.info(`Canonical URL ${canonicalUrl} returned a 5xx error: ${response.status} `);
+      checks.push({
+        check: CANONICAL_CHECKS.CANONICAL_URL_5XX.check,
+        success: false,
+        explanation: CANONICAL_CHECKS.CANONICAL_URL_5XX.explanation,
+      });
+    } else {
+      // Any remaining status (1xx, or a 3xx outside the list above).
+      log.info(`Unexpected status code ${response.status} for canonical URL: ${canonicalUrl}`);
+      checks.push({
+        check: CANONICAL_CHECKS.UNEXPECTED_STATUS_CODE.check,
+        success: false,
+        explanation: CANONICAL_CHECKS.UNEXPECTED_STATUS_CODE.explanation,
+      });
+    }
+  } catch (error) {
+    log.error(`Error fetching canonical URL ${canonicalUrl}: ${error.message}`);
+    checks.push({
+      check: CANONICAL_CHECKS.CANONICAL_URL_FETCH_ERROR.check,
+      success: false,
+      explanation: CANONICAL_CHECKS.CANONICAL_URL_FETCH_ERROR.explanation,
+    });
+  }
+
+  return checks;
+}
+
+/**
+ * Audits the canonical URLs of a site's top pages.
+ *
+ * For every top page the canonical tag is validated; when a canonical URL could be
+ * resolved, its format and its reachability are validated as well. The per-page
+ * results are then aggregated by check identifier.
+ *
+ * @param {string} baseURL - Base URL of the audited site; returned as `fullAuditRef`
+ * and used as the reference URL for the canonical format checks.
+ * @param {Object} context - The context object; `log` and `dataAccess` are read from it.
+ * @param {Object} site - The audited site; must expose `getId()`.
+ * @returns {Promise<Object>} `{ fullAuditRef, auditResult }`. On a completed audit,
+ * `auditResult` maps each check identifier to `{ success, explanation, url }`:
+ * `success` is `false` as soon as one page failed the check, `explanation` is present
+ * only for failing checks, and `url` lists every page the check was evaluated on.
+ * When no top pages exist, or the audit itself throws, `auditResult` is a single
+ * failure object instead.
+ */
+export async function canonicalAuditRunner(baseURL, context, site) {
+  const siteId = site.getId();
+  const { log, dataAccess } = context;
+  log.info(`Starting Canonical Audit with siteId: ${JSON.stringify(siteId)}`);
+
+  try {
+    const topPages = await getTopPagesForSiteId(dataAccess, siteId, context, log);
+    log.info(`Top pages for baseURL ${baseURL}: ${JSON.stringify(topPages)}`);
+
+    if (topPages.length === 0) {
+      log.info('No top pages found, ending audit.');
+      return {
+        fullAuditRef: baseURL,
+        auditResult: {
+          check: CANONICAL_CHECKS.TOPPAGES.check,
+          success: false,
+          explanation: CANONICAL_CHECKS.TOPPAGES.explanation,
+        },
+      };
+    }
+
+    const auditPromises = topPages.map(async (page) => {
+      const { url } = page;
+      const checks = [];
+
+      const { canonicalUrl, checks: canonicalTagChecks } = await validateCanonicalTag(url, log);
+      checks.push(...canonicalTagChecks);
+
+      if (canonicalUrl) {
+        log.info(`Found Canonical URL: ${canonicalUrl}`);
+
+        const urlFormatChecks = validateCanonicalFormat(canonicalUrl, baseURL, log);
+        checks.push(...urlFormatChecks);
+
+        const urlContentCheck = await validateCanonicalRecursively(canonicalUrl, log);
+        checks.push(...urlContentCheck);
+      }
+      return { url, checks };
+    });
+
+    const auditResultsArray = await Promise.allSettled(auditPromises);
+    const aggregatedResults = auditResultsArray.reduce((acc, result) => {
+      if (result.status !== 'fulfilled') {
+        // The validators report their own failures as checks, so a rejection is
+        // unexpected; log it instead of dropping the page silently.
+        const { reason } = result;
+        const message = reason instanceof Error ? reason.message : String(reason);
+        log.error(`Canonical audit failed for a page: ${message}`);
+        return acc;
+      }
+
+      const { url, checks } = result.value;
+      checks.forEach(({ check: checkType, success, explanation }) => {
+        if (!acc[checkType]) {
+          acc[checkType] = { success: true, url: [] };
+        }
+        const entry = acc[checkType];
+        if (!success) {
+          // One failing page fails the whole check; the first failure's explanation
+          // is kept so the result says why.
+          entry.success = false;
+          if (entry.explanation === undefined) {
+            entry.explanation = explanation;
+          }
+        }
+        entry.url.push(url);
+      });
+      return acc;
+    }, {});
+
+    log.info(`Successfully completed Canonical Audit for site: ${baseURL}`);
+
+    return {
+      fullAuditRef: baseURL,
+      auditResult: aggregatedResults,
+    };
+  } catch (error) {
+    log.error(`Canonical Audit failed for site ${baseURL}: ${error.message}`);
+    return {
+      fullAuditRef: baseURL,
+      auditResult: {
+        error: `Audit failed with error: ${error.message}`,
+        success: false,
+      },
+    };
+  }
+}
+
+export default new AuditBuilder()
+  .withUrlResolver(noopUrlResolver)
+  .withRunner(canonicalAuditRunner)
+  .build();
diff --git a/src/index.js b/src/index.js
index 541782a3..a51ef9b0 100644
--- a/src/index.js
+++ b/src/index.js
@@ -23,6 +23,7 @@ import lhsDesktop from './lhs/handler-desktop.js';
 import lhsMobile from './lhs/handler-mobile.js';
 import notfound from './notfound/handler.js';
 import sitemap from './sitemap/handler.js';
+import canonical from './canonical/handler.js';
 import backlinks from './backlinks/handler.js';
 import experimentation from './experimentation/handler.js';
 import conversion from './conversion/handler.js';
@@ -36,6 +37,7 @@ const HANDLERS = {
   'lhs-desktop': lhsDesktop,
   404: notfound,
   sitemap,
+  canonical,
   'broken-backlinks': backlinks,
   experimentation,
   conversion,
diff --git a/test/audits/canonical.test.js b/test/audits/canonical.test.js
new file mode 100644
index 00000000..1583d11c
--- /dev/null
+++ b/test/audits/canonical.test.js
@@ -0,0 +1,595 @@
+/*
+ * Copyright 2024 Adobe. All rights reserved.
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. You may obtain a copy + * of the License at http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS + * OF ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ + +/* eslint-env mocha */ + +import chai from 'chai'; +import chaiAsPromised from 'chai-as-promised'; +import sinon from 'sinon'; +import sinonChai from 'sinon-chai'; +import nock from 'nock'; +import { + getTopPagesForSiteId, validateCanonicalTag, validateCanonicalFormat, + validateCanonicalRecursively, canonicalAuditRunner, CANONICAL_CHECKS, +} from '../../src/canonical/handler.js'; + +chai.use(sinonChai); +chai.use(chaiAsPromised); +const { expect } = chai; + +describe('Canonical URL Tests', () => { + let log; + beforeEach(() => { + log = { + info: sinon.stub(), + error: sinon.stub(), + }; + }); + + afterEach(() => { + sinon.restore(); + }); + + describe('getTopPagesForSiteId', () => { + it('should return top pages for a given site ID', async () => { + const dataAccess = { + getTopPagesForSite: sinon.stub().resolves([{ getURL: () => 'http://example.com/page1' }]), + }; + const siteId = 'testSiteId'; + const context = { log }; + + const result = await getTopPagesForSiteId(dataAccess, siteId, context, log); + + expect(result).to.deep.equal([{ url: 'http://example.com/page1' }]); + expect(log.info).to.have.been.called; + }); + + it('should handle null result and return an empty array', async () => { + const dataAccess = { + getTopPagesForSite: sinon.stub().resolves(null), + }; + const siteId = 'testSiteId'; + const context = { log }; + + const result = await getTopPagesForSiteId(dataAccess, 
siteId, context, log); + + expect(result).to.deep.equal([]); + expect(log.info).to.have.been.calledWith('No top pages found'); + }); + + it('should log the error and propagate the exception when retrieving top pages fails', async () => { + const dataAccess = { + getTopPagesForSite: sinon.stub().rejects(new Error('Test error')), + }; + const siteId = 'testSiteId'; + const context = { log }; + + try { + await getTopPagesForSiteId(dataAccess, siteId, context, log); + } catch (error) { + expect(error.message).to.equal('Test error'); + } + + expect(log.error).to.have.been.calledWith('Error retrieving top pages for site testSiteId: Test error'); + }); + + it('should log and return an empty array if no top pages are found', async () => { + const dataAccess = { + getTopPagesForSite: sinon.stub().resolves([]), + }; + const siteId = 'testSiteId'; + const context = { log }; + + const result = await getTopPagesForSiteId(dataAccess, siteId, context, log); + + expect(result).to.deep.equal([]); + expect(log.info).to.have.been.calledWith('No top pages found'); + }); + }); + + describe('validateCanonicalTag', () => { + it('should handle missing canonical tag', async () => { + const url = 'http://example.com'; + const html = ''; + nock('http://example.com').get('/').reply(200, html); + + const result = await validateCanonicalTag(url, log); + + expect(result.canonicalUrl).to.be.null; + expect(result.checks).to.deep.include({ + check: 'canonical-tag-exists', + success: false, + explanation: CANONICAL_CHECKS.CANONICAL_TAG_EXISTS.explanation, + }); + expect(log.info).to.have.been.called; + }); + + it('should handle invalid base URL correctly', () => { + const canonicalUrl = 'https://example.com'; + const baseUrl = 'invalid-url'; + const result = validateCanonicalFormat(canonicalUrl, baseUrl, log); + + expect(result).to.deep.include({ + check: 'url-defined', + success: false, + explanation: CANONICAL_CHECKS.URL_UNDEFINED.explanation, + }); + 
expect(log.error).to.have.been.calledWith(`Invalid URL: ${baseUrl}`); + }); + + it('should return an error when URL is undefined or null', async () => { + const result = await validateCanonicalTag(null, log); + + expect(result.canonicalUrl).to.be.null; + expect(result.checks).to.deep.include({ + check: 'url-defined', + success: false, + explanation: CANONICAL_CHECKS.URL_UNDEFINED.explanation, + }); + expect(log.error).to.have.been.calledWith('URL is undefined or null'); + }); + + it('should handle fetch error', async () => { + const url = 'http://example.com'; + nock('http://example.com').get('/').replyWithError('Test error'); + + const result = await validateCanonicalTag(url, log); + + expect(result.canonicalUrl).to.be.null; + expect(result.checks).to.deep.include({ + check: 'canonical-url-fetch-error', + success: false, + explanation: CANONICAL_CHECKS.CANONICAL_URL_FETCH_ERROR.explanation, + }); + }); + + it('should handle invalid canonical URL correctly', async () => { + const url = 'http://example.com'; + const html = ''; + nock(url).get('/').reply(200, html); + + const result = await validateCanonicalTag(url, log); + + expect(result.checks).to.deep.include({ + check: 'canonical-url-invalid', + success: false, + explanation: CANONICAL_CHECKS.CANONICAL_URL_INVALID.explanation, + }); + expect(log.info).to.have.been.calledWith('Invalid canonical URL found for page http://example.com'); + }); + + it('should handle empty canonical tag', async () => { + const url = 'http://example.com'; + const html = ''; + nock(url).get('/').reply(200, html); + + const result = await validateCanonicalTag(url, log); + + expect(result.canonicalUrl).to.be.null; + expect(result.checks).to.deep.include({ + check: 'canonical-tag-nonempty', + success: false, + explanation: CANONICAL_CHECKS.CANONICAL_TAG_NONEMPTY.explanation, + }); + expect(log.info).to.have.been.calledWith(`Empty canonical tag found for URL: ${url}`); + }); + + it('should handle multiple canonical tags', async () => { + 
const url = 'http://example.com'; + const html = ''; + nock(url).get('/').reply(200, html); + + const result = await validateCanonicalTag(url, log); + + expect(result.checks).to.deep.include({ + check: 'canonical-tag-once', + success: false, + explanation: CANONICAL_CHECKS.CANONICAL_TAG_ONCE.explanation, + }); + }); + + it('should fail if the canonical tag is not in the head section', async () => { + const url = 'http://example.com'; + const html = ''; + nock(url).get('/').reply(200, html); + + const result = await validateCanonicalTag(url, log); + + expect(result.checks).to.deep.include({ + check: 'canonical-tag-in-head', + success: false, + explanation: CANONICAL_CHECKS.CANONICAL_TAG_IN_HEAD.explanation, + }); + expect(log.info).to.have.been.calledWith('Canonical tag is not in the head section'); + }); + }); + + describe('validateCanonicalUrlFormat', () => { + it('should validate canonical URL format successfully', () => { + const canonicalUrl = 'https://example.com/page'; + const baseUrl = 'https://example.com'; + + const result = validateCanonicalFormat(canonicalUrl, baseUrl, log); + + expect(result).to.deep.include.members([ + { check: 'canonical-url-absolute', success: true }, + { check: 'canonical-url-same-protocol', success: true }, + { check: 'canonical-url-same-domain', success: true }, + ]); + }); + + it('should handle invalid canonical URL', () => { + const canonicalUrl = {}; + const baseUrl = 'http://example.com'; + const result = validateCanonicalFormat(canonicalUrl, baseUrl, log); + + expect(result).to.deep.include.members([{ + check: 'url-defined', + success: false, + explanation: CANONICAL_CHECKS.URL_UNDEFINED.explanation, + }]); + }); + + it('should handle invalid base URL', () => { + const canonicalUrl = 'https://example.com'; + const baseUrl = 'invalid-url'; + const result = validateCanonicalFormat(canonicalUrl, baseUrl, log); + + expect(result).to.deep.include({ + check: 'url-defined', + success: false, + explanation: 
CANONICAL_CHECKS.URL_UNDEFINED.explanation, + }); + expect(log.error).to.have.been.calledWith('Invalid URL: invalid-url'); + }); + + it('should handle non-lowercase canonical URL', () => { + const canonicalUrl = 'https://example.com/UpperCase'; + const baseUrl = 'https://example.com'; + const result = validateCanonicalFormat(canonicalUrl, baseUrl, log); + + expect(result).to.deep.include({ + check: 'canonical-url-lowercased', + success: false, + explanation: CANONICAL_CHECKS.CANONICAL_URL_LOWERCASED.explanation, + }); + expect(log.info).to.have.been.calledWith('Canonical URL is not lowercased: https://example.com/UpperCase'); + }); + + it('should pass if canonical URL is in lowercase', () => { + const canonicalUrl = 'https://example.com/lowercase'; + const baseUrl = 'https://example.com'; + + const result = validateCanonicalFormat(canonicalUrl, baseUrl, log); + + expect(result).to.deep.include({ + check: 'canonical-url-lowercased', + success: true, + }); + }); + + it('should handle redirection scenario and stop at the first redirect', async () => { + const canonicalUrl = 'http://example.com/page1'; + const redirectUrl = 'http://example.com/page2'; + + nock('http://example.com') + .get('/page1') + .reply(301, null, { Location: redirectUrl }); + + nock('http://example.com') + .get('/page2') + .reply(200); + + const result = await validateCanonicalRecursively(canonicalUrl, log, new Set()); + + expect(result).to.deep.include.members([ + { + check: 'canonical-url-no-redirect', + success: false, + explanation: CANONICAL_CHECKS.CANONICAL_URL_NO_REDIRECT.explanation, + }, + ]); + }); + + it('should handle different domains', () => { + const canonicalUrl = 'https://another.com'; + const baseUrl = 'https://example.com'; + const result = validateCanonicalFormat(canonicalUrl, baseUrl, log); + + expect(result).to.deep.include({ + check: 'canonical-url-same-domain', + success: false, + explanation: CANONICAL_CHECKS.CANONICAL_URL_SAME_DOMAIN.explanation, + }); + 
expect(log.info).to.have.been.calledWith('Canonical URL https://another.com does not have the same domain as base URL https://example.com'); + }); + + it('should handle different protocols', () => { + const canonicalUrl = 'https://example.com'; + const baseUrl = 'http://example.com'; + const result = validateCanonicalFormat(canonicalUrl, baseUrl, log); + + expect(result).to.deep.include({ + check: 'canonical-url-same-protocol', + success: false, + explanation: CANONICAL_CHECKS.CANONICAL_URL_SAME_PROTOCOL.explanation, + }); + expect(log.info).to.have.been.calledWith('Canonical URL https://example.com uses a different protocol than base URL http://example.com'); + }); + + it('should fail if the canonical URL is not absolute', () => { + const canonicalUrl = '/relative/url'; + const baseUrl = 'http://example.com'; + + const result = validateCanonicalFormat(canonicalUrl, baseUrl, log); + + expect(result).to.deep.include({ + check: 'canonical-url-absolute', + success: false, + explanation: CANONICAL_CHECKS.CANONICAL_URL_ABSOLUTE.explanation, + }); + }); + + it('should pass if the canonical URL points to itself', async () => { + const url = 'http://example.com'; + const html = ``; + nock(url).get('/').reply(200, html); + + const result = await validateCanonicalTag(url, log); + + expect(result.checks).to.deep.include.members([ + { + check: 'canonical-tag-nonempty', + success: true, + }, + { + check: 'canonical-tag-exists', + success: true, + }]); + expect(log.info).to.have.been.calledWith(`Canonical URL ${url} references itself`); + }); + + it('should handle try-catch for invalid canonical URL', () => { + const invalidCanonicalUrl = 'http://%'; + const baseUrl = 'https://example.com'; + + const result = validateCanonicalFormat(invalidCanonicalUrl, baseUrl, log); + + expect(result).to.deep.include.members([{ + check: CANONICAL_CHECKS.CANONICAL_URL_ABSOLUTE.check, + success: true, + }]); + + expect(result).to.deep.include.members([{ + check: 
CANONICAL_CHECKS.URL_UNDEFINED.check, + success: false, + explanation: CANONICAL_CHECKS.URL_UNDEFINED.explanation, + }]); + + expect(log.error).to.have.been.calledWith(`Invalid URL: ${invalidCanonicalUrl}`); + }); + + it('should fail if the canonical URL does not point to itself', async () => { + const url = 'http://example.com'; + const canonicalUrl = 'http://example.com/other-page'; + const html = ``; + nock(url).get('/').reply(200, html); + + const result = await validateCanonicalTag(url, log); + + expect(result.checks).to.deep.include.members([{ + check: 'canonical-tag-nonempty', + success: true, + }]); + expect(result.checks).to.deep.include.members([{ + check: 'canonical-self-referenced', + success: false, + explanation: CANONICAL_CHECKS.CANONICAL_SELF_REFERENCED.explanation, + }]); + expect(log.info).to.have.been.calledWith(`Canonical URL ${canonicalUrl} does not reference itself`); + }); + }); + + describe('validateCanonicalRecursively', () => { + it('should validate canonical URL contents successfully', async () => { + const canonicalUrl = 'http://example.com/page'; + nock('http://example.com').get('/page').reply(200); + + const result = await validateCanonicalRecursively(canonicalUrl, log); + + expect(result).to.deep.include({ check: 'canonical-url-status-ok', success: true }); + expect(result).to.deep.include({ check: 'canonical-url-no-redirect', success: true }); + }); + + it('should handle a fetch error correctly', async () => { + const canonicalUrl = 'http://example.com/fetcherror'; + nock('http://example.com').get('/fetcherror').replyWithError('Network error'); + + const result = await validateCanonicalRecursively(canonicalUrl, log); + + expect(result).to.deep.include({ + check: 'canonical-url-fetch-error', + success: false, + explanation: CANONICAL_CHECKS.CANONICAL_URL_FETCH_ERROR.explanation, + }); + expect(log.error).to.have.been.calledWith(`Error fetching canonical URL ${canonicalUrl}: Network error`); + }); + + it('should detect and handle 
redirect loop correctly', async () => { + const canonicalUrl = 'http://example.com/redirect-loop'; + const visitedUrls = new Set([canonicalUrl]); + + const result = await validateCanonicalRecursively(canonicalUrl, log, visitedUrls); + + expect(result).to.deep.include({ + check: 'canonical-url-no-redirect', + success: false, + explanation: CANONICAL_CHECKS.CANONICAL_URL_NO_REDIRECT.explanation, + }); + expect(log.info).to.have.been.calledWith(`Detected a redirect loop for canonical URL ${canonicalUrl}`); + }); + + it('should handle 4xx error response correctly', async () => { + const canonicalUrl = 'http://example.com/404'; + nock('http://example.com').get('/404').reply(404); + + const result = await validateCanonicalRecursively(canonicalUrl, log); + + expect(result).to.deep.include({ + check: 'canonical-url-4xx', + success: false, + explanation: CANONICAL_CHECKS.CANONICAL_URL_4XX.explanation, + }); + expect(log.info).to.have.been.calledWith(`Canonical URL ${canonicalUrl} returned a 4xx error: 404`); + }); + + it('should handle 5xx error response correctly', async () => { + const canonicalUrl = 'http://example.com/500'; + nock('http://example.com').get('/500').reply(500); + + const result = await validateCanonicalRecursively(canonicalUrl, log); + + expect(result).to.deep.include({ + check: 'canonical-url-5xx', + success: false, + explanation: CANONICAL_CHECKS.CANONICAL_URL_5XX.explanation, + }); + }); + + it('should correctly resolve relative canonical URL with base URL', async () => { + const url = 'https://example.com/some-page'; + const href = '/canonical-page'; + const expectedCanonicalUrl = 'https://example.com/canonical-page'; + + const html = ` + + + + + +

Test Page

+ + + `; + + nock('https://example.com') + .get('/some-page') + .reply(200, html); + + const result = await validateCanonicalTag(url, log); + + // ensure that the resolved canonical URL is correct + expect(result.canonicalUrl).to.equal(expectedCanonicalUrl); + expect(result.checks).to.deep.include({ + check: CANONICAL_CHECKS.CANONICAL_TAG_NONEMPTY.check, + success: true, + }); + expect(result.checks).to.deep.include({ + check: CANONICAL_CHECKS.CANONICAL_SELF_REFERENCED.check, + success: false, + explanation: CANONICAL_CHECKS.CANONICAL_SELF_REFERENCED.explanation, + }); + expect(log.info).to.have.been.calledWith(`Canonical URL ${expectedCanonicalUrl} does not reference itself`); + }); + + it('should handle unexpected status code response correctly', async () => { + const canonicalUrl = 'http://example.com/300'; + nock('http://example.com').get('/300').reply(300); + + const result = await validateCanonicalRecursively(canonicalUrl, log); + + expect(result).to.deep.include({ + check: 'unexpected-status-code', + success: false, + explanation: CANONICAL_CHECKS.UNEXPECTED_STATUS_CODE.explanation, + }); + expect(log.info).to.have.been.calledWith(`Unexpected status code 300 for canonical URL: ${canonicalUrl}`); + }); + }); + + describe('canonicalAuditRunner', () => { + it('should run canonical audit successfully', async () => { + const baseURL = 'http://example.com'; + const html = ``; + + nock('http://example.com').get('/page1').reply(200, html); + nock(baseURL).get('/').reply(200, html); + const getTopPagesForSiteStub = sinon.stub().resolves([{ getURL: () => 'http://example.com/page1' }]); + + const context = { + log, + dataAccess: { + getTopPagesForSite: getTopPagesForSiteStub, + }, + }; + const site = { getId: () => 'testSiteId' }; + + const result = await canonicalAuditRunner(baseURL, context, site); + + expect(result).to.be.an('object'); + expect(result.auditResult).to.have.all.keys( + 'canonical-self-referenced', + 'canonical-tag-exists', + 'canonical-tag-in-head', + 
'canonical-tag-nonempty', + 'canonical-url-absolute', + 'canonical-url-lowercased', + 'canonical-url-same-domain', + 'canonical-url-same-protocol', + 'canonical-url-no-redirect', + 'canonical-url-status-ok', + ); + expect(getTopPagesForSiteStub).to.have.been.calledOnceWith('testSiteId', 'ahrefs', 'global'); + expect(log.info).to.have.been.called; + }); + + it('should return early and log a message when no top pages are found', async () => { + const baseURL = 'http://example.com'; + const context = { + log, + dataAccess: { + getTopPagesForSite: sinon.stub().resolves([]), + }, + }; + const site = { getId: () => 'testSiteId' }; + + const result = await canonicalAuditRunner(baseURL, context, site); + + expect(result).to.deep.equal({ + fullAuditRef: baseURL, + auditResult: { + check: 'top-pages', + success: false, + explanation: CANONICAL_CHECKS.TOPPAGES.explanation, + }, + }); + expect(log.info).to.have.been.calledWith('No top pages found, ending audit.'); + }); + + it('should log a simplified error and return a failed audit result if an exception occurs', async () => { + const baseURL = 'http://example.com'; + const context = { log, dataAccess: { getTopPagesForSite: sinon.stub().rejects(new Error('Test Error')) } }; + const site = { getId: () => 'testSiteId' }; + + const result = await canonicalAuditRunner(baseURL, context, site); + + // verify that the returned audit result indicates a failure with an error message + expect(result).to.deep.equal({ + fullAuditRef: baseURL, + auditResult: { + error: 'Audit failed with error: Test Error', + success: false, + }, + }); + }); + }); +});