diff --git a/CHANGELOG.md b/CHANGELOG.md
index a0cbfdac..57166002 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,10 @@
+# [1.24.0](https://github.com/adobe/spacecat-audit-worker/compare/v1.23.5...v1.24.0) (2024-08-14)
+
+
+### Features
+
+* verify sitemap urls ([#266](https://github.com/adobe/spacecat-audit-worker/issues/266)) ([750f19a](https://github.com/adobe/spacecat-audit-worker/commit/750f19a809ace64609a2c7dec9101c0cb1cd882b))
+
 ## [1.23.5](https://github.com/adobe/spacecat-audit-worker/compare/v1.23.4...v1.23.5) (2024-08-10)
diff --git a/package-lock.json b/package-lock.json
index 7d845720..11bf20da 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "@adobe/spacecat-audit-worker",
-  "version": "1.23.5",
+  "version": "1.24.0",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "@adobe/spacecat-audit-worker",
-      "version": "1.23.5",
+      "version": "1.24.0",
       "license": "Apache-2.0",
       "dependencies": {
         "@adobe/fetch": "4.1.8",
diff --git a/package.json b/package.json
index 8a1d1e7f..b1e645c6 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@adobe/spacecat-audit-worker",
-  "version": "1.23.5",
+  "version": "1.24.0",
   "description": "SpaceCat Audit Worker",
   "main": "src/index.js",
   "type": "module",
diff --git a/src/sitemap/handler.js b/src/sitemap/handler.js
index af9beba1..844c6aed 100644
--- a/src/sitemap/handler.js
+++ b/src/sitemap/handler.js
@@ -146,17 +146,35 @@ export async function checkSitemap(sitemapUrl) {
 }
 
 /**
- * Checks for common sitemap URLs and returns any that exist.
- * @param {Array} urls - Array of URLs to check.
- * @returns {Promise<Array<string>>} - List of sitemap URLs that exist.
+ * Filters a list of URLs to return only those that exist.
+ *
+ * @async
+ * @param {string[]} urls - An array of URLs to check.
+ * @param {Object} log - The logging object to record information and errors.
+ * @returns {Promise<string[]>} - A promise that resolves to an array of URLs that exist.
  */
-async function checkCommonSitemapUrls(urls) {
+async function filterValidUrls(urls, log) {
   const fetchPromises = urls.map(async (url) => {
-    const response = await fetch(url, { method: 'HEAD' });
-    return response.ok ? url : null;
+    try {
+      const response = await fetch(url, { method: 'HEAD' });
+      if (response.ok) {
+        return url;
+      } else {
+        log.info(`URL ${url} returned status code ${response.status}`);
+        return null;
+      }
+    } catch (error) {
+      log.error(`Failed to fetch URL ${url}: ${error.message}`);
+      return null;
+    }
   });
-  const results = await Promise.all(fetchPromises);
-  return results.filter((url) => url !== null);
+
+  const results = await Promise.allSettled(fetchPromises);
+
+  // filter only the fulfilled promises that have a valid URL
+  return results
+    .filter((result) => result.status === 'fulfilled' && result.value !== null)
+    .map((result) => result.value);
 }
 
 /**
@@ -232,9 +250,10 @@ export async function getBaseUrlPagesFromSitemaps(baseUrl, urls) {
  * If no paths could be extracted from the sitemaps, logs messages and returns the failure status and reasons.
  *
  * @param {string} inputUrl - The URL for which to find and validate the sitemap
+ * @param {Object} log - The logging object to record information and errors.
  * @returns {Promise<{success: boolean, reasons: Array<{value}>, paths?: any}>} result of sitemap
  */
-export async function findSitemap(inputUrl) {
+export async function findSitemap(inputUrl, log) {
   const logMessages = [];
 
   const parsedUrl = extractDomainAndProtocol(inputUrl);
@@ -263,7 +282,7 @@ export async function findSitemap(inputUrl) {
 
   if (!sitemapUrls.length) {
     const commonSitemapUrls = [`${protocol}://${domain}/sitemap.xml`, `${protocol}://${domain}/sitemap_index.xml`];
-    sitemapUrls = await checkCommonSitemapUrls(commonSitemapUrls);
+    sitemapUrls = await filterValidUrls(commonSitemapUrls, log);
     if (!sitemapUrls.length) {
       logMessages.push({ value: `No sitemap found in robots.txt or common paths for ${protocol}://${domain}`, error: ERROR_CODES.NO_SITEMAP_IN_ROBOTS });
       return { success: false, reasons: logMessages };
@@ -276,6 +295,22 @@ export async function findSitemap(inputUrl) {
   );
   const extractedPaths = await getBaseUrlPagesFromSitemaps(inputUrl, filteredSitemapUrls);
 
+  // check if URLs from each sitemap exist and remove entries if none exist
+  if (Object.entries(extractedPaths).length > 0) {
+    const extractedSitemapUrls = Object.keys(extractedPaths);
+    for (const s of extractedSitemapUrls) {
+      const urlsToCheck = extractedPaths[s];
+      // eslint-disable-next-line no-await-in-loop
+      const existingPages = await filterValidUrls(urlsToCheck, log);
+
+      if (existingPages.length === 0) {
+        delete extractedPaths[s];
+      } else {
+        extractedPaths[s] = existingPages;
+      }
+    }
+  }
+
   if (Object.entries(extractedPaths).length > 0) {
     logMessages.push({ value: 'Sitemaps found and validated successfully.' });
     return {
@@ -299,7 +334,7 @@ export async function sitemapAuditRunner(baseURL, context) {
   const { log } = context;
   log.info(`Received sitemap audit request for ${baseURL}`);
   const startTime = process.hrtime();
-  const auditResult = await findSitemap(baseURL);
+  const auditResult = await findSitemap(baseURL, log);
   const endTime = process.hrtime(startTime);
   const elapsedSeconds = endTime[0] + endTime[1] / 1e9;
 
diff --git a/test/audits/sitemap.test.js b/test/audits/sitemap.test.js
index 78b7954a..aa84d4c1 100644
--- a/test/audits/sitemap.test.js
+++ b/test/audits/sitemap.test.js
@@ -65,8 +65,6 @@ describe('Sitemap Audit', () => {
     + `${url}/sitemap_bar.xml\n`
     + '';
 
-  let topPagesResponse;
-
   beforeEach('setup', () => {
     context = new MockContextBuilder()
       .withSandbox(sandbox)
@@ -78,29 +76,6 @@ describe('Sitemap Audit', () => {
     nock(url)
       .get('/sitemap_bar.xml')
       .reply(200, sampleSitemapTwo);
-
-    topPagesResponse = {
-      result: {
-        pages: [
-          {
-            url: `${url}/foo`,
-            sum_traffic: 100,
-          },
-          {
-            url: `${url}/bar`,
-            sum_traffic: 200,
-          },
-          {
-            url: `${url}/baz`,
-            sum_traffic: 300,
-          },
-          {
-            url: `${url}/cux`,
-            sum_traffic: 400,
-          },
-        ],
-      },
-    };
   });
 
   afterEach(() => {
@@ -115,6 +90,30 @@ describe('Sitemap Audit', () => {
       .get('/robots.txt')
       .reply(200, `Sitemap: ${url}/sitemap_foo.xml\nSitemap: ${url}/sitemap_bar.xml`);
 
+    nock(url)
+      .head('/sitemap_foo.xml')
+      .reply(200);
+
+    nock(url)
+      .head('/sitemap_bar.xml')
+      .reply(200);
+
+    nock(url)
+      .head('/foo')
+      .reply(200);
+
+    nock(url)
+      .head('/bar')
+      .reply(200);
+
+    nock(url)
+      .head('/baz')
+      .reply(200);
+
+    nock(url)
+      .head('/cux')
+      .reply(200);
+
     const result = await sitemapAuditRunner(url, context);
     expect(result).to.eql({
       auditResult: {
@@ -137,9 +136,35 @@ describe('Sitemap Audit', () => {
     nock(url)
       .get('/robots.txt')
       .reply(200, `Sitemap: ${url}/sitemap_index.xml`);
+
     nock(url)
       .get('/sitemap_index.xml')
       .reply(200, sitemapIndex);
+
+    nock(url)
+      .head('/sitemap_foo.xml')
+      .reply(200);
+
+    nock(url)
+      .head('/sitemap_bar.xml')
+      .reply(200);
+
+    nock(url)
+      .head('/foo')
+      .reply(200);
+
+    nock(url)
+      .head('/bar')
+      .reply(200);
+
+    nock(url)
+      .head('/baz')
+      .reply(200);
+
+    nock(url)
+      .head('/cux')
+      .reply(200);
+
     const result = await sitemapAuditRunner(url, context);
     expect(result).to.eql({
       auditResult: {
@@ -166,10 +191,27 @@ describe('Sitemap Audit', () => {
     nock(url)
       .get('/sitemap_foo.txt')
       .reply(200, `${url}/foo\n${url}/bar`, { 'content-type': 'text/plain' });
+
     nock(url)
      .get('/sitemap_bar.txt')
       .reply(200, `${url}/baz\n${url}/cux`, { 'content-type': 'text/plain' });
+
+    nock(url)
+      .head('/foo')
+      .reply(200);
+
+    nock(url)
+      .head('/bar')
+      .reply(200);
+
+    nock(url)
+      .head('/baz')
+      .reply(200);
+
+    nock(url)
+      .head('/cux')
+      .reply(200);
 
     const result = await sitemapAuditRunner(url, context);
     expect(result).to.eql({
       auditResult: {
@@ -205,6 +247,14 @@ describe('Sitemap Audit', () => {
       .get('/sitemap.xml')
       .reply(200, sampleSitemap);
 
+    nock(url)
+      .head('/foo')
+      .reply(200);
+
+    nock(url)
+      .head('/bar')
+      .reply(200);
+
     const result = await sitemapAuditRunner(url, context);
     expect(result).to.eql({
       auditResult: {
@@ -442,6 +492,32 @@ describe('Sitemap Audit', () => {
     }]);
   });
 
+  it('should delete extracted paths when no valid pages exist', async () => {
+    nock(url)
+      .get('/robots.txt')
+      .reply(200, `Sitemap: ${url}/sitemap.xml`);
+
+    nock(url)
+      .get('/sitemap.xml')
+      .reply(200, sampleSitemap);
+
+    nock(url)
+      .head('/foo')
+      .reply(404);
+
+    nock(url)
+      .head('/bar')
+      .reply(404);
+
+    const result = await findSitemap(url);
+
+    expect(result.success).to.equal(false);
+    expect(result.reasons).to.deep.include({
+      value: 'No valid paths extracted from sitemaps.',
+      error: ERROR_CODES.NO_PATHS_IN_SITEMAP,
+    });
+  });
+
   it('should return success when sitemap is found in robots.txt', async () => {
     nock(url)
       .get('/robots.txt')
@@ -451,6 +527,14 @@ describe('Sitemap Audit', () => {
       .get('/sitemap.xml')
       .reply(200, sampleSitemap);
 
+    nock(url)
+      .head('/foo')
+      .reply(200);
+
+    nock(url)
+      .head('/bar')
+      .reply(200);
+
     const result = await findSitemap(url);
     expect(result.success).to.equal(true);
     expect(result.paths).to.deep.equal({
@@ -475,6 +559,14 @@ describe('Sitemap Audit', () => {
       .get('/sitemap.xml')
       .reply(200, sampleSitemap);
 
+    nock(url)
+      .head('/foo')
+      .reply(200);
+
+    nock(url)
+      .head('/bar')
+      .reply(200);
+
     const result = await findSitemap('https://some-domain.adobe');
     expect(result.success).to.equal(true);
     expect(result.paths).to.deep.equal({
@@ -499,6 +591,30 @@ describe('Sitemap Audit', () => {
       .get('/sitemap_index.xml')
       .reply(200, sitemapIndex);
 
+    nock(url)
+      .head('/sitemap_foo.xml')
+      .reply(200);
+
+    nock(url)
+      .head('/sitemap_bar.xml')
+      .reply(200);
+
+    nock(url)
+      .head('/foo')
+      .reply(200);
+
+    nock(url)
+      .head('/bar')
+      .reply(200);
+
+    nock(url)
+      .head('/baz')
+      .reply(200);
+
+    nock(url)
+      .head('/cux')
+      .reply(200);
+
     const result = await findSitemap(url);
     expect(result.success).to.equal(true);
     expect(result.paths).to.deep.equal({
@@ -516,8 +632,13 @@ describe('Sitemap Audit', () => {
       .get('/sitemap.xml')
       .reply(200, sampleSitemapMoreUrlsWWW);
 
-    topPagesResponse.result.pages[0].url = `${protocol}://www.${domain}/foo`;
-    topPagesResponse.result.pages[1].url = `${protocol}://www.${domain}/bar`;
+    nock(`${protocol}://www.${domain}`)
+      .head('/foo')
+      .reply(200);
+
+    nock(`${protocol}://www.${domain}`)
+      .head('/bar')
+      .reply(200);
 
     const result = await findSitemap(`${protocol}://www.${domain}`);
     expect(result.success).to.equal(true);
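Note on the pattern this diff introduces: `filterValidUrls` catches per-URL fetch errors inside the `map` callback, so every mapped promise fulfills (with the URL or `null`), and `Promise.allSettled` is a defensive collection step on top. Below is a minimal standalone sketch of the same HEAD-check pattern, runnable outside the worker on Node 18+; the `headCheck` name, the example URLs, and the `console` logger are illustrative stand-ins, not part of this diff.

```js
// Sketch of the existence-check pattern from src/sitemap/handler.js.
// Assumes global fetch (Node 18+); `console` stands in for the worker's logger.
async function headCheck(urls, log = console) {
  const checks = urls.map(async (url) => {
    try {
      const response = await fetch(url, { method: 'HEAD' });
      if (response.ok) return url;
      log.info(`URL ${url} returned status code ${response.status}`);
      return null;
    } catch (error) {
      log.error(`Failed to fetch URL ${url}: ${error.message}`);
      return null;
    }
  });

  // Every promise above fulfills because errors are caught in the callback,
  // so the `rejected` branch of allSettled is purely defensive.
  const results = await Promise.allSettled(checks);
  return results
    .filter((r) => r.status === 'fulfilled' && r.value !== null)
    .map((r) => r.value);
}

// Example usage: keep only the sitemap locations that actually respond.
headCheck(['https://example.com/sitemap.xml', 'https://example.com/sitemap_index.xml'])
  .then((existing) => console.log(existing));
```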