Skip to content

Commit

Permalink
Merge branch 'main' into timeoutfct
Browse files Browse the repository at this point in the history
  • Loading branch information
AndreiAlexandruParaschiv authored Aug 14, 2024
2 parents ddaa4d1 + 02fdf93 commit e8cc7e8
Show file tree
Hide file tree
Showing 5 changed files with 204 additions and 41 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
# [1.24.0](https://github.com/adobe/spacecat-audit-worker/compare/v1.23.5...v1.24.0) (2024-08-14)


### Features

* verify sitemap urls ([#266](https://github.com/adobe/spacecat-audit-worker/issues/266)) ([750f19a](https://github.com/adobe/spacecat-audit-worker/commit/750f19a809ace64609a2c7dec9101c0cb1cd882b))

## [1.23.5](https://github.com/adobe/spacecat-audit-worker/compare/v1.23.4...v1.23.5) (2024-08-10)


Expand Down
4 changes: 2 additions & 2 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@adobe/spacecat-audit-worker",
"version": "1.23.5",
"version": "1.24.0",
"description": "SpaceCat Audit Worker",
"main": "src/index.js",
"type": "module",
Expand Down
57 changes: 46 additions & 11 deletions src/sitemap/handler.js
Original file line number Diff line number Diff line change
Expand Up @@ -146,17 +146,35 @@ export async function checkSitemap(sitemapUrl) {
}

/**
* Checks for common sitemap URLs and returns any that exist.
* @param {Array<string>} urls - Array of URLs to check.
* @returns {Promise<Array<string>>} - List of sitemap URLs that exist.
* Filters a list of URLs to return only those that exist.
*
* @async
* @param {string[]} urls - An array of URLs to check.
* @param {Object} log - The logging object to record information and errors.
* @returns {Promise<string[]>} - A promise that resolves to an array of URLs that exist.
*/
async function checkCommonSitemapUrls(urls) {
async function filterValidUrls(urls, log) {
const fetchPromises = urls.map(async (url) => {
const response = await fetch(url, { method: 'HEAD' });
return response.ok ? url : null;
try {
const response = await fetch(url, { method: 'HEAD' });
if (response.ok) {
return url;
} else {
log.info(`URL ${url} returned status code ${response.status}`);
return null;
}
} catch (error) {
log.error(`Failed to fetch URL ${url}: ${error.message}`);
return null;
}
});
const results = await Promise.all(fetchPromises);
return results.filter((url) => url !== null);

const results = await Promise.allSettled(fetchPromises);

// filter only the fulfilled promises that have a valid URL
return results
.filter((result) => result.status === 'fulfilled' && result.value !== null)
.map((result) => result.value);
}

/**
Expand Down Expand Up @@ -232,9 +250,10 @@ export async function getBaseUrlPagesFromSitemaps(baseUrl, urls) {
* The extracted paths response length < 0, log messages and returns the failure status and reasons.
*
* @param {string} inputUrl - The URL for which to find and validate the sitemap
* @param log
* @returns {Promise<{success: boolean, reasons: Array<{value}>, paths?: any}>} result of sitemap
*/
export async function findSitemap(inputUrl) {
export async function findSitemap(inputUrl, log) {
const logMessages = [];

const parsedUrl = extractDomainAndProtocol(inputUrl);
Expand Down Expand Up @@ -263,7 +282,7 @@ export async function findSitemap(inputUrl) {

if (!sitemapUrls.length) {
const commonSitemapUrls = [`${protocol}://${domain}/sitemap.xml`, `${protocol}://${domain}/sitemap_index.xml`];
sitemapUrls = await checkCommonSitemapUrls(commonSitemapUrls);
sitemapUrls = await filterValidUrls(commonSitemapUrls, log);
if (!sitemapUrls.length) {
logMessages.push({ value: `No sitemap found in robots.txt or common paths for ${protocol}://${domain}`, error: ERROR_CODES.NO_SITEMAP_IN_ROBOTS });
return { success: false, reasons: logMessages };
Expand All @@ -276,6 +295,22 @@ export async function findSitemap(inputUrl) {
);
const extractedPaths = await getBaseUrlPagesFromSitemaps(inputUrl, filteredSitemapUrls);

// check if URLs from each sitemap exist and remove entries if none exist
if (Object.entries(extractedPaths).length > 0) {
const extractedSitemapUrls = Object.keys(extractedPaths);
for (const s of extractedSitemapUrls) {
const urlsToCheck = extractedPaths[s];
// eslint-disable-next-line no-await-in-loop
const existingPages = await filterValidUrls(urlsToCheck, log);

if (existingPages.length === 0) {
delete extractedPaths[s];
} else {
extractedPaths[s] = existingPages;
}
}
}

if (Object.entries(extractedPaths).length > 0) {
logMessages.push({ value: 'Sitemaps found and validated successfully.' });
return {
Expand All @@ -299,7 +334,7 @@ export async function sitemapAuditRunner(baseURL, context) {
const { log } = context;
log.info(`Received sitemap audit request for ${baseURL}`);
const startTime = process.hrtime();
const auditResult = await findSitemap(baseURL);
const auditResult = await findSitemap(baseURL, log);

const endTime = process.hrtime(startTime);
const elapsedSeconds = endTime[0] + endTime[1] / 1e9;
Expand Down
175 changes: 148 additions & 27 deletions test/audits/sitemap.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,6 @@ describe('Sitemap Audit', () => {
+ `<sitemap><loc>${url}/sitemap_bar.xml</loc></sitemap>\n`
+ '</sitemapindex>';

let topPagesResponse;

beforeEach('setup', () => {
context = new MockContextBuilder()
.withSandbox(sandbox)
Expand All @@ -78,29 +76,6 @@ describe('Sitemap Audit', () => {
nock(url)
.get('/sitemap_bar.xml')
.reply(200, sampleSitemapTwo);

topPagesResponse = {
result: {
pages: [
{
url: `${url}/foo`,
sum_traffic: 100,
},
{
url: `${url}/bar`,
sum_traffic: 200,
},
{
url: `${url}/baz`,
sum_traffic: 300,
},
{
url: `${url}/cux`,
sum_traffic: 400,
},
],
},
};
});

afterEach(() => {
Expand All @@ -115,6 +90,30 @@ describe('Sitemap Audit', () => {
.get('/robots.txt')
.reply(200, `Sitemap: ${url}/sitemap_foo.xml\nSitemap: ${url}/sitemap_bar.xml`);

nock(url)
.head('/sitemap_foo.xml')
.reply(200);

nock(url)
.head('/sitemap_bar.xml')
.reply(200);

nock(url)
.head('/foo')
.reply(200);

nock(url)
.head('/bar')
.reply(200);

nock(url)
.head('/baz')
.reply(200);

nock(url)
.head('/cux')
.reply(200);

const result = await sitemapAuditRunner(url, context);
expect(result).to.eql({
auditResult: {
Expand All @@ -137,9 +136,35 @@ describe('Sitemap Audit', () => {
nock(url)
.get('/robots.txt')
.reply(200, `Sitemap: ${url}/sitemap_index.xml`);

nock(url)
.get('/sitemap_index.xml')
.reply(200, sitemapIndex);

nock(url)
.head('/sitemap_foo.xml')
.reply(200);

nock(url)
.head('/sitemap_bar.xml')
.reply(200);

nock(url)
.head('/foo')
.reply(200);

nock(url)
.head('/bar')
.reply(200);

nock(url)
.head('/baz')
.reply(200);

nock(url)
.head('/cux')
.reply(200);

const result = await sitemapAuditRunner(url, context);
expect(result).to.eql({
auditResult: {
Expand All @@ -166,10 +191,27 @@ describe('Sitemap Audit', () => {
nock(url)
.get('/sitemap_foo.txt')
.reply(200, `${url}/foo\n${url}/bar`, { 'content-type': 'text/plain' });

nock(url)
.get('/sitemap_bar.txt')
.reply(200, `${url}/baz\n${url}/cux`, { 'content-type': 'text/plain' });

nock(url)
.head('/foo')
.reply(200);

nock(url)
.head('/bar')
.reply(200);

nock(url)
.head('/baz')
.reply(200);

nock(url)
.head('/cux')
.reply(200);

const result = await sitemapAuditRunner(url, context);
expect(result).to.eql({
auditResult: {
Expand Down Expand Up @@ -205,6 +247,14 @@ describe('Sitemap Audit', () => {
.get('/sitemap.xml')
.reply(200, sampleSitemap);

nock(url)
.head('/foo')
.reply(200);

nock(url)
.head('/bar')
.reply(200);

const result = await sitemapAuditRunner(url, context);
expect(result).to.eql({
auditResult: {
Expand Down Expand Up @@ -442,6 +492,32 @@ describe('Sitemap Audit', () => {
}]);
});

it('should delete extracted paths when no valid pages exist', async () => {
nock(url)
.get('/robots.txt')
.reply(200, `Sitemap: ${url}/sitemap.xml`);

nock(url)
.get('/sitemap.xml')
.reply(200, sampleSitemap);

nock(url)
.head('/foo')
.reply(404);

nock(url)
.head('/bar')
.reply(404);

const result = await findSitemap(url);

expect(result.success).to.equal(false);
expect(result.reasons).to.deep.include({
value: 'No valid paths extracted from sitemaps.',
error: ERROR_CODES.NO_PATHS_IN_SITEMAP,
});
});

it('should return success when sitemap is found in robots.txt', async () => {
nock(url)
.get('/robots.txt')
Expand All @@ -451,6 +527,14 @@ describe('Sitemap Audit', () => {
.get('/sitemap.xml')
.reply(200, sampleSitemap);

nock(url)
.head('/foo')
.reply(200);

nock(url)
.head('/bar')
.reply(200);

const result = await findSitemap(url);
expect(result.success).to.equal(true);
expect(result.paths).to.deep.equal({
Expand All @@ -475,6 +559,14 @@ describe('Sitemap Audit', () => {
.get('/sitemap.xml')
.reply(200, sampleSitemap);

nock(url)
.head('/foo')
.reply(200);

nock(url)
.head('/bar')
.reply(200);

const result = await findSitemap('https://some-domain.adobe');
expect(result.success).to.equal(true);
expect(result.paths).to.deep.equal({
Expand All @@ -499,6 +591,30 @@ describe('Sitemap Audit', () => {
.get('/sitemap_index.xml')
.reply(200, sitemapIndex);

nock(url)
.head('/sitemap_foo.xml')
.reply(200);

nock(url)
.head('/sitemap_bar.xml')
.reply(200);

nock(url)
.head('/foo')
.reply(200);

nock(url)
.head('/bar')
.reply(200);

nock(url)
.head('/baz')
.reply(200);

nock(url)
.head('/cux')
.reply(200);

const result = await findSitemap(url);
expect(result.success).to.equal(true);
expect(result.paths).to.deep.equal({
Expand All @@ -516,8 +632,13 @@ describe('Sitemap Audit', () => {
.get('/sitemap.xml')
.reply(200, sampleSitemapMoreUrlsWWW);

topPagesResponse.result.pages[0].url = `${protocol}://www.${domain}/foo`;
topPagesResponse.result.pages[1].url = `${protocol}://www.${domain}/bar`;
nock(`${protocol}://www.${domain}`)
.head('/foo')
.reply(200);

nock(`${protocol}://www.${domain}`)
.head('/bar')
.reply(200);

const result = await findSitemap(`${protocol}://www.${domain}`);
expect(result.success).to.equal(true);
Expand Down

0 comments on commit e8cc7e8

Please sign in to comment.