Skip to content

Commit

Permalink
Merge branch 'main' into canonical
Browse files Browse the repository at this point in the history
  • Loading branch information
AndreiAlexandruParaschiv authored Aug 14, 2024
2 parents 076f553 + 750f19a commit 5f4d79c
Show file tree
Hide file tree
Showing 2 changed files with 194 additions and 38 deletions.
57 changes: 46 additions & 11 deletions src/sitemap/handler.js
Original file line number Diff line number Diff line change
Expand Up @@ -146,17 +146,35 @@ export async function checkSitemap(sitemapUrl) {
}

/**
 * Filters a list of URLs to return only those that exist.
 *
 * Each URL is probed in parallel with a HEAD request; a URL is kept only
 * when the response status is in the 2xx range. Non-OK statuses and network
 * failures are logged and the URL is dropped from the result.
 *
 * @async
 * @param {string[]} urls - An array of URLs to check.
 * @param {Object} log - The logging object to record information and errors.
 * @returns {Promise<string[]>} - A promise that resolves to an array of URLs that exist.
 */
async function filterValidUrls(urls, log) {
  const fetchPromises = urls.map(async (url) => {
    try {
      // HEAD avoids downloading the body; only the status code is needed.
      const response = await fetch(url, { method: 'HEAD' });
      if (response.ok) {
        return url;
      }
      log.info(`URL ${url} returned status code ${response.status}`);
      return null;
    } catch (error) {
      log.error(`Failed to fetch URL ${url}: ${error.message}`);
      return null;
    }
  });

  // allSettled is defensive: each mapped promise already resolves to null on
  // error, but an unexpected rejection (e.g. a throwing logger) is still
  // filtered out instead of failing the whole batch as Promise.all would.
  const results = await Promise.allSettled(fetchPromises);

  // keep only the fulfilled promises that carry a valid URL
  return results
    .filter((result) => result.status === 'fulfilled' && result.value !== null)
    .map((result) => result.value);
}

/**
Expand Down Expand Up @@ -232,9 +250,10 @@ export async function getBaseUrlPagesFromSitemaps(baseUrl, urls) {
 * If no paths are extracted (the extracted paths response is empty), logs messages and returns the failure status and reasons.
*
* @param {string} inputUrl - The URL for which to find and validate the sitemap
 * @param {Object} log - The logging object to record information and errors.
* @returns {Promise<{success: boolean, reasons: Array<{value}>, paths?: any}>} result of sitemap
*/
export async function findSitemap(inputUrl) {
export async function findSitemap(inputUrl, log) {
const logMessages = [];

const parsedUrl = extractDomainAndProtocol(inputUrl);
Expand Down Expand Up @@ -263,7 +282,7 @@ export async function findSitemap(inputUrl) {

if (!sitemapUrls.length) {
const commonSitemapUrls = [`${protocol}://${domain}/sitemap.xml`, `${protocol}://${domain}/sitemap_index.xml`];
sitemapUrls = await checkCommonSitemapUrls(commonSitemapUrls);
sitemapUrls = await filterValidUrls(commonSitemapUrls, log);
if (!sitemapUrls.length) {
logMessages.push({ value: `No sitemap found in robots.txt or common paths for ${protocol}://${domain}`, error: ERROR_CODES.NO_SITEMAP_IN_ROBOTS });
return { success: false, reasons: logMessages };
Expand All @@ -276,6 +295,22 @@ export async function findSitemap(inputUrl) {
);
const extractedPaths = await getBaseUrlPagesFromSitemaps(inputUrl, filteredSitemapUrls);

// check if URLs from each sitemap exist and remove entries if none exist
if (Object.entries(extractedPaths).length > 0) {
const extractedSitemapUrls = Object.keys(extractedPaths);
for (const s of extractedSitemapUrls) {
const urlsToCheck = extractedPaths[s];
// eslint-disable-next-line no-await-in-loop
const existingPages = await filterValidUrls(urlsToCheck, log);

if (existingPages.length === 0) {
delete extractedPaths[s];
} else {
extractedPaths[s] = existingPages;
}
}
}

if (Object.entries(extractedPaths).length > 0) {
logMessages.push({ value: 'Sitemaps found and validated successfully.' });
return {
Expand All @@ -299,7 +334,7 @@ export async function sitemapAuditRunner(baseURL, context) {
const { log } = context;
log.info(`Received sitemap audit request for ${baseURL}`);
const startTime = process.hrtime();
const auditResult = await findSitemap(baseURL);
const auditResult = await findSitemap(baseURL, log);

const endTime = process.hrtime(startTime);
const elapsedSeconds = endTime[0] + endTime[1] / 1e9;
Expand Down
175 changes: 148 additions & 27 deletions test/audits/sitemap.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,6 @@ describe('Sitemap Audit', () => {
+ `<sitemap><loc>${url}/sitemap_bar.xml</loc></sitemap>\n`
+ '</sitemapindex>';

let topPagesResponse;

beforeEach('setup', () => {
context = new MockContextBuilder()
.withSandbox(sandbox)
Expand All @@ -78,29 +76,6 @@ describe('Sitemap Audit', () => {
nock(url)
.get('/sitemap_bar.xml')
.reply(200, sampleSitemapTwo);

topPagesResponse = {
result: {
pages: [
{
url: `${url}/foo`,
sum_traffic: 100,
},
{
url: `${url}/bar`,
sum_traffic: 200,
},
{
url: `${url}/baz`,
sum_traffic: 300,
},
{
url: `${url}/cux`,
sum_traffic: 400,
},
],
},
};
});

afterEach(() => {
Expand All @@ -115,6 +90,30 @@ describe('Sitemap Audit', () => {
.get('/robots.txt')
.reply(200, `Sitemap: ${url}/sitemap_foo.xml\nSitemap: ${url}/sitemap_bar.xml`);

nock(url)
.head('/sitemap_foo.xml')
.reply(200);

nock(url)
.head('/sitemap_bar.xml')
.reply(200);

nock(url)
.head('/foo')
.reply(200);

nock(url)
.head('/bar')
.reply(200);

nock(url)
.head('/baz')
.reply(200);

nock(url)
.head('/cux')
.reply(200);

const result = await sitemapAuditRunner(url, context);
expect(result).to.eql({
auditResult: {
Expand All @@ -137,9 +136,35 @@ describe('Sitemap Audit', () => {
nock(url)
.get('/robots.txt')
.reply(200, `Sitemap: ${url}/sitemap_index.xml`);

nock(url)
.get('/sitemap_index.xml')
.reply(200, sitemapIndex);

nock(url)
.head('/sitemap_foo.xml')
.reply(200);

nock(url)
.head('/sitemap_bar.xml')
.reply(200);

nock(url)
.head('/foo')
.reply(200);

nock(url)
.head('/bar')
.reply(200);

nock(url)
.head('/baz')
.reply(200);

nock(url)
.head('/cux')
.reply(200);

const result = await sitemapAuditRunner(url, context);
expect(result).to.eql({
auditResult: {
Expand All @@ -166,10 +191,27 @@ describe('Sitemap Audit', () => {
nock(url)
.get('/sitemap_foo.txt')
.reply(200, `${url}/foo\n${url}/bar`, { 'content-type': 'text/plain' });

nock(url)
.get('/sitemap_bar.txt')
.reply(200, `${url}/baz\n${url}/cux`, { 'content-type': 'text/plain' });

nock(url)
.head('/foo')
.reply(200);

nock(url)
.head('/bar')
.reply(200);

nock(url)
.head('/baz')
.reply(200);

nock(url)
.head('/cux')
.reply(200);

const result = await sitemapAuditRunner(url, context);
expect(result).to.eql({
auditResult: {
Expand Down Expand Up @@ -205,6 +247,14 @@ describe('Sitemap Audit', () => {
.get('/sitemap.xml')
.reply(200, sampleSitemap);

nock(url)
.head('/foo')
.reply(200);

nock(url)
.head('/bar')
.reply(200);

const result = await sitemapAuditRunner(url, context);
expect(result).to.eql({
auditResult: {
Expand Down Expand Up @@ -442,6 +492,32 @@ describe('Sitemap Audit', () => {
}]);
});

it('should delete extracted paths when no valid pages exist', async () => {
  nock(url)
    .get('/robots.txt')
    .reply(200, `Sitemap: ${url}/sitemap.xml`);

  nock(url)
    .get('/sitemap.xml')
    .reply(200, sampleSitemap);

  nock(url)
    .head('/foo')
    .reply(404);

  nock(url)
    .head('/bar')
    .reply(404);

  // findSitemap(inputUrl, log) now requires a logger: the 404 branch calls
  // log.info, so omitting it would only "pass" because Promise.allSettled
  // swallows the resulting TypeError. A silent stub makes the test explicit.
  const log = { info: () => {}, error: () => {} };
  const result = await findSitemap(url, log);

  expect(result.success).to.equal(false);
  expect(result.reasons).to.deep.include({
    value: 'No valid paths extracted from sitemaps.',
    error: ERROR_CODES.NO_PATHS_IN_SITEMAP,
  });
});

it('should return success when sitemap is found in robots.txt', async () => {
nock(url)
.get('/robots.txt')
Expand All @@ -451,6 +527,14 @@ describe('Sitemap Audit', () => {
.get('/sitemap.xml')
.reply(200, sampleSitemap);

nock(url)
.head('/foo')
.reply(200);

nock(url)
.head('/bar')
.reply(200);

const result = await findSitemap(url);
expect(result.success).to.equal(true);
expect(result.paths).to.deep.equal({
Expand All @@ -475,6 +559,14 @@ describe('Sitemap Audit', () => {
.get('/sitemap.xml')
.reply(200, sampleSitemap);

nock(url)
.head('/foo')
.reply(200);

nock(url)
.head('/bar')
.reply(200);

const result = await findSitemap('https://some-domain.adobe');
expect(result.success).to.equal(true);
expect(result.paths).to.deep.equal({
Expand All @@ -499,6 +591,30 @@ describe('Sitemap Audit', () => {
.get('/sitemap_index.xml')
.reply(200, sitemapIndex);

nock(url)
.head('/sitemap_foo.xml')
.reply(200);

nock(url)
.head('/sitemap_bar.xml')
.reply(200);

nock(url)
.head('/foo')
.reply(200);

nock(url)
.head('/bar')
.reply(200);

nock(url)
.head('/baz')
.reply(200);

nock(url)
.head('/cux')
.reply(200);

const result = await findSitemap(url);
expect(result.success).to.equal(true);
expect(result.paths).to.deep.equal({
Expand All @@ -516,8 +632,13 @@ describe('Sitemap Audit', () => {
.get('/sitemap.xml')
.reply(200, sampleSitemapMoreUrlsWWW);

topPagesResponse.result.pages[0].url = `${protocol}://www.${domain}/foo`;
topPagesResponse.result.pages[1].url = `${protocol}://www.${domain}/bar`;
nock(`${protocol}://www.${domain}`)
.head('/foo')
.reply(200);

nock(`${protocol}://www.${domain}`)
.head('/bar')
.reply(200);

const result = await findSitemap(`${protocol}://www.${domain}`);
expect(result.success).to.equal(true);
Expand Down

0 comments on commit 5f4d79c

Please sign in to comment.