diff --git a/src/sitemap/handler.js b/src/sitemap/handler.js
index cb93c8fc..cec82411 100644
--- a/src/sitemap/handler.js
+++ b/src/sitemap/handler.js
@@ -53,11 +53,34 @@ const VALID_MIME_TYPES = Object.freeze([
  * @throws {Error} If the fetch operation fails or the response status is not OK.
  */
 export async function fetchContent(targetUrl) {
-  const response = await fetch(targetUrl);
+  let url;
+  try {
+    // Parse first so a malformed URL is reported separately from network failures.
+    url = new URL(targetUrl);
+  } catch (error) {
+    throw new Error(
+      `Invalid URL provided: ${targetUrl}. Error: ${error.message}`,
+      { cause: error },
+    );
+  }
+
+  let response;
+  try {
+    response = await fetch(url.toString());
+  } catch (error) {
+    // fetch() rejects with a TypeError on network failure, so this must not be
+    // reported as an invalid URL.
+    throw new Error(
+      `Failed to fetch content from ${targetUrl}. Error: ${error.message}`,
+      { cause: error },
+    );
+  }
+
   if (!response.ok) {
     throw new Error(`Failed to fetch content from ${targetUrl}. Status: ${response.status}`);
   }
+
   return { payload: await response.text(), type: response.headers.get('content-type') };
 }
 
 /**
@@ -198,9 +221,16 @@ export async function getBaseUrlPagesFromSitemaps(baseUrl, urls, log) {
   const contentsCache = {};
 
   const fillSitemapContents = async (url) => {
-    const urlData = await checkSitemap(url, log);
-    contentsCache[url] = urlData;
-    return { url, urlData };
+    try {
+      const urlData = await checkSitemap(url, log);
+      contentsCache[url] = urlData;
+      return { url, urlData };
+    } catch (err) {
+      log.error(`Failed to fetch or process sitemap at ${url}: ${err.message}`);
+      // Keep processing the other sitemaps. A failure-shaped object is returned instead
+      // of null so that a later `urlData.existsAndIsValid` check does not throw.
+      return { url, urlData: { existsAndIsValid: false } };
+    }
   };
 
   // Prepare all promises for checking each sitemap URL.
@@ -215,8 +245,6 @@ export async function getBaseUrlPagesFromSitemaps(baseUrl, urls, log) {
     if (urlData.existsAndIsValid) {
       if (urlData.details && urlData.details.isSitemapIndex) {
         log.info(`Sitemap Index found: ${url}`);
-
-        // Await the promise returned by `getSitemapUrlsFromSitemapIndex`
         // eslint-disable-next-line no-await-in-loop,max-len
         const extractedSitemaps = await getSitemapUrlsFromSitemapIndex(urlData.details.sitemapContent, log, fetchContent);
         log.info(`Extracted Sitemaps from Index: ${JSON.stringify(extractedSitemaps)}`);
@@ -302,7 +330,6 @@ export async function findSitemap(inputUrl, log) {
     }
   } catch (error) {
     logMessages.push({ value: `Error fetching or processing robots.txt: ${error.message}`, error: ERROR_CODES.FETCH_ERROR });
-    // Don't return failure yet, try the fallback URLs
   }
 
   if (!sitemapUrls.length) {
@@ -378,7 +405,6 @@ export async function sitemapAuditRunner(baseURL, context) {
 
 export default new AuditBuilder()
   .withRunner(sitemapAuditRunner)
-  // .withPersister(() => {})
   .withUrlResolver((site) => composeAuditURL(site.getBaseURL())
     .then((url) => (getUrlWithoutPath(prependSchema(url)))))
   .build();
diff --git a/src/support/utils.js b/src/support/utils.js
index f2deac1b..5404080f 100644
--- a/src/support/utils.js
+++ b/src/support/utils.js
@@ -91,11 +91,9 @@ export function extractDomainAndProtocol(inputUrl) {
  * @returns {Array} An array of URLs extracted from the sitemap.
  */
 export function extractUrlsFromSitemap(content, log, tagName = 'url') {
-  // Initialize JSDOM with the content and specify the XML content type
   const dom = new JSDOM(content.payload, { contentType: 'text/xml' });
   const { document } = dom.window;
 
-  // Retrieve all elements with the specified tag name
   const elements = document.getElementsByTagName(tagName);
 
   // Map through the elements, extract the text of the 'loc' tags, and filter out null
@@ -109,7 +107,7 @@ export function extractUrlsFromSitemap(content, log, tagName = 'url') {
       }
       return null;
     })
-    .filter((url) => url !== null); // Filter out any nulls if 'loc' element is missing or empty
+    .filter((url) => url !== null);
 }
 
 /**
@@ -133,18 +131,10 @@ export function getBaseUrlPagesFromSitemapContents(baseUrl, sitemapDetails, log)
 
   if (sitemapDetails && sitemapDetails.isText) {
     const lines = sitemapDetails.sitemapContent.payload.split('\n').map((line) => line.trim());
-    log.info(`Extracted lines from text sitemap: ${lines}`);
-
-    const filteredPages = filterPages(lines.filter((line) => line.length > 0));
-    log.info(`Filtered pages from text sitemap: ${filteredPages}`);
-    return filteredPages;
+    return filterPages(lines.filter((line) => line.length > 0));
   } else if (sitemapDetails) {
     const sitemapPages = extractUrlsFromSitemap(sitemapDetails.sitemapContent, log);
-    log.info(`Extracted pages from XML sitemap: ${sitemapPages}`);
-
-    const filteredPages = filterPages(sitemapPages);
-    log.info(`Filtered pages from XML sitemap: ${filteredPages}`);
-    return filteredPages;
+    return filterPages(sitemapPages);
   }
 
   return [];
@@ -180,7 +170,8 @@ export async function getSitemapUrlsFromSitemapIndex(content, log, fetchContent)
-      const nestedUrls = getSitemapUrlsFromSitemapIndex(sitemapContent, log, fetchContent);
+      // eslint-disable-next-line no-await-in-loop
+      const nestedUrls = await getSitemapUrlsFromSitemapIndex(sitemapContent, log, fetchContent);
       allUrls.push(...nestedUrls);
     } else {
-      // Otherwise, extract the actual URLs
+      // extract the actual URLs
       const urls = extractUrlsFromSitemap(sitemapContent, log);
       allUrls.push(...urls);
     }