Skip to content

Commit

Permalink
fix: unexpected close tag issue
Browse files Browse the repository at this point in the history
  • Loading branch information
AndreiAlexandruParaschiv committed Aug 20, 2024
1 parent f6b6281 commit f9aff42
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 25 deletions.
41 changes: 30 additions & 11 deletions src/sitemap/handler.js
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,29 @@ const VALID_MIME_TYPES = Object.freeze([
* @throws {Error} If the fetch operation fails or the response status is not OK.
*/
export async function fetchContent(targetUrl) {
  // Validate the URL in its own step so that only a malformed URL is reported
  // as "Invalid URL". fetch() also rejects with a TypeError on network
  // failures, which must not be mistaken for a URL format problem.
  let url;
  try {
    url = new URL(targetUrl);
  } catch (error) {
    throw new Error(
      `Invalid URL provided: ${targetUrl}. Error: ${error.message}`,
      { cause: error },
    );
  }

  // Network-level failures (DNS, connection reset, aborted request, ...).
  let response;
  try {
    response = await fetch(url.toString());
  } catch (error) {
    throw new Error(
      `Failed to fetch content from ${targetUrl}. Error: ${error.message}`,
      { cause: error },
    );
  }

  // Thrown outside any try/catch so the message is not wrapped a second time.
  if (!response.ok) {
    throw new Error(`Failed to fetch content from ${targetUrl}. Status: ${response.status}`);
  }

  // Reading the body can still fail after the headers have arrived.
  try {
    const contentType = response.headers.get('content-type');
    const contentPayload = await response.text();
    return { payload: contentPayload, type: contentType };
  } catch (error) {
    throw new Error(
      `Failed to fetch content from ${targetUrl}. Error: ${error.message}`,
      { cause: error },
    );
  }
}

/**
Expand Down Expand Up @@ -198,9 +216,14 @@ export async function getBaseUrlPagesFromSitemaps(baseUrl, urls, log) {
const contentsCache = {};

// Checks a single sitemap URL and caches the result in `contentsCache`.
// A failure is logged and reported as a non-valid entry instead of being
// rethrown, so one bad sitemap does not stop the remaining ones from being
// processed.
const fillSitemapContents = async (url) => {
  try {
    const urlData = await checkSitemap(url, log);
    contentsCache[url] = urlData;
    return { url, urlData };
  } catch (err) {
    log.error(`Failed to fetch or process sitemap at ${url}: ${err.message}`);
    // Use a failure marker that is safe to dereference: the consumer reads
    // `urlData.existsAndIsValid` directly, which would throw on `null`.
    return { url, urlData: { existsAndIsValid: false } };
  }
};

// Prepare all promises for checking each sitemap URL.
Expand All @@ -215,8 +238,6 @@ export async function getBaseUrlPagesFromSitemaps(baseUrl, urls, log) {
if (urlData.existsAndIsValid) {
if (urlData.details && urlData.details.isSitemapIndex) {
log.info(`Sitemap Index found: ${url}`);

// Await the promise returned by `getSitemapUrlsFromSitemapIndex`
// eslint-disable-next-line no-await-in-loop,max-len
const extractedSitemaps = await getSitemapUrlsFromSitemapIndex(urlData.details.sitemapContent, log, fetchContent);
log.info(`Extracted Sitemaps from Index: ${JSON.stringify(extractedSitemaps)}`);
Expand Down Expand Up @@ -302,7 +323,6 @@ export async function findSitemap(inputUrl, log) {
}
} catch (error) {
logMessages.push({ value: `Error fetching or processing robots.txt: ${error.message}`, error: ERROR_CODES.FETCH_ERROR });
// Don't return failure yet, try the fallback URLs
}

if (!sitemapUrls.length) {
Expand Down Expand Up @@ -378,7 +398,6 @@ export async function sitemapAuditRunner(baseURL, context) {

/**
 * Resolves the URL to audit for a site: composes the audit URL from the
 * site's base URL, prepends the schema and strips the path.
 *
 * @param {object} site - Site object exposing `getBaseURL()`.
 * @returns {Promise<string>} The resolved audit URL.
 */
const resolveSitemapAuditUrl = (site) => composeAuditURL(site.getBaseURL())
  .then((composedUrl) => getUrlWithoutPath(prependSchema(composedUrl)));

// Sitemap audit: runs `sitemapAuditRunner` against the resolved site URL.
export default new AuditBuilder()
  .withRunner(sitemapAuditRunner)
  .withUrlResolver(resolveSitemapAuditUrl)
  .build();
18 changes: 4 additions & 14 deletions src/support/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -91,11 +91,9 @@ export function extractDomainAndProtocol(inputUrl) {
* @returns {Array<string>} An array of URLs extracted from the sitemap.
*/
export function extractUrlsFromSitemap(content, log, tagName = 'url') {
// Initialize JSDOM with the content and specify the XML content type
const dom = new JSDOM(content.payload, { contentType: 'text/xml' });
const { document } = dom.window;

// Retrieve all elements with the specified tag name
const elements = document.getElementsByTagName(tagName);

// Map through the elements, extract the text of the 'loc' tags, and filter out null
Expand All @@ -109,7 +107,7 @@ export function extractUrlsFromSitemap(content, log, tagName = 'url') {
}
return null;
})
.filter((url) => url !== null); // Filter out any nulls if 'loc' element is missing or empty
.filter((url) => url !== null);
}

/**
Expand All @@ -133,18 +131,10 @@ export function getBaseUrlPagesFromSitemapContents(baseUrl, sitemapDetails, log)

if (sitemapDetails && sitemapDetails.isText) {
const lines = sitemapDetails.sitemapContent.payload.split('\n').map((line) => line.trim());
log.info(`Extracted lines from text sitemap: ${lines}`);

const filteredPages = filterPages(lines.filter((line) => line.length > 0));
log.info(`Filtered pages from text sitemap: ${filteredPages}`);
return filteredPages;
return filterPages(lines.filter((line) => line.length > 0));
} else if (sitemapDetails) {
const sitemapPages = extractUrlsFromSitemap(sitemapDetails.sitemapContent, log);
log.info(`Extracted pages from XML sitemap: ${sitemapPages}`);

const filteredPages = filterPages(sitemapPages);
log.info(`Filtered pages from XML sitemap: ${filteredPages}`);
return filteredPages;
return filterPages(sitemapPages);
}

return [];
Expand Down Expand Up @@ -180,7 +170,7 @@ export async function getSitemapUrlsFromSitemapIndex(content, log, fetchContent)
const nestedUrls = getSitemapUrlsFromSitemapIndex(sitemapContent, log, fetchContent);
allUrls.push(...nestedUrls);
} else {
// Otherwise, extract the actual URLs
// extract the actual URLs
const urls = extractUrlsFromSitemap(sitemapContent, log);
allUrls.push(...urls);
}
Expand Down

0 comments on commit f9aff42

Please sign in to comment.