From 7a52c1aad2f51f77cdbf4114b2d5b7f5ae7bc068 Mon Sep 17 00:00:00 2001 From: paraschi Date: Tue, 20 Aug 2024 22:49:22 +0300 Subject: [PATCH] fix: unexpected close tag issue --- src/sitemap/handler.js | 6 +++--- src/support/utils.js | 37 ++++++++++++++++++++----------------- 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/src/sitemap/handler.js b/src/sitemap/handler.js index 7eac7f02..ec9b1c4b 100644 --- a/src/sitemap/handler.js +++ b/src/sitemap/handler.js @@ -115,10 +115,10 @@ export async function checkRobotsForSitemap(protocol, domain, log) { * @returns {boolean} - True if the sitemap content is valid, otherwise false. */ export function isSitemapContentValid(sitemapContent) { - return sitemapContent.payload.trim().startsWith(' sitemapContent.type.includes(type)); + const payload = sitemapContent.payload.trim(); + return payload.startsWith('') + && VALID_MIME_TYPES.some((type) => sitemapContent.type.includes(type)); } - /** * Checks the validity and existence of a sitemap by fetching its content. * diff --git a/src/support/utils.js b/src/support/utils.js index 5404080f..3edbeea7 100644 --- a/src/support/utils.js +++ b/src/support/utils.js @@ -91,23 +91,26 @@ export function extractDomainAndProtocol(inputUrl) { * @returns {Array} An array of URLs extracted from the sitemap. */ export function extractUrlsFromSitemap(content, log, tagName = 'url') { - const dom = new JSDOM(content.payload, { contentType: 'text/xml' }); - const { document } = dom.window; - - const elements = document.getElementsByTagName(tagName); - - // Map through the elements, extract the text of the 'loc' tags, and filter out null - return Array.from(elements) - .map((element) => { - const loc = element.getElementsByTagName('loc')[0]; - // Check if loc exists before trying to access textContent - if (loc && loc.textContent) { - log.info('Extracted URL:', loc.textContent.trim()); - return loc.textContent.trim(); - } - return null; - }) - .filter((url) => url !== null); + try { + const dom = new JSDOM(content.payload, { contentType: 'text/xml' }); + const { document } = dom.window; + const elements = document.getElementsByTagName(tagName); + + return Array.from(elements) + .map((element) => { + const loc = element.getElementsByTagName('loc')[0]; + if (loc && loc.textContent) { + log.info('Extracted URL:', loc.textContent.trim()); + return loc.textContent.trim(); + } + return null; + }) + .filter((url) => url !== null); + } catch (error) { + log.error(`Failed to parse XML content in sitemap: ${error.message}`); + log.error(`Content received: ${content.payload}`); + return []; + } } /**