From e37fbc548518845a9b6985aa9d0cee3cbaed8817 Mon Sep 17 00:00:00 2001 From: Divyansh Pratap Date: Thu, 29 Aug 2024 16:47:09 +0530 Subject: [PATCH] feat: remove keyword inclusion check, add support for s3 scraped tags object --- src/metatags/handler.js | 36 +++------- src/metatags/seo-checks.js | 38 +--------- test/audits/metatags.test.js | 131 +++++++++++++++++++++++++---------- 3 files changed, 104 insertions(+), 101 deletions(-) diff --git a/src/metatags/handler.js b/src/metatags/handler.js index d95a6976..372e0d8f 100644 --- a/src/metatags/handler.js +++ b/src/metatags/handler.js @@ -13,36 +13,23 @@ import { internalServerError, noContent, notFound, ok, } from '@adobe/spacecat-shared-http-utils'; -import { JSDOM } from 'jsdom'; import { retrieveSiteBySiteId } from '../utils/data-access.js'; import { getObjectFromKey, getObjectKeysUsingPrefix } from '../utils/s3-utils.js'; -import { DESCRIPTION, H1, TITLE } from './constants.js'; import SeoChecks from './seo-checks.js'; -function extractTagsFromHtml(htmlContent) { - const dom = new JSDOM(htmlContent); - const doc = dom.window.document; - - const title = doc.querySelector('title')?.textContent; - const description = doc.querySelector('meta[name="description"]')?.getAttribute('content'); - const h1Tags = Array.from(doc.querySelectorAll('h1')).map((h1) => h1.textContent); - return { - [TITLE]: title, - [DESCRIPTION]: description, - [H1]: h1Tags, - }; -} - async function fetchAndProcessPageObject(s3Client, bucketName, key, prefix, log) { const object = await getObjectFromKey(s3Client, bucketName, key, log); - if (!object?.Body?.rawBody || typeof object.Body.rawBody !== 'string') { - log.error(`No Scraped html found in S3 ${key} object`); + if (!object?.Body?.tags || typeof object.Body.tags !== 'object') { + log.error(`No Scraped tags found in S3 ${key} object`); return null; } - const tags = extractTagsFromHtml(object.Body.rawBody); const pageUrl = key.slice(prefix.length - 1).replace('.json', ''); // Remove the prefix and .json suffix return { - [pageUrl]: tags, + [pageUrl]: { + title: object.Body.tags.title, + description: object.Body.tags.description, + h1: object.Body.tags.h1 || [], + }, }; } @@ -83,15 +70,8 @@ export default async function auditMetaTags(message, context) { log.error(`Failed to extract tags from scraped content for bucket ${bucketName} and prefix ${prefix}`); return notFound('Site tags data not available'); } - // Fetch keywords for top pages - const topPages = await dataAccess.getTopPagesForSite(siteId, 'ahrefs', 'global'); - const keywords = {}; - topPages.forEach((page) => { - const endpoint = new URL(page.getURL).pathname; - keywords[endpoint] = page.getTopKeyword(); - }); // Perform SEO checks - const seoChecks = new SeoChecks(log, keywords); + const seoChecks = new SeoChecks(log); for (const [pageUrl, pageTags] of Object.entries(extractedTags)) { seoChecks.performChecks(pageUrl, pageTags); } diff --git a/src/metatags/seo-checks.js b/src/metatags/seo-checks.js index 361cb9df..07682832 100644 --- a/src/metatags/seo-checks.js +++ b/src/metatags/seo-checks.js @@ -20,9 +20,8 @@ import { } from './constants.js'; class SeoChecks { - constructor(log, keywords) { + constructor(log) { this.log = log; - this.keywords = keywords; this.detectedTags = { [TITLE]: [], [DESCRIPTION]: [], @@ -131,45 +130,13 @@ class SeoChecks { this.addDetectedTagEntry( url, H1, - pageTags[H1], + JSON.stringify(pageTags[H1]), MODERATE, `There are ${pageTags[H1].length} H1 tags on this page, which is more than the recommended count of 1.`, ); } } - /** - * Checks for keyword inclusion in the tags and adds to detected tags array if found lacking. - * @param {string} url - The URL of the page. - * @param {object} pageTags - An object containing the tags of the page. - */ - checkForKeywordInclusion(url, pageTags) { - if (!this.keywords[url]) { - this.log.warn(`Keyword Inclusion check failed, keyword not found for ${url}`); - return; - } - const keyword = this.keywords[url].toLowerCase(); - - const tags = { - [TITLE]: pageTags[TITLE], - [DESCRIPTION]: pageTags[DESCRIPTION], - [H1]: pageTags[H1][0], - }; - - Object.entries(tags).forEach(([tagName, tagContent]) => { - if (!tagContent?.toLowerCase().includes(keyword)) { - this.addDetectedTagEntry( - url, - tagName, - tagContent, - HIGH, - `The ${tagName} tag on this page is missing the page's top keyword '${keyword}'. ` - + `It's recommended to include the primary keyword in the ${tagName} tag.`, - ); - } - }); - } - /** * Checks for tag uniqueness and adds to detected tags array if found lacking. * @param {object} pageTags - An object containing the tags of the page. @@ -205,7 +172,6 @@ class SeoChecks { this.checkForMissingTags(url, pageTags); this.checkForTagsLength(url, pageTags); this.checkForH1Count(url, pageTags); - this.checkForKeywordInclusion(url, pageTags); this.checkForUniqueness(url, pageTags); } diff --git a/test/audits/metatags.test.js b/test/audits/metatags.test.js index 7939594b..60997acb 100644 --- a/test/audits/metatags.test.js +++ b/test/audits/metatags.test.js @@ -128,37 +128,13 @@ describe('Meta Tags', () => { expect(seoChecks.detectedTags[H1][0]).to.deep.equal({ pageUrl: 'https://example.com', tagName: H1, - tagContent: ['First H1', 'Second H1'], + tagContent: JSON.stringify(['First H1', 'Second H1']), seoImpact: MODERATE, seoOpportunityText: 'There are 2 H1 tags on this page, which is more than the recommended count of 1.', }); }); }); - describe('checkForKeywordInclusion', () => { - it('should detect missing keywords in tags', () => { - const pageTags = { - [TITLE]: 'Some other title', - [DESCRIPTION]: 'Some other description', - [H1]: ['Some other H1'], - }; - - seoChecks.checkForKeywordInclusion('https://example.com', pageTags); - - expect(seoChecks.detectedTags[TITLE]).to.have.lengthOf(1); - expect(seoChecks.detectedTags[DESCRIPTION]).to.have.lengthOf(1); - expect(seoChecks.detectedTags[H1]).to.have.lengthOf(1); - }); - - it('should log a warning if the keyword is not found for the URL', () => { - const logSpy = sinon.spy(logMock, 'warn'); - seoChecks.checkForKeywordInclusion('https://unknown.com', {}); - - expect(logSpy.calledOnce).to.be.true; - expect(logSpy.firstCall.args[0]).to.equal('Keyword Inclusion check failed, keyword not found for https://unknown.com'); - }); - }); - describe('checkForUniqueness', () => { it('should detect duplicate tags', () => { const pageTags1 = { @@ -182,6 +158,7 @@ describe('Meta Tags', () => { }); }); }); + describe('handler method', () => { let message; let context; @@ -279,7 +256,10 @@ describe('Meta Tags', () => { }).returns({ promise: sinon.stub().resolves({ Body: { - rawBody: 'Test Page', + tags: { + title: 'Test Page', + description: '', + }, }, }), }); @@ -289,7 +269,10 @@ describe('Meta Tags', () => { }).returns({ promise: sinon.stub().resolves({ Body: { - rawBody: 'Test Page

This is a dummy H1 that is overly length from SEO perspective

', + tags: { + title: 'Test Page', + description: 'This is a dummy H1 that is overly length from SEO perspective', + }, }, }), }); @@ -332,26 +315,100 @@ describe('Meta Tags', () => { seoOpportunityText: 'The description tag on this page has a length of 0 characters, which is below the recommended length of 140-160 characters.', }, { - pageUrl: '/blog/page1', + pageUrl: '/blog/page2', tagName: 'description', tagContent: '', seoImpact: 'High', - seoOpportunityText: "The description tag on this page is missing the page's top keyword 'page'. It's recommended to include the primary keyword in the description tag.", + seoOpportunityText: "The description tag on this page is missing. It's recommended to have a description tag on each page.", }, + ], + h1: [ { - pageUrl: '/blog/page2', - tagName: 'description', + pageUrl: '/blog/page1', + tagName: 'h1', tagContent: '', seoImpact: 'High', - seoOpportunityText: "The description tag on this page is missing. It's recommended to have a description tag on each page.", + seoOpportunityText: "The h1 tag on this page is missing. It's recommended to have a h1 tag on each page.", }, { pageUrl: '/blog/page2', - tagName: 'description', - seoImpact: 'High', - seoOpportunityText: "The description tag on this page is missing the page's top keyword 'test'. It's recommended to include the primary keyword in the description tag.", + tagName: 'h1', + tagContent: 'This is a dummy H1 that is overly length from SEO perspective', + seoImpact: 'Moderate', + seoOpportunityText: 'The h1 tag on this page has a length of 61 characters, which is above the recommended length of 60 characters.', }, ], + })); + expect(addAuditStub.calledOnce).to.be.true; + expect(logStub.info.calledTwice).to.be.true; + }); + + it('should process site tags and perform SEO checks for pages with invalid H1s', async () => { + const site = { isLive: sinon.stub().returns(true), getId: sinon.stub().returns('site-id') }; + const topPages = [{ getURL: 'http://example.com/blog/page1', getTopKeyword: sinon.stub().returns('page') }, + { getURL: 'http://example.com/blog/page2', getTopKeyword: sinon.stub().returns('Test') }]; + + dataAccessStub.getSiteByID.resolves(site); + dataAccessStub.getConfiguration.resolves({ + isHandlerEnabledForSite: sinon.stub().returns(true), + }); + dataAccessStub.getTopPagesForSite.resolves(topPages); + + s3ClientStub.send + .withArgs(sinon.match.instanceOf(ListObjectsV2Command).and(sinon.match.has('input', { + Bucket: 'test-bucket', + Prefix: 'scrapes/site-id/', + MaxKeys: 1000, + }))) + .resolves({ + Contents: [ + { Key: 'scrapes/site-id/blog/page1.json' }, + { Key: 'scrapes/site-id/blog/page2.json' }, + ], + }); + + s3ClientStub.getObject.withArgs({ + Bucket: 'test-bucket', + Key: 'scrapes/site-id/blog/page1.json', + }).returns({ + promise: sinon.stub().resolves({ + Body: { + tags: { + title: 'This is an SEO optimal page1 valid title.', + description: 'This is a dummy description that is optimal from SEO perspective for page1. It has the correct length of characters, and is unique across all pages.', + h1: [ + 'This is an overly long H1 tag from SEO perspective due to its length exceeding 60 chars', + 'This is second h1 tag on same page', + ], + }, + }, + }), + }); + s3ClientStub.getObject.withArgs({ + Bucket: 'test-bucket', + Key: 'scrapes/site-id/blog/page2.json', + }).returns({ + promise: sinon.stub().resolves({ + Body: { + tags: { + title: 'This is a SEO wise optimised page2 title.', + description: 'This is a dummy description that is optimal from SEO perspective for page2. It has the correct length of characters, and is unique across all pages.', + h1: [ + 'This is an overly long H1 tag from SEO perspective', + ], + }, + }, + }), + }); + const addAuditStub = sinon.stub().resolves(); + dataAccessStub.addAudit = addAuditStub; + + const result = await auditMetaTags(message, context); + + expect(JSON.stringify(result)).to.equal(JSON.stringify(noContent())); + expect(addAuditStub.calledWithMatch({ + title: [], + description: [], h1: [ { pageUrl: '/blog/page1', @@ -435,7 +492,7 @@ describe('Meta Tags', () => { expect(logStub.error.calledTwice).to.be.true; }); - it('should handle gracefully if S3 object is not a html', async () => { + it('should handle gracefully if S3 tags object is not valid', async () => { const site = { isLive: sinon.stub().returns(true), getId: sinon.stub().returns('site-id') }; const topPages = [{ getURL: 'http://example.com/blog/page1', getTopKeyword: sinon.stub().returns('page') }, { getURL: 'http://example.com/blog/page2', getTopKeyword: sinon.stub().returns('Test') }]; @@ -461,7 +518,7 @@ describe('Meta Tags', () => { s3ClientStub.getObject.returns({ promise: sinon.stub().resolves({ Body: { - rawBody: 5, + tags: 5, }, }), });