From e37fbc548518845a9b6985aa9d0cee3cbaed8817 Mon Sep 17 00:00:00 2001
From: Divyansh Pratap <dipratap@adobe.com>
Date: Thu, 29 Aug 2024 16:47:09 +0530
Subject: [PATCH] feat: remove keyword inclusion check, add support for s3
 scraped tags object

---
 src/metatags/handler.js      |  36 +++-------
 src/metatags/seo-checks.js   |  38 +---------
 test/audits/metatags.test.js | 131 +++++++++++++++++++++++++----------
 3 files changed, 104 insertions(+), 101 deletions(-)

diff --git a/src/metatags/handler.js b/src/metatags/handler.js
index d95a6976..372e0d8f 100644
--- a/src/metatags/handler.js
+++ b/src/metatags/handler.js
@@ -13,36 +13,23 @@
 import {
   internalServerError, noContent, notFound, ok,
 } from '@adobe/spacecat-shared-http-utils';
-import { JSDOM } from 'jsdom';
 import { retrieveSiteBySiteId } from '../utils/data-access.js';
 import { getObjectFromKey, getObjectKeysUsingPrefix } from '../utils/s3-utils.js';
-import { DESCRIPTION, H1, TITLE } from './constants.js';
 import SeoChecks from './seo-checks.js';
 
-function extractTagsFromHtml(htmlContent) {
-  const dom = new JSDOM(htmlContent);
-  const doc = dom.window.document;
-
-  const title = doc.querySelector('title')?.textContent;
-  const description = doc.querySelector('meta[name="description"]')?.getAttribute('content');
-  const h1Tags = Array.from(doc.querySelectorAll('h1')).map((h1) => h1.textContent);
-  return {
-    [TITLE]: title,
-    [DESCRIPTION]: description,
-    [H1]: h1Tags,
-  };
-}
-
 async function fetchAndProcessPageObject(s3Client, bucketName, key, prefix, log) {
   const object = await getObjectFromKey(s3Client, bucketName, key, log);
-  if (!object?.Body?.rawBody || typeof object.Body.rawBody !== 'string') {
-    log.error(`No Scraped html found in S3 ${key} object`);
+  if (!object?.Body?.tags || typeof object.Body.tags !== 'object') {
+    log.error(`No Scraped tags found in S3 ${key} object`);
     return null;
   }
-  const tags = extractTagsFromHtml(object.Body.rawBody);
   const pageUrl = key.slice(prefix.length - 1).replace('.json', ''); // Remove the prefix and .json suffix
   return {
-    [pageUrl]: tags,
+    [pageUrl]: {
+      title: object.Body.tags.title,
+      description: object.Body.tags.description,
+      h1: object.Body.tags.h1 || [],
+    },
   };
 }
 
@@ -83,15 +70,8 @@ export default async function auditMetaTags(message, context) {
       log.error(`Failed to extract tags from scraped content for bucket ${bucketName} and prefix ${prefix}`);
       return notFound('Site tags data not available');
     }
-    // Fetch keywords for top pages
-    const topPages = await dataAccess.getTopPagesForSite(siteId, 'ahrefs', 'global');
-    const keywords = {};
-    topPages.forEach((page) => {
-      const endpoint = new URL(page.getURL).pathname;
-      keywords[endpoint] = page.getTopKeyword();
-    });
     // Perform SEO checks
-    const seoChecks = new SeoChecks(log, keywords);
+    const seoChecks = new SeoChecks(log);
     for (const [pageUrl, pageTags] of Object.entries(extractedTags)) {
       seoChecks.performChecks(pageUrl, pageTags);
     }
diff --git a/src/metatags/seo-checks.js b/src/metatags/seo-checks.js
index 361cb9df..07682832 100644
--- a/src/metatags/seo-checks.js
+++ b/src/metatags/seo-checks.js
@@ -20,9 +20,8 @@ import {
 } from './constants.js';
 
 class SeoChecks {
-  constructor(log, keywords) {
+  constructor(log) {
     this.log = log;
-    this.keywords = keywords;
     this.detectedTags = {
       [TITLE]: [],
       [DESCRIPTION]: [],
@@ -131,45 +130,13 @@ class SeoChecks {
       this.addDetectedTagEntry(
         url,
         H1,
-        pageTags[H1],
+        JSON.stringify(pageTags[H1]),
         MODERATE,
         `There are ${pageTags[H1].length} H1 tags on this page, which is more than the recommended count of 1.`,
       );
     }
   }
 
-  /**
-   * Checks for keyword inclusion in the tags and adds to detected tags array if found lacking.
-   * @param {string} url - The URL of the page.
-   * @param {object} pageTags - An object containing the tags of the page.
-   */
-  checkForKeywordInclusion(url, pageTags) {
-    if (!this.keywords[url]) {
-      this.log.warn(`Keyword Inclusion check failed, keyword not found for ${url}`);
-      return;
-    }
-    const keyword = this.keywords[url].toLowerCase();
-
-    const tags = {
-      [TITLE]: pageTags[TITLE],
-      [DESCRIPTION]: pageTags[DESCRIPTION],
-      [H1]: pageTags[H1][0],
-    };
-
-    Object.entries(tags).forEach(([tagName, tagContent]) => {
-      if (!tagContent?.toLowerCase().includes(keyword)) {
-        this.addDetectedTagEntry(
-          url,
-          tagName,
-          tagContent,
-          HIGH,
-          `The ${tagName} tag on this page is missing the page's top keyword '${keyword}'. `
-          + `It's recommended to include the primary keyword in the ${tagName} tag.`,
-        );
-      }
-    });
-  }
-
   /**
    * Checks for tag uniqueness and adds to detected tags array if found lacking.
    * @param {object} pageTags - An object containing the tags of the page.
@@ -205,7 +172,6 @@ class SeoChecks {
     this.checkForMissingTags(url, pageTags);
     this.checkForTagsLength(url, pageTags);
     this.checkForH1Count(url, pageTags);
-    this.checkForKeywordInclusion(url, pageTags);
     this.checkForUniqueness(url, pageTags);
   }
 
diff --git a/test/audits/metatags.test.js b/test/audits/metatags.test.js
index 7939594b..60997acb 100644
--- a/test/audits/metatags.test.js
+++ b/test/audits/metatags.test.js
@@ -128,37 +128,13 @@ describe('Meta Tags', () => {
         expect(seoChecks.detectedTags[H1][0]).to.deep.equal({
           pageUrl: 'https://example.com',
           tagName: H1,
-          tagContent: ['First H1', 'Second H1'],
+          tagContent: JSON.stringify(['First H1', 'Second H1']),
           seoImpact: MODERATE,
           seoOpportunityText: 'There are 2 H1 tags on this page, which is more than the recommended count of 1.',
         });
       });
     });
 
-    describe('checkForKeywordInclusion', () => {
-      it('should detect missing keywords in tags', () => {
-        const pageTags = {
-          [TITLE]: 'Some other title',
-          [DESCRIPTION]: 'Some other description',
-          [H1]: ['Some other H1'],
-        };
-
-        seoChecks.checkForKeywordInclusion('https://example.com', pageTags);
-
-        expect(seoChecks.detectedTags[TITLE]).to.have.lengthOf(1);
-        expect(seoChecks.detectedTags[DESCRIPTION]).to.have.lengthOf(1);
-        expect(seoChecks.detectedTags[H1]).to.have.lengthOf(1);
-      });
-
-      it('should log a warning if the keyword is not found for the URL', () => {
-        const logSpy = sinon.spy(logMock, 'warn');
-        seoChecks.checkForKeywordInclusion('https://unknown.com', {});
-
-        expect(logSpy.calledOnce).to.be.true;
-        expect(logSpy.firstCall.args[0]).to.equal('Keyword Inclusion check failed, keyword not found for https://unknown.com');
-      });
-    });
-
     describe('checkForUniqueness', () => {
       it('should detect duplicate tags', () => {
         const pageTags1 = {
@@ -182,6 +158,7 @@ describe('Meta Tags', () => {
       });
     });
   });
+
   describe('handler method', () => {
     let message;
     let context;
@@ -279,7 +256,10 @@ describe('Meta Tags', () => {
       }).returns({
         promise: sinon.stub().resolves({
           Body: {
-            rawBody: '<html lang="en"><head><meta name="description" content=""><title>Test Page</title></head><body></body></html>',
+            tags: {
+              title: 'Test Page',
+              description: '',
+            },
           },
         }),
       });
@@ -289,7 +269,10 @@ describe('Meta Tags', () => {
       }).returns({
         promise: sinon.stub().resolves({
           Body: {
-            rawBody: '<html lang="en"><head><title>Test Page</title></head><body><h1>This is a dummy H1 that is overly length from SEO perspective</h1></body></html>',
+            tags: {
+              title: 'Test Page',
+              description: 'This is a dummy H1 that is overly length from SEO perspective',
+            },
           },
         }),
       });
@@ -332,26 +315,100 @@ describe('Meta Tags', () => {
             seoOpportunityText: 'The description tag on this page has a length of 0 characters, which is below the recommended length of 140-160 characters.',
           },
           {
-            pageUrl: '/blog/page1',
+            pageUrl: '/blog/page2',
             tagName: 'description',
             tagContent: '',
             seoImpact: 'High',
-            seoOpportunityText: "The description tag on this page is missing the page's top keyword 'page'. It's recommended to include the primary keyword in the description tag.",
+            seoOpportunityText: "The description tag on this page is missing. It's recommended to have a description tag on each page.",
           },
+        ],
+        h1: [
           {
-            pageUrl: '/blog/page2',
-            tagName: 'description',
+            pageUrl: '/blog/page1',
+            tagName: 'h1',
             tagContent: '',
             seoImpact: 'High',
-            seoOpportunityText: "The description tag on this page is missing. It's recommended to have a description tag on each page.",
+            seoOpportunityText: "The h1 tag on this page is missing. It's recommended to have a h1 tag on each page.",
           },
           {
             pageUrl: '/blog/page2',
-            tagName: 'description',
-            seoImpact: 'High',
-            seoOpportunityText: "The description tag on this page is missing the page's top keyword 'test'. It's recommended to include the primary keyword in the description tag.",
+            tagName: 'h1',
+            tagContent: 'This is a dummy H1 that is overly length from SEO perspective',
+            seoImpact: 'Moderate',
+            seoOpportunityText: 'The h1 tag on this page has a length of 61 characters, which is above the recommended length of 60 characters.',
           },
         ],
+      }));
+      expect(addAuditStub.calledOnce).to.be.true;
+      expect(logStub.info.calledTwice).to.be.true;
+    });
+
+    it('should process site tags and perform SEO checks for pages with invalid H1s', async () => {
+      const site = { isLive: sinon.stub().returns(true), getId: sinon.stub().returns('site-id') };
+      const topPages = [{ getURL: 'http://example.com/blog/page1', getTopKeyword: sinon.stub().returns('page') },
+        { getURL: 'http://example.com/blog/page2', getTopKeyword: sinon.stub().returns('Test') }];
+
+      dataAccessStub.getSiteByID.resolves(site);
+      dataAccessStub.getConfiguration.resolves({
+        isHandlerEnabledForSite: sinon.stub().returns(true),
+      });
+      dataAccessStub.getTopPagesForSite.resolves(topPages);
+
+      s3ClientStub.send
+        .withArgs(sinon.match.instanceOf(ListObjectsV2Command).and(sinon.match.has('input', {
+          Bucket: 'test-bucket',
+          Prefix: 'scrapes/site-id/',
+          MaxKeys: 1000,
+        })))
+        .resolves({
+          Contents: [
+            { Key: 'scrapes/site-id/blog/page1.json' },
+            { Key: 'scrapes/site-id/blog/page2.json' },
+          ],
+        });
+
+      s3ClientStub.getObject.withArgs({
+        Bucket: 'test-bucket',
+        Key: 'scrapes/site-id/blog/page1.json',
+      }).returns({
+        promise: sinon.stub().resolves({
+          Body: {
+            tags: {
+              title: 'This is an SEO optimal page1 valid title.',
+              description: 'This is a dummy description that is optimal from SEO perspective for page1. It has the correct length of characters, and is unique across all pages.',
+              h1: [
+                'This is an overly long H1 tag from SEO perspective due to its length exceeding 60 chars',
+                'This is second h1 tag on same page',
+              ],
+            },
+          },
+        }),
+      });
+      s3ClientStub.getObject.withArgs({
+        Bucket: 'test-bucket',
+        Key: 'scrapes/site-id/blog/page2.json',
+      }).returns({
+        promise: sinon.stub().resolves({
+          Body: {
+            tags: {
+              title: 'This is a SEO wise optimised page2 title.',
+              description: 'This is a dummy description that is optimal from SEO perspective for page2. It has the correct length of characters, and is unique across all pages.',
+              h1: [
+                'This is an overly long H1 tag from SEO perspective',
+              ],
+            },
+          },
+        }),
+      });
+      const addAuditStub = sinon.stub().resolves();
+      dataAccessStub.addAudit = addAuditStub;
+
+      const result = await auditMetaTags(message, context);
+
+      expect(JSON.stringify(result)).to.equal(JSON.stringify(noContent()));
+      expect(addAuditStub.calledWithMatch({
+        title: [],
+        description: [],
         h1: [
           {
             pageUrl: '/blog/page1',
@@ -435,7 +492,7 @@ describe('Meta Tags', () => {
       expect(logStub.error.calledTwice).to.be.true;
     });
 
-    it('should handle gracefully if S3 object is not a html', async () => {
+    it('should handle gracefully if S3 tags object is not valid', async () => {
       const site = { isLive: sinon.stub().returns(true), getId: sinon.stub().returns('site-id') };
       const topPages = [{ getURL: 'http://example.com/blog/page1', getTopKeyword: sinon.stub().returns('page') },
         { getURL: 'http://example.com/blog/page2', getTopKeyword: sinon.stub().returns('Test') }];
@@ -461,7 +518,7 @@ describe('Meta Tags', () => {
       s3ClientStub.getObject.returns({
         promise: sinon.stub().resolves({
           Body: {
-            rawBody: 5,
+            tags: 5,
           },
         }),
       });