Skip to content

Commit

Permalink
feat: remove keyword inclusion check, add support for s3 scraped tags…
Browse files Browse the repository at this point in the history
… object
  • Loading branch information
dipratap committed Aug 29, 2024
1 parent 66e8600 commit e37fbc5
Show file tree
Hide file tree
Showing 3 changed files with 104 additions and 101 deletions.
36 changes: 8 additions & 28 deletions src/metatags/handler.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,36 +13,23 @@
import {
internalServerError, noContent, notFound, ok,
} from '@adobe/spacecat-shared-http-utils';
import { JSDOM } from 'jsdom';
import { retrieveSiteBySiteId } from '../utils/data-access.js';
import { getObjectFromKey, getObjectKeysUsingPrefix } from '../utils/s3-utils.js';
import { DESCRIPTION, H1, TITLE } from './constants.js';
import SeoChecks from './seo-checks.js';

/**
 * Parses an HTML string and collects the tags that the SEO checks inspect.
 * @param {string} htmlContent - HTML markup of the page.
 * @returns {object} Object keyed by TITLE, DESCRIPTION and H1. The title and
 * description are undefined when the matching element is absent; H1 is always
 * an array holding the text of every <h1> element (empty when there is none).
 */
function extractTagsFromHtml(htmlContent) {
  const { document: page } = new JSDOM(htmlContent).window;

  const titleElement = page.querySelector('title');
  const descriptionElement = page.querySelector('meta[name="description"]');
  const headingTexts = Array.from(
    page.querySelectorAll('h1'),
    (heading) => heading.textContent,
  );

  return {
    [TITLE]: titleElement?.textContent,
    [DESCRIPTION]: descriptionElement?.getAttribute('content'),
    [H1]: headingTexts,
  };
}

/**
 * Loads one scraped page object from S3 and maps it to `{ [pageUrl]: tags }`.
 *
 * NOTE(review): this span looks like a rendered diff in which the pre-change
 * lines (the `rawBody` / `extractTagsFromHtml` path) and the post-change lines
 * (the `tags` path) are interleaved without +/- markers — confirm against the
 * commit before treating it as runnable code.
 *
 * @param {object} s3Client - Client forwarded to getObjectFromKey.
 * @param {string} bucketName - Bucket forwarded to getObjectFromKey.
 * @param {string} key - Object key; also used to derive the page URL.
 * @param {string} prefix - Key prefix that is cut off to derive the page URL.
 * @param {object} log - Logger; `log.error` is called when the payload is unusable.
 * @returns {Promise<object|null>} Single-entry object keyed by the page URL, or
 * null when the expected payload is missing from the S3 object.
 */
async function fetchAndProcessPageObject(s3Client, bucketName, key, prefix, log) {
const object = await getObjectFromKey(s3Client, bucketName, key, log);
// Guard on the raw scraped HTML (presumably the pre-change variant — TODO confirm).
if (!object?.Body?.rawBody || typeof object.Body.rawBody !== 'string') {
log.error(`No Scraped html found in S3 ${key} object`);
// Guard on the pre-extracted tags object (presumably the post-change variant — TODO confirm).
if (!object?.Body?.tags || typeof object.Body.tags !== 'object') {
log.error(`No Scraped tags found in S3 ${key} object`);
return null;
}
const tags = extractTagsFromHtml(object.Body.rawBody);
// Slicing at `prefix.length - 1` keeps the final character of the prefix in the result.
const pageUrl = key.slice(prefix.length - 1).replace('.json', ''); // Remove the prefix and .json suffix
return {
[pageUrl]: tags,
[pageUrl]: {
title: object.Body.tags.title,
description: object.Body.tags.description,
// Falls back to an empty array when the scraped tags carry no h1 entry.
h1: object.Body.tags.h1 || [],
},
};
}

Expand Down Expand Up @@ -83,15 +70,8 @@ export default async function auditMetaTags(message, context) {
log.error(`Failed to extract tags from scraped content for bucket ${bucketName} and prefix ${prefix}`);
return notFound('Site tags data not available');
}
// Fetch keywords for top pages
const topPages = await dataAccess.getTopPagesForSite(siteId, 'ahrefs', 'global');
const keywords = {};
topPages.forEach((page) => {
const endpoint = new URL(page.getURL).pathname;
keywords[endpoint] = page.getTopKeyword();
});
// Perform SEO checks
const seoChecks = new SeoChecks(log, keywords);
const seoChecks = new SeoChecks(log);
for (const [pageUrl, pageTags] of Object.entries(extractedTags)) {
seoChecks.performChecks(pageUrl, pageTags);
}
Expand Down
38 changes: 2 additions & 36 deletions src/metatags/seo-checks.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,8 @@ import {
} from './constants.js';

class SeoChecks {
constructor(log, keywords) {
constructor(log) {
this.log = log;
this.keywords = keywords;
this.detectedTags = {
[TITLE]: [],
[DESCRIPTION]: [],
Expand Down Expand Up @@ -131,45 +130,13 @@ class SeoChecks {
this.addDetectedTagEntry(
url,
H1,
pageTags[H1],
JSON.stringify(pageTags[H1]),
MODERATE,
`There are ${pageTags[H1].length} H1 tags on this page, which is more than the recommended count of 1.`,
);
}
}

/**
* Checks for keyword inclusion in the tags and adds to detected tags array if found lacking.
* @param {string} url - The URL of the page.
* @param {object} pageTags - An object containing the tags of the page.
*/
checkForKeywordInclusion(url, pageTags) {
if (!this.keywords[url]) {
this.log.warn(`Keyword Inclusion check failed, keyword not found for ${url}`);
return;
}
const keyword = this.keywords[url].toLowerCase();

const tags = {
[TITLE]: pageTags[TITLE],
[DESCRIPTION]: pageTags[DESCRIPTION],
[H1]: pageTags[H1][0],
};

Object.entries(tags).forEach(([tagName, tagContent]) => {
if (!tagContent?.toLowerCase().includes(keyword)) {
this.addDetectedTagEntry(
url,
tagName,
tagContent,
HIGH,
`The ${tagName} tag on this page is missing the page's top keyword '${keyword}'. `
+ `It's recommended to include the primary keyword in the ${tagName} tag.`,
);
}
});
}

/**
* Checks for tag uniqueness and adds to detected tags array if found lacking.
* @param {object} pageTags - An object containing the tags of the page.
Expand Down Expand Up @@ -205,7 +172,6 @@ class SeoChecks {
this.checkForMissingTags(url, pageTags);
this.checkForTagsLength(url, pageTags);
this.checkForH1Count(url, pageTags);
this.checkForKeywordInclusion(url, pageTags);
this.checkForUniqueness(url, pageTags);
}

Expand Down
131 changes: 94 additions & 37 deletions test/audits/metatags.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -128,37 +128,13 @@ describe('Meta Tags', () => {
expect(seoChecks.detectedTags[H1][0]).to.deep.equal({
pageUrl: 'https://example.com',
tagName: H1,
tagContent: ['First H1', 'Second H1'],
tagContent: JSON.stringify(['First H1', 'Second H1']),
seoImpact: MODERATE,
seoOpportunityText: 'There are 2 H1 tags on this page, which is more than the recommended count of 1.',
});
});
});

describe('checkForKeywordInclusion', () => {
it('should detect missing keywords in tags', () => {
const pageTags = {
[TITLE]: 'Some other title',
[DESCRIPTION]: 'Some other description',
[H1]: ['Some other H1'],
};

seoChecks.checkForKeywordInclusion('https://example.com', pageTags);

expect(seoChecks.detectedTags[TITLE]).to.have.lengthOf(1);
expect(seoChecks.detectedTags[DESCRIPTION]).to.have.lengthOf(1);
expect(seoChecks.detectedTags[H1]).to.have.lengthOf(1);
});

it('should log a warning if the keyword is not found for the URL', () => {
const logSpy = sinon.spy(logMock, 'warn');
seoChecks.checkForKeywordInclusion('https://unknown.com', {});

expect(logSpy.calledOnce).to.be.true;
expect(logSpy.firstCall.args[0]).to.equal('Keyword Inclusion check failed, keyword not found for https://unknown.com');
});
});

describe('checkForUniqueness', () => {
it('should detect duplicate tags', () => {
const pageTags1 = {
Expand All @@ -182,6 +158,7 @@ describe('Meta Tags', () => {
});
});
});

describe('handler method', () => {
let message;
let context;
Expand Down Expand Up @@ -279,7 +256,10 @@ describe('Meta Tags', () => {
}).returns({
promise: sinon.stub().resolves({
Body: {
rawBody: '<html lang="en"><head><meta name="description" content=""><title>Test Page</title></head><body></body></html>',
tags: {
title: 'Test Page',
description: '',
},
},
}),
});
Expand All @@ -289,7 +269,10 @@ describe('Meta Tags', () => {
}).returns({
promise: sinon.stub().resolves({
Body: {
rawBody: '<html lang="en"><head><title>Test Page</title></head><body><h1>This is a dummy H1 that is overly length from SEO perspective</h1></body></html>',
tags: {
title: 'Test Page',
description: 'This is a dummy H1 that is overly length from SEO perspective',
},
},
}),
});
Expand Down Expand Up @@ -332,26 +315,100 @@ describe('Meta Tags', () => {
seoOpportunityText: 'The description tag on this page has a length of 0 characters, which is below the recommended length of 140-160 characters.',
},
{
pageUrl: '/blog/page1',
pageUrl: '/blog/page2',
tagName: 'description',
tagContent: '',
seoImpact: 'High',
seoOpportunityText: "The description tag on this page is missing the page's top keyword 'page'. It's recommended to include the primary keyword in the description tag.",
seoOpportunityText: "The description tag on this page is missing. It's recommended to have a description tag on each page.",
},
],
h1: [
{
pageUrl: '/blog/page2',
tagName: 'description',
pageUrl: '/blog/page1',
tagName: 'h1',
tagContent: '',
seoImpact: 'High',
seoOpportunityText: "The description tag on this page is missing. It's recommended to have a description tag on each page.",
seoOpportunityText: "The h1 tag on this page is missing. It's recommended to have a h1 tag on each page.",
},
{
pageUrl: '/blog/page2',
tagName: 'description',
seoImpact: 'High',
seoOpportunityText: "The description tag on this page is missing the page's top keyword 'test'. It's recommended to include the primary keyword in the description tag.",
tagName: 'h1',
tagContent: 'This is a dummy H1 that is overly length from SEO perspective',
seoImpact: 'Moderate',
seoOpportunityText: 'The h1 tag on this page has a length of 61 characters, which is above the recommended length of 60 characters.',
},
],
}));
expect(addAuditStub.calledOnce).to.be.true;
expect(logStub.info.calledTwice).to.be.true;
});

it('should process site tags and perform SEO checks for pages with invalid H1s', async () => {
const site = { isLive: sinon.stub().returns(true), getId: sinon.stub().returns('site-id') };
const topPages = [{ getURL: 'http://example.com/blog/page1', getTopKeyword: sinon.stub().returns('page') },
{ getURL: 'http://example.com/blog/page2', getTopKeyword: sinon.stub().returns('Test') }];

dataAccessStub.getSiteByID.resolves(site);
dataAccessStub.getConfiguration.resolves({
isHandlerEnabledForSite: sinon.stub().returns(true),
});
dataAccessStub.getTopPagesForSite.resolves(topPages);

s3ClientStub.send
.withArgs(sinon.match.instanceOf(ListObjectsV2Command).and(sinon.match.has('input', {
Bucket: 'test-bucket',
Prefix: 'scrapes/site-id/',
MaxKeys: 1000,
})))
.resolves({
Contents: [
{ Key: 'scrapes/site-id/blog/page1.json' },
{ Key: 'scrapes/site-id/blog/page2.json' },
],
});

s3ClientStub.getObject.withArgs({
Bucket: 'test-bucket',
Key: 'scrapes/site-id/blog/page1.json',
}).returns({
promise: sinon.stub().resolves({
Body: {
tags: {
title: 'This is an SEO optimal page1 valid title.',
description: 'This is a dummy description that is optimal from SEO perspective for page1. It has the correct length of characters, and is unique across all pages.',
h1: [
'This is an overly long H1 tag from SEO perspective due to its length exceeding 60 chars',
'This is second h1 tag on same page',
],
},
},
}),
});
s3ClientStub.getObject.withArgs({
Bucket: 'test-bucket',
Key: 'scrapes/site-id/blog/page2.json',
}).returns({
promise: sinon.stub().resolves({
Body: {
tags: {
title: 'This is a SEO wise optimised page2 title.',
description: 'This is a dummy description that is optimal from SEO perspective for page2. It has the correct length of characters, and is unique across all pages.',
h1: [
'This is an overly long H1 tag from SEO perspective',
],
},
},
}),
});
const addAuditStub = sinon.stub().resolves();
dataAccessStub.addAudit = addAuditStub;

const result = await auditMetaTags(message, context);

expect(JSON.stringify(result)).to.equal(JSON.stringify(noContent()));
expect(addAuditStub.calledWithMatch({
title: [],
description: [],
h1: [
{
pageUrl: '/blog/page1',
Expand Down Expand Up @@ -435,7 +492,7 @@ describe('Meta Tags', () => {
expect(logStub.error.calledTwice).to.be.true;
});

it('should handle gracefully if S3 object is not a html', async () => {
it('should handle gracefully if S3 tags object is not valid', async () => {
const site = { isLive: sinon.stub().returns(true), getId: sinon.stub().returns('site-id') };
const topPages = [{ getURL: 'http://example.com/blog/page1', getTopKeyword: sinon.stub().returns('page') },
{ getURL: 'http://example.com/blog/page2', getTopKeyword: sinon.stub().returns('Test') }];
Expand All @@ -461,7 +518,7 @@ describe('Meta Tags', () => {
s3ClientStub.getObject.returns({
promise: sinon.stub().resolves({
Body: {
rawBody: 5,
tags: 5,
},
}),
});
Expand Down

0 comments on commit e37fbc5

Please sign in to comment.