Skip to content

Commit

Permalink
feat: seo tags audit
Browse files Browse the repository at this point in the history
  • Loading branch information
dipratap committed Aug 27, 2024
1 parent c6a6207 commit c324d15
Show file tree
Hide file tree
Showing 11 changed files with 1,918 additions and 193 deletions.
998 changes: 805 additions & 193 deletions package-lock.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
"@adobe/spacecat-shared-http-utils": "1.6.8",
"@adobe/spacecat-shared-rum-api-client": "2.8.0",
"@adobe/spacecat-shared-rum-api-client-v1": "npm:@adobe/[email protected]",
"@aws-sdk/client-s3": "3.627.0",
"@aws-sdk/client-lambda": "3.637.0",
"@aws-sdk/credential-provider-node": "3.637.0",
"@adobe/spacecat-shared-utils": "1.19.6",
Expand Down
4 changes: 4 additions & 0 deletions src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import { resolveSecretsName, sqsEventAdapter } from '@adobe/spacecat-shared-util
import { internalServerError, notFound, ok } from '@adobe/spacecat-shared-http-utils';

import sqs from './support/sqs.js';
import s3Client from './support/s3-client.js';
import apex from './apex/handler.js';
import cwv from './cwv/handler.js';
import lhsDesktop from './lhs/handler-desktop.js';
Expand All @@ -29,6 +30,7 @@ import experimentation from './experimentation/handler.js';
import conversion from './conversion/handler.js';
import essExperimentationDaily from './experimentation-ess/daily.js';
import essExperimentationAll from './experimentation-ess/all.js';
import metaTags from './metatags/handler.js';
import opportunities from './opportunities/opportunities.js';
import costs from './costs/handler.js';
import structuredData from './structured-data/handler.js';
Expand All @@ -46,6 +48,7 @@ const HANDLERS = {
conversion,
'experimentation-ess-daily': essExperimentationDaily,
'experimentation-ess-all': essExperimentationAll,
metaTags,
opportunities,
costs,
'structured-data': structuredData,
Expand Down Expand Up @@ -95,5 +98,6 @@ export const main = wrap(run)
.with(dataAccess)
.with(sqsEventAdapter)
.with(sqs)
.with(s3Client)
.with(secrets, { name: resolveSecretsName })
.with(helixStatus);
35 changes: 35 additions & 0 deletions src/metatags/constants.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/*
* Copyright 2024 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/

// Names of the page tags inspected by the meta-tags audit.
export const TITLE = 'title';
export const DESCRIPTION = 'description';
export const H1 = 'h1';

// SEO impact categories.
export const HIGH = 'High';
export const MODERATE = 'Moderate';

// Length bounds per tag, keyed by tag name. H1 only has an upper bound.
// NOTE(review): presumably measured in characters — confirm against the checks that consume this.
export const TAG_LENGTHS = {
  [TITLE]: { minLength: 40, maxLength: 60 },
  [DESCRIPTION]: { minLength: 140, maxLength: 160 },
  [H1]: { maxLength: 60 },
};
121 changes: 121 additions & 0 deletions src/metatags/handler.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
/*
* Copyright 2024 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/

import {
internalServerError, noContent, notFound, ok,
} from '@adobe/spacecat-shared-http-utils';
import { JSDOM } from 'jsdom';
import { retrieveSiteBySiteId } from '../utils/data-access.js';
import { getObjectFromKey, getObjectKeysUsingPrefix } from '../utils/s3-utils.js';
import { DESCRIPTION, H1, TITLE } from './constants.js';
import SeoChecks from './seo-checks.js';

/**
 * Parses an HTML string and pulls out the tags the meta-tags audit inspects.
 *
 * @param {string} htmlContent - Raw HTML markup of a page.
 * @returns {object} Object keyed by TITLE, DESCRIPTION and H1: the text of the first
 *   <title> element (undefined when absent), the `content` attribute of the first
 *   <meta name="description"> element (undefined when absent), and an array with the
 *   text of every <h1> element (empty when there are none).
 */
function extractTagsFromHtml(htmlContent) {
  const { document } = new JSDOM(htmlContent).window;

  const titleElement = document.querySelector('title');
  const descriptionElement = document.querySelector('meta[name="description"]');
  const headingTexts = Array.from(
    document.querySelectorAll('h1'),
    (heading) => heading.textContent,
  );

  return {
    [TITLE]: titleElement?.textContent,
    [DESCRIPTION]: descriptionElement?.getAttribute('content'),
    [H1]: headingTexts,
  };
}

/**
 * Loads one scraped page object from S3 and extracts its SEO-relevant tags.
 *
 * @param {object} s3Client - S3 client handed through to getObjectFromKey.
 * @param {string} bucketName - Bucket that holds the scraped content.
 * @param {string} key - Key of the scraped page object.
 * @param {string} prefix - Key prefix the object was listed under; the caller passes a
 *   value ending in "/".
 * @param {object} log - Logger; `log.error` is called when no HTML body is found.
 * @returns {Promise<object|null>} A single-entry object mapping the page URL path to its
 *   extracted tags, or null when the object carries no string `Body.rawBody`.
 */
async function fetchAndProcessPageObject(s3Client, bucketName, key, prefix, log) {
  const object = await getObjectFromKey(s3Client, bucketName, key, log);
  const rawBody = object?.Body?.rawBody;
  if (!rawBody || typeof rawBody !== 'string') {
    log.error(`No Scraped html found in S3 ${key} object`);
    return null;
  }
  const tags = extractTagsFromHtml(rawBody);
  // Slice from `prefix.length - 1` so the prefix's trailing "/" stays as the leading
  // slash of the page path. Strip ".json" only when it is the suffix: a plain
  // replace('.json', '') would remove the first occurrence anywhere in the key.
  const pageUrl = key.slice(prefix.length - 1).replace(/\.json$/, '');
  return {
    [pageUrl]: tags,
  };
}

/**
 * Runs the meta-tags (SEO) audit for one site: reads the site's scraped pages from S3,
 * extracts title/description/h1 tags, runs the SEO checks against them and persists the
 * result as an audit.
 *
 * @param {object} message - Audit request; `type` is the audit type and `url` carries
 *   the site ID.
 * @param {object} context - Invocation context providing `dataAccess`, `log`, `s3Client`
 *   and `env.S3_BUCKET_NAME`.
 * @returns {Promise<object>} HTTP-style response:
 *   - notFound when the site does not exist or no tags could be extracted,
 *   - ok when the site is not live or the audit type is disabled for it,
 *   - noContent once the audit has been persisted,
 *   - internalServerError when any step throws.
 */
export default async function auditMetaTags(message, context) {
  const { type, url: siteId } = message;
  const {
    dataAccess, log, s3Client,
  } = context;

  try {
    log.info(`Received ${type} audit request for siteId: ${siteId}`);
    const site = await retrieveSiteBySiteId(dataAccess, siteId, log);
    if (!site) {
      return notFound('Site not found');
    }
    if (!site.isLive()) {
      log.info(`Site ${siteId} is not live`);
      return ok();
    }
    const configuration = await dataAccess.getConfiguration();
    if (!configuration.isHandlerEnabledForSite(type, site)) {
      log.info(`Audit type ${type} disabled for site ${siteId}`);
      return ok();
    }

    // Fetch site's scraped content from S3
    const bucketName = context.env.S3_BUCKET_NAME;
    const prefix = `scrapes/${siteId}/`;
    const scrapedObjectKeys = await getObjectKeysUsingPrefix(s3Client, bucketName, prefix);
    const extractedTags = {};
    for (const key of scrapedObjectKeys) {
      // eslint-disable-next-line no-await-in-loop
      const pageMetadata = await fetchAndProcessPageObject(s3Client, bucketName, key, prefix, log);
      if (pageMetadata) {
        Object.assign(extractedTags, pageMetadata);
      }
    }
    if (Object.keys(extractedTags).length === 0) {
      log.error(`Failed to extract tags from scraped content for bucket ${bucketName} and prefix ${prefix}`);
      return notFound('Site tags data not available');
    }

    // Fetch keywords for top pages, keyed by URL path
    const topPages = await dataAccess.getTopPagesForSite(siteId, 'ahrefs', 'global');
    const keywords = {};
    for (const page of topPages) {
      // getURL is a method and must be invoked: passing the function itself to
      // `new URL` throws and would fail the audit for every site with top pages.
      const endpoint = new URL(page.getURL()).pathname;
      keywords[endpoint] = page.getTopKeyword();
    }

    // Perform SEO checks
    const seoChecks = new SeoChecks(log, keywords);
    for (const [pageUrl, pageTags] of Object.entries(extractedTags)) {
      seoChecks.performChecks(pageUrl, pageTags);
    }
    const detectedTags = seoChecks.getDetectedTags();

    // Prepare Audit result
    const auditResult = {
      detectedTags,
      sourceS3Folder: `${bucketName}/${prefix}`,
      fullAuditRef: 'na',
    };
    const auditData = {
      siteId: site.getId(),
      isLive: site.isLive(),
      auditedAt: new Date().toISOString(),
      auditType: type,
      fullAuditRef: auditResult.fullAuditRef,
      auditResult,
    };

    // Persist Audit result
    await dataAccess.addAudit(auditData);
    log.info(`Successfully audited ${siteId} for ${type} type audit`);
    return noContent();
  } catch (e) {
    log.error(`${type} type audit for ${siteId} failed with error: ${e.message}`, e);
    return internalServerError(`Internal server error: ${e.message}`);
  }
}
Loading

0 comments on commit c324d15

Please sign in to comment.