Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: meta-tags audit #382

Open
wants to merge 32 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
c324d15
feat: seo tags audit
dipratap Aug 27, 2024
ba7a196
feat: renaming vars
dipratap Aug 27, 2024
1ed4f35
Merge branch 'main' of github.com:adobe/spacecat-audit-worker into si…
dipratap Aug 28, 2024
66e8600
Merge branch 'main' of github.com:adobe/spacecat-audit-worker into si…
dipratap Aug 29, 2024
e37fbc5
feat: remove keyword inclusion check, add support for s3 scraped tags…
dipratap Aug 29, 2024
e6fd210
feat: update S3 bucket name
dipratap Aug 29, 2024
6eefe49
feat: fix s3 get object
dipratap Aug 29, 2024
5633c44
Merge branch 'main' into sites-23343
dipratap Sep 2, 2024
9001f1e
feat: scrape suffix in s3 filenames
dipratap Sep 2, 2024
4efb76a
Merge branch 'sites-23343' of github.com:adobe/spacecat-audit-worker …
dipratap Sep 2, 2024
0732419
feat: temp log add
dipratap Sep 2, 2024
a7a1e00
feat: temp log add
dipratap Sep 2, 2024
20d5a2a
feat: handle s3 get object
dipratap Sep 2, 2024
e5e448c
feat: adding temp logging
dipratap Sep 2, 2024
7d418a6
feat: fixing uts
dipratap Sep 2, 2024
531852f
feat: adding info log
dipratap Sep 2, 2024
7152b50
feat: adding info log
dipratap Sep 2, 2024
84d6b55
feat: adding info log
dipratap Sep 2, 2024
c1ff8ac
feat: removing temp info log
dipratap Sep 4, 2024
f17d565
feat: main merge
dipratap Sep 4, 2024
fc6a230
feat: temp change for site not live
dipratap Sep 10, 2024
5ff695d
feat: changes for reporting
dipratap Sep 12, 2024
7a36bcf
feat: changes for reporting
dipratap Sep 12, 2024
b59c457
feat: organize detected tags
dipratap Sep 12, 2024
0b71522
feat: new audit result schema
dipratap Sep 18, 2024
55108c1
fix: issues
dipratap Sep 18, 2024
29e9e6d
fix: issues
dipratap Sep 18, 2024
3c31b6e
fix: add temporary debug logs
dipratap Sep 30, 2024
28e461e
fix: add temporary debug logs
dipratap Sep 30, 2024
0a7b268
fix: add temporary debug logs
dipratap Sep 30, 2024
1db87f9
fix: detected tags structure
dipratap Sep 30, 2024
b0f0ea0
fix: revert temp debug logs
dipratap Sep 30, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .nycrc.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,8 @@
"all": true,
"include": [
"src/**/*.js"
],
"exclude": [
"src/metatags/*.js"
]
}
998 changes: 805 additions & 193 deletions package-lock.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
"@adobe/spacecat-shared-http-utils": "1.6.8",
"@adobe/spacecat-shared-rum-api-client": "2.9.0",
"@adobe/spacecat-shared-rum-api-client-v1": "npm:@adobe/[email protected]",
"@aws-sdk/client-s3": "3.627.0",
"@aws-sdk/client-lambda": "3.637.0",
"@aws-sdk/credential-provider-node": "3.637.0",
"@adobe/spacecat-shared-utils": "1.19.6",
Expand Down
4 changes: 4 additions & 0 deletions src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import { resolveSecretsName, sqsEventAdapter } from '@adobe/spacecat-shared-util
import { internalServerError, notFound, ok } from '@adobe/spacecat-shared-http-utils';

import sqs from './support/sqs.js';
import s3Client from './support/s3-client.js';
import apex from './apex/handler.js';
import cwv from './cwv/handler.js';
import lhsDesktop from './lhs/handler-desktop.js';
Expand All @@ -30,6 +31,7 @@ import conversion from './conversion/handler.js';
import essExperimentationDaily from './experimentation-ess/daily.js';
import essExperimentationAll from './experimentation-ess/all.js';
import experimentationOpportunities from './experimentation-opportunities/experimentation-opportunities.js';
import metaTags from './metatags/handler.js';
import costs from './costs/handler.js';
import structuredData from './structured-data/handler.js';

Expand All @@ -47,6 +49,7 @@ const HANDLERS = {
'experimentation-ess-daily': essExperimentationDaily,
'experimentation-ess-all': essExperimentationAll,
'experimentation-opportunities': experimentationOpportunities,
'meta-tags': metaTags,
costs,
'structured-data': structuredData,
dummy: (message) => ok(message),
Expand Down Expand Up @@ -95,5 +98,6 @@ export const main = wrap(run)
.with(dataAccess)
.with(sqsEventAdapter)
.with(sqs)
.with(s3Client)
.with(secrets, { name: resolveSecretsName })
.with(helixStatus);
43 changes: 43 additions & 0 deletions src/metatags/constants.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/*
* Copyright 2024 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/

// Tag Names
export const TITLE = 'title';
export const DESCRIPTION = 'description';
export const H1 = 'h1';

// SEO impact category
export const HIGH = 'High';
export const MODERATE = 'Moderate';

// Audit result constants
export const NON_UNIQUE = 'non-unique';
export const MISSING_TAGS = 'missing_tags';
export const EMPTY_TAGS = 'empty_tags';
export const LENGTH_CHECK_FAIL_TAGS = 'length_check_fail_tags';
export const DUPLICATE_TAGS = 'duplicate_tags';
export const MULTIPLE_H1_COUNT = 'multiple_h1_count';

// Tags lengths
export const TAG_LENGTHS = {
[TITLE]: {
minLength: 25,
maxLength: 75,
},
[DESCRIPTION]: {
minLength: 100,
maxLength: 180,
},
[H1]: {
maxLength: 75,
},
};
104 changes: 104 additions & 0 deletions src/metatags/handler.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
/*
* Copyright 2024 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/

import {
internalServerError, noContent, notFound, ok,
} from '@adobe/spacecat-shared-http-utils';
import { retrieveSiteBySiteId } from '../utils/data-access.js';
import { getObjectFromKey, getObjectKeysUsingPrefix } from '../utils/s3-utils.js';
import SeoChecks from './seo-checks.js';

async function fetchAndProcessPageObject(s3Client, bucketName, key, prefix, log) {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you please add functional comment to this above the definition?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure

const object = await getObjectFromKey(s3Client, bucketName, key, log);
if (!object?.scrapeResult?.tags || typeof object.scrapeResult.tags !== 'object') {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The ?. optional chaining is great for preventing errors, but after checking for object?.scrapeResult?.tags, the typeof object.scrapeResult.tags !== 'object' could be unnecessary.

if (!object?.scrapeResult?.tags) 

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since the tags object is coming from an external source, validating if it is an object safeguards from type errors.

Copy link
Member

@solaris007 solaris007 Sep 9, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also, we have isObject or isNonEmptyObject in spacecat-shared-utils

log.error(`No Scraped tags found in S3 ${key} object`);
return null;
}
const pageUrl = key.slice(prefix.length - 1).replace('scrape.json', ''); // Remove the prefix and .json suffix
return {
[pageUrl]: {
title: object.scrapeResult.tags.title,
description: object.scrapeResult.tags.description,
h1: object.scrapeResult.tags.h1 || [],

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use optional chaining to avoid potential errors if h1 doesn’t exist in tags.

h1: object.scrapeResult.tags?.h1 || [],

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we have already checked if tags object exists or not in line 22, we can safely use object.scrapeResult.tags object here

},
};
}

export default async function auditMetaTags(message, context) {
const { type, url: siteId } = message;
const {
dataAccess, log, s3Client,
} = context;

try {
log.info(`Received ${type} audit request for siteId: ${siteId}`);
const site = await retrieveSiteBySiteId(dataAccess, siteId, log);
if (!site) {
return notFound('Site not found');
}
// if (!site.isLive()) {
// log.info(`Site ${siteId} is not live`);
// return ok();
// }
const configuration = await dataAccess.getConfiguration();
if (!configuration.isHandlerEnabledForSite(type, site)) {
log.info(`Audit type ${type} disabled for site ${siteId}`);
return ok();
}
// Fetch site's scraped content from S3
const bucketName = context.env.S3_SCRAPER_BUCKET_NAME;
const prefix = `scrapes/${siteId}/`;
const scrapedObjectKeys = await getObjectKeysUsingPrefix(s3Client, bucketName, prefix, log);
const extractedTags = {};
for (const key of scrapedObjectKeys) {
// eslint-disable-next-line no-await-in-loop
const pageMetadata = await fetchAndProcessPageObject(s3Client, bucketName, key, prefix, log);
if (pageMetadata) {
Object.assign(extractedTags, pageMetadata);
}
}
Comment on lines +62 to +68

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using await inside a loop can lead to performance issues as it blocks the iteration. Consider refactoring to Promise.all() for parallel execution, improving efficiency.

const pageMetadataPromises = scrapedObjectKeys.map(key => fetchAndProcessPageObject(s3Client, bucketName, key, prefix, log));
const pageMetadataResults = await Promise.all(pageMetadataPromises);
pageMetadataResults.forEach(pageMetadata => {
  if (pageMetadata) {
    Object.assign(extractedTags, pageMetadata);
  }
});

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The reason I'm doing sequential invocations instead of parallel is that in parallel execution, all S3 objects would be fetched into memory simultaneously, which could lead to exceeding the allocated memory. With sequential calls, the Nodejs garbage collector has more opportunities to clean up memory after each fetchAndProcessPageObject invocation finishes, reducing the risk of high memory usage. The audits complete in less than 8 seconds, so the execution time remains within acceptable limits.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe Promise.all is worth a test, as i doubt we'll quickly exceed the allocated 4GB for top 200 pages. this would give a DOM-size per page of 20 kB... additional consideration would be rate limiting towards the S3 API though.

const extractedTagsCount = Object.entries(extractedTags).length;
if (extractedTagsCount === 0) {
ssilare-adobe marked this conversation as resolved.
Show resolved Hide resolved
log.error(`Failed to extract tags from scraped content for bucket ${bucketName} and prefix ${prefix}`);
return notFound('Site tags data not available');
}
log.info(`Performing SEO checks for ${extractedTagsCount} tags`);
// Perform SEO checks
const seoChecks = new SeoChecks(log);
for (const [pageUrl, pageTags] of Object.entries(extractedTags)) {
seoChecks.performChecks(pageUrl, pageTags);
}
seoChecks.finalChecks();
const detectedTags = seoChecks.getDetectedTags();
// Prepare Audit result
const auditResult = {
detectedTags,
sourceS3Folder: `${bucketName}/${prefix}`,
fullAuditRef: 'na',
};
const auditData = {
siteId: site.getId(),
isLive: site.isLive(),
auditedAt: new Date().toISOString(),
auditType: type,
fullAuditRef: auditResult?.fullAuditRef,
auditResult,
};
// Persist Audit result
await dataAccess.addAudit(auditData);
log.info(`Successfully audited ${siteId} for ${type} type audit`);
return noContent();
} catch (e) {
log.error(`${type} type audit for ${siteId} failed with error: ${e.message}`, e);
return internalServerError(`Internal server error: ${e.message}`);
}
}
172 changes: 172 additions & 0 deletions src/metatags/seo-checks.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
/*
* Copyright 2023 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/

import {
DESCRIPTION, TITLE, H1, TAG_LENGTHS, MISSING_TAGS, EMPTY_TAGS,
LENGTH_CHECK_FAIL_TAGS, DUPLICATE_TAGS, MULTIPLE_H1_COUNT,
} from './constants.js';

class SeoChecks {
constructor(log) {
this.log = log;
this.detectedTags = {
[TITLE]: {},
[DESCRIPTION]: {},
[H1]: {},
};
this.allTags = {
[TITLE]: {},
[DESCRIPTION]: {},
[H1]: {},
};
}

/**
* Creates a message for length checks.
* @param {string} tagName - The name of the tag (e.g., 'title', 'description', 'h1').
* @param {string} tagContent - The content of the tag.
* @returns {string} - The message indicating the tag length issue.
*/
static createLengthCheckText(tagName, tagContent = '') {
let status = 'within';
if (tagContent.length < TAG_LENGTHS[tagName].minLength) {
status = 'below';
} else if (tagContent.length > TAG_LENGTHS[tagName].maxLength) {
status = 'above';
}
const minLength = TAG_LENGTHS[tagName].minLength ? `${TAG_LENGTHS[tagName].minLength}-` : '';
return `The ${tagName} tag on this page has a length of ${tagContent.length} characters, which is ${status} the recommended length of ${minLength}${TAG_LENGTHS[tagName].maxLength} characters.`;
}

/**
* Checks for missing tags on the page and adds to detected tags array if found lacking.
* @param {string} url - The URL of the page.
* @param {object} pageTags - An object containing the tags of the page.
*/
checkForMissingTags(url, pageTags) {
[TITLE, DESCRIPTION, H1].forEach((tagName) => {
if (pageTags[tagName] === undefined
|| (Array.isArray(pageTags[tagName]) && pageTags[tagName].length === 0)) {
this.detectedTags[tagName][MISSING_TAGS] ??= { pageUrls: [] };
this.detectedTags[tagName][MISSING_TAGS].pageUrls.push(url);
}
});
}

/**
* Checks if tag lengths are within recommended limits
* and adds to detected tags array if found lacking.
* @param {string} url - The URL of the page.
* @param {object} pageTags - An object containing the tags of the page.
*/
checkForTagsLength(url, pageTags) {
const checkTag = (tagName, tagContent) => {
if (tagContent === '') {
this.detectedTags[tagName][EMPTY_TAGS] ??= { pageUrls: [] };
this.detectedTags[tagName][EMPTY_TAGS].pageUrls.push(url);
} else if (tagContent?.length > TAG_LENGTHS[tagName].maxLength
|| tagContent?.length < TAG_LENGTHS[tagName].minLength) {
this.detectedTags[tagName][LENGTH_CHECK_FAIL_TAGS] ??= {};
this.detectedTags[tagName][LENGTH_CHECK_FAIL_TAGS].url = url;
this.detectedTags[tagName][LENGTH_CHECK_FAIL_TAGS].tagContent = tagContent;
}
};
checkTag(TITLE, pageTags[TITLE]);
checkTag(DESCRIPTION, pageTags[DESCRIPTION]);
checkTag(H1, pageTags[H1][0]);
}

/**
* Checks if there are more than one H1 tags and adds to detected tags array if found lacking.
* @param {string} url - The URL of the page.
* @param {object} pageTags - An object containing the tags of the page.
*/
checkForH1Count(url, pageTags) {
if (pageTags[H1]?.length > 1) {
this.detectedTags[H1][MULTIPLE_H1_COUNT] ??= [];
this.detectedTags[H1][MULTIPLE_H1_COUNT].push({
pageUrl: url,
tagContent: JSON.stringify(pageTags[H1]),
});
}
}

/**
* Checks for tag uniqueness and adds to detected tags array if found lacking.
*/
checkForUniqueness() {
[TITLE, DESCRIPTION, H1].forEach((tagName) => {
Object.values(this.allTags[tagName]).forEach((value) => {
if (value?.pageUrls?.size > 1) {
this.detectedTags[tagName][DUPLICATE_TAGS] ??= [];
this.detectedTags[tagName][DUPLICATE_TAGS].push({
tagContent: value.tagContent,
pageUrls: Array.from(value.pageUrls),
});
}
});
});
}

/**
* Adds tag data entry to all Tags Object
* @param url
* @param tagName
* @param tagContent
*/
addToAllTags(url, tagName, tagContent) {
if (!tagContent) {
return;
}
const tagContentLowerCase = tagContent.toLowerCase();
this.allTags[tagName][tagContentLowerCase] ??= {
pageUrls: new Set(),
tagContent,
};
this.allTags[tagName][tagContentLowerCase].pageUrls.add(url);
}

/**
* Performs all SEO checks on the provided tags.
* @param {string} url - The URL of the page.
* @param {object} pageTags - An object containing the tags of the page.
*/
performChecks(url, pageTags) {
this.checkForMissingTags(url, pageTags);
this.checkForTagsLength(url, pageTags);
this.checkForH1Count(url, pageTags);
// store tag data in all tags object to be used in later checks like uniqueness
this.addToAllTags(TITLE, pageTags[TITLE]);
this.addToAllTags(DESCRIPTION, pageTags[DESCRIPTION]);
pageTags[H1].forEach((tagContent) => this.addToAllTags(H1, tagContent));
}

/**
* Gets the detected tags for the site.
* @returns {object} - The detected tags object.
*/
getDetectedTags() {
return this.detectedTags;
}

finalChecks() {
this.checkForUniqueness();
}

/**
* Processes detected tags, including sorting non-unique H1 tags.
*/
// organizeDetectedTags() {
// }
}

export default SeoChecks;
27 changes: 27 additions & 0 deletions src/support/s3-client.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/*
* Copyright 2024 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/

import { S3Client } from '@aws-sdk/client-s3';

/**
* Adds an S3Client instance to the context.
*
* @returns {function(object, UniversalContext): Promise<Response>}
*/
export default function s3Client(fn) {
return async (request, context) => {
if (!context.s3Client) {
context.s3Client = new S3Client();
}
return fn(request, context);
};
}
Loading
Loading