-
Notifications
You must be signed in to change notification settings - Fork 2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: meta-tags audit #382
base: main
Are you sure you want to change the base?
Changes from all commits
c324d15
ba7a196
1ed4f35
66e8600
e37fbc5
e6fd210
6eefe49
5633c44
9001f1e
4efb76a
0732419
a7a1e00
20d5a2a
e5e448c
7d418a6
531852f
7152b50
84d6b55
c1ff8ac
f17d565
fc6a230
5ff695d
7a36bcf
b59c457
0b71522
55108c1
29e9e6d
3c31b6e
28e461e
0a7b268
1db87f9
b0f0ea0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,5 +10,8 @@ | |
"all": true, | ||
"include": [ | ||
"src/**/*.js" | ||
], | ||
"exclude": [ | ||
"src/metatags/*.js" | ||
] | ||
} |
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -62,6 +62,7 @@ | |
"@adobe/spacecat-shared-http-utils": "1.6.8", | ||
"@adobe/spacecat-shared-rum-api-client": "2.9.0", | ||
"@adobe/spacecat-shared-rum-api-client-v1": "npm:@adobe/[email protected]", | ||
"@aws-sdk/client-s3": "3.627.0", | ||
"@aws-sdk/client-lambda": "3.637.0", | ||
"@aws-sdk/credential-provider-node": "3.637.0", | ||
"@adobe/spacecat-shared-utils": "1.19.6", | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
/* | ||
* Copyright 2024 Adobe. All rights reserved. | ||
* This file is licensed to you under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. You may obtain a copy | ||
* of the License at http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software distributed under | ||
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS | ||
* OF ANY KIND, either express or implied. See the License for the specific language | ||
* governing permissions and limitations under the License. | ||
*/ | ||
|
||
// Tag names
export const TITLE = 'title';
export const DESCRIPTION = 'description';
export const H1 = 'h1';

// SEO impact categories
export const HIGH = 'High';
export const MODERATE = 'Moderate';

// Audit result keys
export const NON_UNIQUE = 'non-unique';
export const MISSING_TAGS = 'missing_tags';
export const EMPTY_TAGS = 'empty_tags';
export const LENGTH_CHECK_FAIL_TAGS = 'length_check_fail_tags';
export const DUPLICATE_TAGS = 'duplicate_tags';
export const MULTIPLE_H1_COUNT = 'multiple_h1_count';

// Recommended tag lengths, in characters. H1 only has an upper bound.
// The table is shared by every importer, so it is frozen (including the nested
// entries) to keep one consumer from silently changing the limits for the others.
export const TAG_LENGTHS = Object.freeze({
  [TITLE]: Object.freeze({
    minLength: 25,
    maxLength: 75,
  }),
  [DESCRIPTION]: Object.freeze({
    minLength: 100,
    maxLength: 180,
  }),
  [H1]: Object.freeze({
    maxLength: 75,
  }),
});
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
/* | ||
* Copyright 2024 Adobe. All rights reserved. | ||
* This file is licensed to you under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. You may obtain a copy | ||
* of the License at http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software distributed under | ||
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS | ||
* OF ANY KIND, either express or implied. See the License for the specific language | ||
* governing permissions and limitations under the License. | ||
*/ | ||
|
||
import { | ||
internalServerError, noContent, notFound, ok, | ||
} from '@adobe/spacecat-shared-http-utils'; | ||
import { retrieveSiteBySiteId } from '../utils/data-access.js'; | ||
import { getObjectFromKey, getObjectKeysUsingPrefix } from '../utils/s3-utils.js'; | ||
import SeoChecks from './seo-checks.js'; | ||
|
||
/**
 * Loads one scraped page object from S3 and extracts the tags needed by the audit.
 *
 * @param {object} s3Client - S3 client handed through to getObjectFromKey.
 * @param {string} bucketName - Bucket holding the scraped content.
 * @param {string} key - Key of the scraped object.
 * @param {string} prefix - Key prefix shared by all scraped objects of the site.
 * @param {object} log - Logger.
 * @returns {Promise<object|null>} A single-entry object mapping the page path to its
 * title, description and h1 list, or null when the object carries no usable tags.
 */
async function fetchAndProcessPageObject(s3Client, bucketName, key, prefix, log) {
  const scraped = await getObjectFromKey(s3Client, bucketName, key, log);
  const tags = scraped?.scrapeResult?.tags;
  // The tags come from an external source, so besides checking that they exist we
  // also verify that they are an object to guard against type errors further down.
  const hasUsableTags = Boolean(tags) && typeof tags === 'object';
  if (!hasUsableTags) {
    log.error(`No Scraped tags found in S3 ${key} object`);
    return null;
  }

  // Cut the prefix but keep its trailing slash as the leading '/' of the page path,
  // then strip the 'scrape.json' file name.
  const pathWithFileName = key.slice(prefix.length - 1);
  const pageUrl = pathWithFileName.replace('scrape.json', '');

  // tags was validated above, so plain property access is safe here.
  const { title, description } = tags;
  const h1 = tags.h1 || [];
  return { [pageUrl]: { title, description, h1 } };
}
|
||
/**
 * Runs the meta-tags audit for a site: reads the site's scraped pages from S3,
 * runs the SEO checks over their title, description and h1 tags, and stores the result.
 *
 * @param {object} message - Audit request; `type` is the audit type, `url` carries the site id.
 * @param {object} context - Provides `dataAccess`, `log`, `s3Client` and `env`.
 * @returns {Promise<Response>} notFound when the site or its scraped tags are missing,
 * ok when the audit is disabled for the site, noContent on success and
 * internalServerError when anything throws.
 */
export default async function auditMetaTags(message, context) {
  const { type, url: siteId } = message;
  const { dataAccess, log, s3Client } = context;

  try {
    log.info(`Received ${type} audit request for siteId: ${siteId}`);

    const site = await retrieveSiteBySiteId(dataAccess, siteId, log);
    if (!site) {
      return notFound('Site not found');
    }
    // NOTE: the isLive() gate is currently disabled, so non-live sites are audited too.

    const configuration = await dataAccess.getConfiguration();
    const isEnabled = configuration.isHandlerEnabledForSite(type, site);
    if (!isEnabled) {
      log.info(`Audit type ${type} disabled for site ${siteId}`);
      return ok();
    }

    // Collect the tags of every scraped page of the site from S3.
    const bucketName = context.env.S3_SCRAPER_BUCKET_NAME;
    const prefix = `scrapes/${siteId}/`;
    const scrapedObjectKeys = await getObjectKeysUsingPrefix(s3Client, bucketName, prefix, log);
    const extractedTags = {};
    // Objects are fetched one at a time on purpose: fetching them in parallel would
    // hold every scraped S3 object in memory at once and could exceed the allocated memory.
    for (const objectKey of scrapedObjectKeys) {
      // eslint-disable-next-line no-await-in-loop
      const pageMetadata = await fetchAndProcessPageObject(
        s3Client,
        bucketName,
        objectKey,
        prefix,
        log,
      );
      if (pageMetadata) {
        Object.assign(extractedTags, pageMetadata);
      }
    }

    const pageCount = Object.keys(extractedTags).length;
    if (pageCount === 0) {
      log.error(`Failed to extract tags from scraped content for bucket ${bucketName} and prefix ${prefix}`);
      return notFound('Site tags data not available');
    }
    log.info(`Performing SEO checks for ${pageCount} tags`);

    // Per-page checks first, then the cross-page checks (e.g. uniqueness).
    const seoChecks = new SeoChecks(log);
    Object.entries(extractedTags).forEach(([pageUrl, pageTags]) => {
      seoChecks.performChecks(pageUrl, pageTags);
    });
    seoChecks.finalChecks();

    // Assemble and persist the audit record.
    const auditResult = {
      detectedTags: seoChecks.getDetectedTags(),
      sourceS3Folder: `${bucketName}/${prefix}`,
      fullAuditRef: 'na',
    };
    await dataAccess.addAudit({
      siteId: site.getId(),
      isLive: site.isLive(),
      auditedAt: new Date().toISOString(),
      auditType: type,
      fullAuditRef: auditResult.fullAuditRef,
      auditResult,
    });

    log.info(`Successfully audited ${siteId} for ${type} type audit`);
    return noContent();
  } catch (e) {
    log.error(`${type} type audit for ${siteId} failed with error: ${e.message}`, e);
    return internalServerError(`Internal server error: ${e.message}`);
  }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,172 @@ | ||
/* | ||
* Copyright 2023 Adobe. All rights reserved. | ||
* This file is licensed to you under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. You may obtain a copy | ||
* of the License at http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software distributed under | ||
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS | ||
* OF ANY KIND, either express or implied. See the License for the specific language | ||
* governing permissions and limitations under the License. | ||
*/ | ||
|
||
import { | ||
DESCRIPTION, TITLE, H1, TAG_LENGTHS, MISSING_TAGS, EMPTY_TAGS, | ||
LENGTH_CHECK_FAIL_TAGS, DUPLICATE_TAGS, MULTIPLE_H1_COUNT, | ||
} from './constants.js'; | ||
|
||
/**
 * Collects SEO issues for the title, description and h1 tags of a site's pages:
 * missing tags, empty tags, tags outside the recommended length, pages with more
 * than one H1 and tag content shared by several pages.
 *
 * Call performChecks() once per page, then finalChecks() once, then read the
 * outcome with getDetectedTags().
 */
class SeoChecks {
  /**
   * @param {object} log - Logger kept on the instance.
   */
  constructor(log) {
    this.log = log;
    this.detectedTags = {
      [TITLE]: {},
      [DESCRIPTION]: {},
      [H1]: {},
    };
    // These dictionaries are keyed by lower-cased tag content, i.e. arbitrary scraped
    // text. Null-prototype objects keep content such as "constructor" or "toString"
    // from colliding with members inherited from Object.prototype.
    this.allTags = {
      [TITLE]: Object.create(null),
      [DESCRIPTION]: Object.create(null),
      [H1]: Object.create(null),
    };
  }

  /**
   * Creates a message for length checks.
   * @param {string} tagName - The name of the tag (e.g., 'title', 'description', 'h1').
   * @param {string} tagContent - The content of the tag; null/undefined count as empty.
   * @returns {string} - The message indicating the tag length issue.
   */
  static createLengthCheckText(tagName, tagContent = '') {
    const { minLength, maxLength } = TAG_LENGTHS[tagName];
    const { length } = tagContent ?? '';
    let status = 'within';
    // A tag without a minLength (h1) can never be "below": the comparison with
    // undefined is always false.
    if (length < minLength) {
      status = 'below';
    } else if (length > maxLength) {
      status = 'above';
    }
    const minLengthText = minLength ? `${minLength}-` : '';
    return `The ${tagName} tag on this page has a length of ${length} characters, which is ${status} the recommended length of ${minLengthText}${maxLength} characters.`;
  }

  /**
   * Checks for missing tags on the page and adds to detected tags if found lacking.
   * A tag is missing when it is undefined, null or an empty array.
   * @param {string} url - The URL of the page.
   * @param {object} pageTags - An object containing the tags of the page.
   */
  checkForMissingTags(url, pageTags) {
    [TITLE, DESCRIPTION, H1].forEach((tagName) => {
      const tagValue = pageTags[tagName];
      const isMissing = tagValue === undefined
        || tagValue === null
        || (Array.isArray(tagValue) && tagValue.length === 0);
      if (isMissing) {
        this.detectedTags[tagName][MISSING_TAGS] ??= { pageUrls: [] };
        this.detectedTags[tagName][MISSING_TAGS].pageUrls.push(url);
      }
    });
  }

  /**
   * Checks if tag lengths are within recommended limits
   * and adds to detected tags if found lacking. Only the first H1 is checked.
   * @param {string} url - The URL of the page.
   * @param {object} pageTags - An object containing the tags of the page.
   */
  checkForTagsLength(url, pageTags) {
    const checkTag = (tagName, tagContent) => {
      if (tagContent === '') {
        this.detectedTags[tagName][EMPTY_TAGS] ??= { pageUrls: [] };
        this.detectedTags[tagName][EMPTY_TAGS].pageUrls.push(url);
      } else if (tagContent?.length > TAG_LENGTHS[tagName].maxLength
        || tagContent?.length < TAG_LENGTHS[tagName].minLength) {
        // NOTE(review): this keeps a single { url, tagContent } entry per tag name, so
        // every further offending page overwrites the previous one. The shape is left
        // as is because consumers of the audit result may depend on it - confirm
        // whether a list of entries is wanted here.
        this.detectedTags[tagName][LENGTH_CHECK_FAIL_TAGS] ??= {};
        this.detectedTags[tagName][LENGTH_CHECK_FAIL_TAGS].url = url;
        this.detectedTags[tagName][LENGTH_CHECK_FAIL_TAGS].tagContent = tagContent;
      }
    };
    checkTag(TITLE, pageTags[TITLE]);
    checkTag(DESCRIPTION, pageTags[DESCRIPTION]);
    // Optional chaining: a page without an h1 entry must not crash the audit.
    checkTag(H1, pageTags[H1]?.[0]);
  }

  /**
   * Checks if there are more than one H1 tags and adds to detected tags if so.
   * @param {string} url - The URL of the page.
   * @param {object} pageTags - An object containing the tags of the page.
   */
  checkForH1Count(url, pageTags) {
    if (pageTags[H1]?.length > 1) {
      this.detectedTags[H1][MULTIPLE_H1_COUNT] ??= [];
      this.detectedTags[H1][MULTIPLE_H1_COUNT].push({
        pageUrl: url,
        tagContent: JSON.stringify(pageTags[H1]),
      });
    }
  }

  /**
   * Checks for tag uniqueness and adds to detected tags if found lacking.
   * Content recorded for more than one page URL is reported as a duplicate.
   */
  checkForUniqueness() {
    [TITLE, DESCRIPTION, H1].forEach((tagName) => {
      Object.values(this.allTags[tagName]).forEach((value) => {
        if (value?.pageUrls?.size > 1) {
          this.detectedTags[tagName][DUPLICATE_TAGS] ??= [];
          this.detectedTags[tagName][DUPLICATE_TAGS].push({
            tagContent: value.tagContent,
            pageUrls: Array.from(value.pageUrls),
          });
        }
      });
    });
  }

  /**
   * Adds a tag data entry to the all-tags object. Content is compared
   * case-insensitively; the first spelling seen is the one that is kept.
   * @param {string} url - The URL of the page the tag was found on.
   * @param {string} tagName - The name of the tag.
   * @param {string} tagContent - The content of the tag; falsy content is ignored.
   */
  addToAllTags(url, tagName, tagContent) {
    if (!tagContent) {
      return;
    }
    const tagContentLowerCase = tagContent.toLowerCase();
    this.allTags[tagName][tagContentLowerCase] ??= {
      pageUrls: new Set(),
      tagContent,
    };
    this.allTags[tagName][tagContentLowerCase].pageUrls.add(url);
  }

  /**
   * Performs all per-page SEO checks on the provided tags.
   * @param {string} url - The URL of the page.
   * @param {object} pageTags - An object containing the tags of the page.
   */
  performChecks(url, pageTags) {
    this.checkForMissingTags(url, pageTags);
    this.checkForTagsLength(url, pageTags);
    this.checkForH1Count(url, pageTags);
    // Store tag data in the all-tags object to be used in later checks like uniqueness.
    // The page URL has to be passed along, otherwise nothing is recorded and
    // duplicates can never be detected.
    this.addToAllTags(url, TITLE, pageTags[TITLE]);
    this.addToAllTags(url, DESCRIPTION, pageTags[DESCRIPTION]);
    (pageTags[H1] ?? []).forEach((tagContent) => this.addToAllTags(url, H1, tagContent));
  }

  /**
   * Gets the detected tags for the site.
   * @returns {object} - The detected tags object.
   */
  getDetectedTags() {
    return this.detectedTags;
  }

  /**
   * Runs the checks that need the data of all pages; call after the last performChecks().
   */
  finalChecks() {
    this.checkForUniqueness();
  }
}
|
||
export default SeoChecks; |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
/* | ||
* Copyright 2024 Adobe. All rights reserved. | ||
* This file is licensed to you under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. You may obtain a copy | ||
* of the License at http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software distributed under | ||
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS | ||
* OF ANY KIND, either express or implied. See the License for the specific language | ||
* governing permissions and limitations under the License. | ||
*/ | ||
|
||
import { S3Client } from '@aws-sdk/client-s3'; | ||
|
||
/**
 * Wraps a handler so that an S3Client instance is available as `context.s3Client`.
 * A client already present on the context is reused; otherwise a new one is
 * created before the wrapped handler runs.
 *
 * @param {function(object, UniversalContext): Promise<Response>} fn - The handler to wrap.
 * @returns {function(object, UniversalContext): Promise<Response>} The wrapped handler.
 */
export default function s3Client(fn) {
  const withS3Client = async (request, context) => {
    const hasClient = Boolean(context.s3Client);
    if (hasClient) {
      return fn(request, context);
    }
    context.s3Client = new S3Client();
    return fn(request, context);
  };
  return withS3Client;
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you please add a functional comment above this definition?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure