feat: introduce genAI-generated broken backlink fixes #348

Open
wants to merge 10 commits into main
37 changes: 20 additions & 17 deletions src/backlinks/handler.js
@@ -18,6 +18,7 @@ import AhrefsAPIClient from '@adobe/spacecat-shared-ahrefs-client';
import { AbortController, AbortError } from '@adobe/fetch';
import { retrieveSiteBySiteId } from '../utils/data-access.js';
import { enhanceBacklinksWithFixes, fetch } from '../support/utils.js';
import { obtainSitemapUrls } from '../sitemap/handler.js';

const TIMEOUT = 3000;

@@ -64,6 +65,8 @@ export default async function auditBrokenBacklinks(message, context) {
const { type, url: siteId, auditContext = {} } = message;
const { dataAccess, log, sqs } = context;
const {
AWS_REGION: region,
SPACECAT_STATISTICS_LAMBDA_ARN: statisticsServiceArn,
AUDIT_RESULTS_QUEUE_URL: queueUrl,
} = context.env;

@@ -100,23 +103,7 @@ export default async function auditBrokenBacklinks(message, context) {
const filteredBacklinks = result?.backlinks?.filter(
(backlink) => !excludedURLs?.includes(backlink.url_to),
);
let brokenBacklinks = await filterOutValidBacklinks(filteredBacklinks, log);

if (configuration.isHandlerEnabledForSite(`${type}-auto-suggest`, site)) {
try {
const topPages = await dataAccess.getTopPagesForSite(siteId, 'ahrefs', 'global');
const keywords = topPages.map(
(page) => ({
url: page.getURL(),
keyword: page.getTopKeyword(),
traffic: page.getTraffic(),
}),
);
brokenBacklinks = enhanceBacklinksWithFixes(brokenBacklinks, keywords, log);
} catch (e) {
log.error(`Enhancing backlinks with fixes for siteId ${siteId} failed with error: ${e.message}`, e);
}
}
const brokenBacklinks = await filterOutValidBacklinks(filteredBacklinks, log);

auditResult = {
finalUrl: auditContext.finalUrl,
@@ -146,8 +133,24 @@ export default async function auditBrokenBacklinks(message, context) {
auditContext,
auditResult,
};

await sqs.sendMessage(queueUrl, data);

const baseUrl = site.getBaseURL();
const sitemaps = await obtainSitemapUrls(baseUrl, log);
if (sitemaps?.success && sitemaps?.paths) {
await enhanceBacklinksWithFixes(
siteId,
auditResult.brokenBacklinks,
Object.keys(sitemaps.paths),
{
region,
statisticsServiceArn,
log,
},
);
}

log.info(`Successfully audited ${siteId} for ${type} type audit`);
return noContent();
} catch (e) {
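With this change, `auditBrokenBacklinks` reads two additional environment values, `AWS_REGION` and `SPACECAT_STATISTICS_LAMBDA_ARN`, next to the existing `AUDIT_RESULTS_QUEUE_URL`. A minimal sketch of the context shape the handler destructures, using placeholder values; the real context is assembled by the audit worker runtime, not by this PR:

```js
// Sketch of the context auditBrokenBacklinks expects after this change.
// Every value here is an illustrative placeholder.
const context = {
  dataAccess: {}, // spacecat data access layer (placeholder)
  log: console, // any console-compatible logger
  sqs: {
    sendMessage: async (queueUrl, data) => console.info(`queued to ${queueUrl}`, data),
  },
  env: {
    AWS_REGION: 'us-east-1',
    SPACECAT_STATISTICS_LAMBDA_ARN:
      'arn:aws:lambda:us-east-1:000000000000:function:spacecat-statistics-service',
    AUDIT_RESULTS_QUEUE_URL:
      'https://sqs.us-east-1.amazonaws.com/000000000000/spacecat-audit-results',
  },
};

export default context;
```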
4 changes: 2 additions & 2 deletions src/sitemap/handler.js
@@ -253,7 +253,7 @@ export async function getBaseUrlPagesFromSitemaps(baseUrl, urls) {
* @param log
* @returns {Promise<{success: boolean, reasons: Array<{value}>, paths?: any}>} result of the sitemap discovery
*/
export async function findSitemap(inputUrl, log) {
export async function obtainSitemapUrls(inputUrl, log) {
const logMessages = [];

const parsedUrl = extractDomainAndProtocol(inputUrl);
@@ -334,7 +334,7 @@ export async function sitemapAuditRunner(baseURL, context) {
const { log } = context;
log.info(`Received sitemap audit request for ${baseURL}`);
const startTime = process.hrtime();
const auditResult = await findSitemap(baseURL, log);
const auditResult = await obtainSitemapUrls(baseURL, log);

const endTime = process.hrtime(startTime);
const elapsedSeconds = endTime[0] + endTime[1] / 1e9;
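Renaming `findSitemap` to `obtainSitemapUrls` is what lets the backlinks handler reuse sitemap discovery. A hedged usage sketch based on the documented result shape (`{ success, reasons, paths }`); the base URL is a placeholder and `console` stands in for the worker's logger:

```js
import { obtainSitemapUrls } from './src/sitemap/handler.js';

// Placeholder site; any base URL with a discoverable sitemap behaves the same way.
const result = await obtainSitemapUrls('https://www.example.com', console);

if (result.success && result.paths) {
  // paths is keyed by sitemap URL, which is why the backlinks handler
  // passes Object.keys(result.paths) as the site's sitemap paths.
  console.info('sitemaps found:', Object.keys(result.paths));
} else {
  console.warn('no usable sitemap found:', result.reasons);
}
```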
106 changes: 48 additions & 58 deletions src/support/utils.js
@@ -15,6 +15,7 @@ import { hasText, prependSchema, resolveCustomerSecretsName } from '@adobe/space
import URI from 'urijs';
import { JSDOM } from 'jsdom';
import { GetSecretValueCommand, SecretsManagerClient } from '@aws-sdk/client-secrets-manager';
import { InvokeCommand, LambdaClient } from '@aws-sdk/client-lambda';

URI.preventInvalidHostname = true;

@@ -216,69 +217,58 @@ export const extractKeywordsFromUrl = (url, log) => {
};

/**
* Processes broken backlinks to find suggested URLs based on keywords.
*
* @param {Array} brokenBacklinks - The array of broken backlink objects to process.
* @param {Array} keywords - The array of keyword objects to match against.
* @param {Object} log - The logger object for logging messages.
* @returns {Array} A new array of backlink objects with suggested URLs added.
* Enhances the backlinks with fixes by triggering a Lambda function that calculates them.
* @param siteId - The site ID.
* @param brokenBacklinks - The broken backlinks.
* @param sitemapPaths - Paths of all sitemaps of the site.
* @param config - The configuration object.
* @param config.region - The AWS region.
* @param config.statisticsServiceArn - The ARN of the statistics service Lambda function.
* @param config.log - The logger.
* @returns {Promise<{status: string}>}
*/
export const enhanceBacklinksWithFixes = (brokenBacklinks, keywords, log) => {
const result = [];

for (const backlink of brokenBacklinks) {
log.info(`trying to find redirect for: ${backlink.url_to}`);
const extractedKeywords = extractKeywordsFromUrl(backlink.url_to, log);

const matchedData = [];

// Match keywords and include rank in the matched data
keywords.forEach((entry) => {
const matchingKeyword = extractedKeywords.find(
(keywordObj) => {
const regex = new RegExp(`\\b${keywordObj.keyword}\\b`, 'i');
return regex.test(entry.keyword);
},
);
if (matchingKeyword) {
matchedData.push({ ...entry, rank: matchingKeyword.rank });
}
export async function enhanceBacklinksWithFixes(siteId, brokenBacklinks, sitemapPaths, config) {
const {
region, statisticsServiceArn, log,
} = config;
log.info(`Enhancing backlinks with fixes for site ${siteId}`);

const client = new LambdaClient({ region });

const invokeLambdaForBatch = async (batch) => {
const payload = {
type: 'broken-backlinks',
payload: {
siteId,
brokenBacklinks: batch,
sitemapPaths,
},
};

const command = new InvokeCommand({
FunctionName: statisticsServiceArn,
Payload: JSON.stringify(payload),
InvocationType: 'Event',
});

// Try again with split keywords if no matches found
if (matchedData.length === 0) {
const splitKeywords = extractedKeywords
.map((keywordObj) => keywordObj.keyword.split(' ').map((k) => ({ keyword: k, rank: keywordObj.rank })))
.flat();

splitKeywords.forEach((keywordObj) => {
keywords.forEach((entry) => {
const regex = new RegExp(`\\b${keywordObj.keyword}\\b`, 'i');
if (regex.test(entry.keyword)) {
matchedData.push({ ...entry, rank: keywordObj.rank });
}
});
});
try {
await client.send(command);
log.info(`Lambda function ${statisticsServiceArn} invoked successfully for batch.`);
} catch (error) {
log.error(`Error invoking Lambda function ${statisticsServiceArn} for batch:`, error);
}
};

// Sort by rank and then by traffic
matchedData.sort((a, b) => {
if (b.rank === a.rank) {
return b.traffic - a.traffic; // Higher traffic ranks first
}
return a.rank - b.rank; // Higher rank ranks first (1 is highest)
});
// Invoke Lambda in batches of 10
const batchSize = 10;
const promises = [];

const newBacklink = { ...backlink };
for (let i = 0; i < brokenBacklinks.length; i += batchSize) {
const batch = brokenBacklinks.slice(i, i + batchSize);
promises.push(invokeLambdaForBatch(batch));
}

if (matchedData.length > 0) {
log.info(`found ${matchedData.length} keywords for backlink ${backlink.url_to}`);
newBacklink.url_suggested = matchedData[0].url;
} else {
log.info(`could not find suggested URL for backlink ${backlink.url_to} with keywords ${extractedKeywords.map((k) => k.keyword).join(', ')}`);
}
await Promise.all(promises);

result.push(newBacklink);
}
return result;
};
return { status: `Lambda function invoked for ${promises.length} batch(es)` };
}
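Because the invocation type is `Event`, the worker fires these batches asynchronously and does not wait for the suggestions to come back. The statistics service itself is not part of this PR, so the following is only an assumed sketch of how a receiving Lambda could unpack the event shape produced above (`{ type, payload: { siteId, brokenBacklinks, sitemapPaths } }`); the handler name and the fix-generation step are hypothetical:

```js
// Hypothetical receiver in the statistics service; only the event shape is taken from this PR.
export async function handler(event) {
  const { type, payload } = event;
  if (type !== 'broken-backlinks') {
    return { status: `ignored event of type ${type}` };
  }

  const { siteId, brokenBacklinks, sitemapPaths } = payload;
  console.info(
    `received ${brokenBacklinks.length} broken backlinks for site ${siteId} `
    + `with ${sitemapPaths.length} sitemap paths`,
  );

  // Actual fix generation (e.g. genAI-based URL suggestions) would happen here.
  return { status: 'accepted' };
}
```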