From d19160c33b205931718696b075cc4bc8a9de15c8 Mon Sep 17 00:00:00 2001 From: alinarublea Date: Fri, 1 Dec 2023 15:24:53 +0100 Subject: [PATCH] feat: add support for 404 audits --- src/cwv/handler.js | 31 +++----- src/index.js | 2 + src/notfound/handler.js | 73 +++++++++++++++++++ src/support/utils.js | 17 +++++ test/audits/cwv.test.js | 4 +- test/audits/notfound.test.js | 99 ++++++++++++++++++++++++++ test/notfounddata.js | 134 +++++++++++++++++++++++++++++++++++ 7 files changed, 335 insertions(+), 25 deletions(-) create mode 100644 src/notfound/handler.js create mode 100644 test/audits/notfound.test.js create mode 100644 test/notfounddata.js diff --git a/src/cwv/handler.js b/src/cwv/handler.js index e308fa89..ccb8f5c3 100644 --- a/src/cwv/handler.js +++ b/src/cwv/handler.js @@ -11,18 +11,14 @@ */ import { createUrl, Response } from '@adobe/fetch'; -import { fetch } from '../support/utils.js'; +import { + DOMAIN_LIST_URL, DOMAIN_REQUEST_DEFAULT_PARAMS, fetch, getRUMUrl, PAGEVIEW_THRESHOLD, +} from '../support/utils.js'; -export const DEFAULT_PARAMS = { - interval: 7, - offset: 0, - limit: 101, -}; - -const DOMAIN_LIST_URL = 'https://helix-pages.anywhere.run/helix-services/run-query@v3/rum-dashboard'; - -// weekly pageview threshold to eliminate urls with lack of samples -const PAGEVIEW_THRESHOLD = 7000; +export function filterRUMData(data) { + return data.pageviews > PAGEVIEW_THRESHOLD // ignore the pages with low pageviews + && data.url.toLowerCase() !== 'other'; // ignore the combined result +} /** * url param in run-query@v3/rum-dashboard works in a 'startsWith' fashion. url=domain.com returns @@ -31,17 +27,6 @@ const PAGEVIEW_THRESHOLD = 7000; * @param url * @returns finalUrl {Promise} */ -export async function getRUMUrl(url) { - const urlWithScheme = url.startsWith('http') ? url : `https://${url}`; - const resp = await fetch(urlWithScheme); - const finalUrl = resp.url.split('://')[1]; - return finalUrl.endsWith('/') ? finalUrl.slice(0, -1) : /* c8 ignore next */ finalUrl; -} - -function filterRUMData(data) { - return data.pageviews > PAGEVIEW_THRESHOLD // ignore the pages with low pageviews - && data.url.toLowerCase() !== 'other'; // ignore the combined result -} function processRUMResponse(respJson) { return respJson?.results?.data @@ -68,7 +53,7 @@ export default async function auditCWV(message, context) { auditContext.finalUrl = finalUrl; const params = { - ...DEFAULT_PARAMS, + ...DOMAIN_REQUEST_DEFAULT_PARAMS, domainkey, url: finalUrl, }; diff --git a/src/index.js b/src/index.js index de9e4c54..c5a2288e 100644 --- a/src/index.js +++ b/src/index.js @@ -15,9 +15,11 @@ import { Response } from '@adobe/fetch'; import secrets from '@adobe/helix-shared-secrets'; import sqs from './support/sqs.js'; import cwv from './cwv/handler.js'; +import notfound from './notfound/handler.js'; const HANDLERS = { cwv, + 404: notfound, }; /** diff --git a/src/notfound/handler.js b/src/notfound/handler.js new file mode 100644 index 00000000..84613a56 --- /dev/null +++ b/src/notfound/handler.js @@ -0,0 +1,73 @@ +/* + * Copyright 2023 Adobe. All rights reserved. + * This file is licensed to you under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. You may obtain a copy + * of the License at http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS + * OF ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ + +import { createUrl, Response } from '@adobe/fetch'; +import { + DOMAIN_LIST_URL, DOMAIN_REQUEST_DEFAULT_PARAMS, fetch, getRUMUrl, PAGEVIEW_THRESHOLD, +} from '../support/utils.js'; + +export function filter404Data(data) { + return data.views > PAGEVIEW_THRESHOLD // ignore the pages with low pageviews + && data.url.toLowerCase() !== 'other'; // ignore the combined result +} +/** + * url param in run-query@v3/rum-dashboard works in a 'startsWith' fashion. url=domain.com returns + * an empty result whereas url=www.domain.com/ returns the desired result. To catch the redirects + * to subdomains we issue a GET call to the domain, then use the final url after redirects + * @param url + * @returns finalUrl {Promise} + */ + +function process404Response(respJson) { + return respJson?.results?.data + .filter(filter404Data) + .map((row) => ({ + url: row.url, + pageviews: row.views, + })); +} +export default async function audit404(message, context) { + const { type, url, auditContext } = message; + const { log, sqs } = context; + const { + AUDIT_RESULTS_QUEUE_URL: queueUrl, + RUM_DOMAIN_KEY: domainkey, + } = context.env; + + log.info(`Received audit req for domain: ${url}`); + + const finalUrl = await getRUMUrl(url); + auditContext.finalUrl = finalUrl; + + const params = { + ...DOMAIN_REQUEST_DEFAULT_PARAMS, + domainkey, + url: finalUrl, + checkpoint: 404, + }; + + const resp = await fetch(createUrl(DOMAIN_LIST_URL, params)); + const respJson = await resp.json(); + + const auditResult = process404Response(respJson); + + await sqs.sendMessage(queueUrl, { + type, + url, + auditContext, + auditResult, + }); + + log.info(`Successfully audited ${url} for ${type} type audit`); + + return new Response(''); +} diff --git a/src/support/utils.js b/src/support/utils.js index 34e73318..9e058b45 100644 --- a/src/support/utils.js +++ b/src/support/utils.js @@ -11,7 +11,24 @@ */ import { context as h2, h1 } from '@adobe/fetch'; +export const PAGEVIEW_THRESHOLD = 7000; +export const DOMAIN_LIST_URL = 'https://helix-pages.anywhere.run/helix-services/run-query@v3/rum-dashboard'; + +export const DOMAIN_REQUEST_DEFAULT_PARAMS = { + interval: 7, + offset: 0, + limit: 101, +}; /* c8 ignore next 3 */ export const { fetch } = process.env.HELIX_FETCH_FORCE_HTTP1 ? h1() : h2(); + +// weekly pageview threshold to eliminate urls with lack of samples + +export async function getRUMUrl(url) { + const urlWithScheme = url.startsWith('http') ? url : `https://${url}`; + const resp = await fetch(urlWithScheme); + const finalUrl = resp.url.split('://')[1]; + return finalUrl.endsWith('/') ? finalUrl.slice(0, -1) : /* c8 ignore next */ finalUrl; +} diff --git a/test/audits/cwv.test.js b/test/audits/cwv.test.js index 351db037..a4a04cf5 100644 --- a/test/audits/cwv.test.js +++ b/test/audits/cwv.test.js @@ -19,7 +19,7 @@ import sinonChai from 'sinon-chai'; import { Request } from '@adobe/fetch'; import nock from 'nock'; import { main } from '../../src/index.js'; -import { DEFAULT_PARAMS, getRUMUrl } from '../../src/cwv/handler.js'; +import { DOMAIN_REQUEST_DEFAULT_PARAMS, getRUMUrl } from '../../src/support/utils.js'; import { expectedAuditResult, rumData } from '../rum-data.js'; chai.use(sinonChai); @@ -68,7 +68,7 @@ describe('Index Tests', () => { nock('https://helix-pages.anywhere.run') .get('/helix-services/run-query@v3/rum-dashboard') .query({ - ...DEFAULT_PARAMS, + ...DOMAIN_REQUEST_DEFAULT_PARAMS, domainkey: context.env.RUM_DOMAIN_KEY, url: 'adobe.com', }) diff --git a/test/audits/notfound.test.js b/test/audits/notfound.test.js new file mode 100644 index 00000000..0f62a80a --- /dev/null +++ b/test/audits/notfound.test.js @@ -0,0 +1,99 @@ +/* + * Copyright 2023 Adobe. All rights reserved. + * This file is licensed to you under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. You may obtain a copy + * of the License at http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS + * OF ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ + +/* eslint-env mocha */ +/* eslint-disable no-unused-expressions */ // expect statements + +import chai from 'chai'; +import sinon from 'sinon'; +import sinonChai from 'sinon-chai'; +import { Request } from '@adobe/fetch'; +import nock from 'nock'; +import { main } from '../../src/index.js'; +import { DOMAIN_REQUEST_DEFAULT_PARAMS, getRUMUrl } from '../../src/support/utils.js'; +import { notFoundData, expectedAuditResult } from '../notfounddata.js'; + +chai.use(sinonChai); +const { expect } = chai; + +const sandbox = sinon.createSandbox(); +describe('Index Tests', () => { + const request = new Request('https://space.cat'); + let context; + let messageBodyJson; + + beforeEach('setup', () => { + messageBodyJson = { + type: '404', + url: 'adobe.com', + auditContext: { + finalUrl: 'adobe.com', + }, + }; + context = { + log: console, + runtime: { + region: 'us-east-1', + }, + env: { + AUDIT_RESULTS_QUEUE_URL: 'queueUrl', + RUM_DOMAIN_KEY: 'domainkey', + }, + invocation: { + event: { + Records: [{ + body: JSON.stringify(messageBodyJson), + }], + }, + }, + sqs: { + sendMessage: sandbox.stub().resolves(), + }, + }; + }); + + it('fetch cwv for base url > process > send results', async () => { + nock('https://adobe.com') + .get('/') + .reply(200); + nock('https://helix-pages.anywhere.run') + .get('/helix-services/run-query@v3/rum-dashboard') + .query({ + ...DOMAIN_REQUEST_DEFAULT_PARAMS, + domainkey: context.env.RUM_DOMAIN_KEY, + checkpoint: 404, + url: 'adobe.com', + }) + .reply(200, notFoundData); + + const resp = await main(request, context); + + const expectedMessage = { + ...messageBodyJson, + auditResult: expectedAuditResult, + }; + + expect(resp.status).to.equal(200); + expect(context.sqs.sendMessage).to.have.been.calledOnce; + expect(context.sqs.sendMessage).to.have.been + .calledWith(context.env.AUDIT_RESULTS_QUEUE_URL, expectedMessage); + }); + + it('getRUMUrl do not add scheme to urls with a scheme already', async () => { + nock('http://space.cat') + .get('/') + .reply(200); + + const finalUrl = await getRUMUrl('http://space.cat'); + expect(finalUrl).to.eql('space.cat'); + }); +}); diff --git a/test/notfounddata.js b/test/notfounddata.js new file mode 100644 index 00000000..fd691fb6 --- /dev/null +++ b/test/notfounddata.js @@ -0,0 +1,134 @@ +/* + * Copyright 2023 Adobe. All rights reserved. + * This file is licensed to you under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. You may obtain a copy + * of the License at http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS + * OF ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ +export const expectedAuditResult = [ + { + url: 'https://www.adobe.com/kr/acrobat/hub/how-to/how-to-convert-pdf-to-image.html', + pageviews: '10000', + }, + { + url: 'https://www.adobe.com/sea/acrobat/online/merge-pdf.html', + pageviews: '8000', + }, +]; +export const notFoundData = { + ':names': [ + 'results', + 'meta', + ], + ':type': 'multi-sheet', + ':version': 3, + results: { + limit: 30, + offset: 0, + total: 30, + data: [ + { + ids: 7, + views: '10000', + actions: '10000', + url: 'https://www.adobe.com/kr/acrobat/hub/how-to/how-to-convert-pdf-to-image.html', + checkpoint: '404', + source: 'https://post.naver.com/viewer/postView.naver', + actions_per_view: '1', + }, + { + ids: 4, + views: '8000', + actions: '8000', + url: 'https://www.adobe.com/sea/acrobat/online/merge-pdf.html', + checkpoint: '404', + source: 'https://brandinside.asia/', + actions_per_view: '1', + }, + { + ids: 4, + views: '700', + actions: '400', + url: 'https://www.adobe.com/sea/acrobat/online/pdf-to-word.html', + checkpoint: '404', + source: '', + actions_per_view: '1', + }, + ], + columns: [ + 'ids', + 'views', + 'actions', + 'url', + 'checkpoint', + 'source', + 'actions_per_view', + ], + }, + meta: { + limit: 10, + offset: 0, + total: 10, + columns: [ + 'name', + 'value', + 'type', + ], + data: [ + { + name: 'description', + value: 'Get popularity data for RUM target attribute values, filtered by checkpoint', + type: 'query description', + }, + { + name: 'limit', + value: 30, + type: 'request parameter', + }, + { + name: 'interval', + value: '30', + type: 'request parameter', + }, + { + name: 'offset', + value: '0', + type: 'request parameter', + }, + { + name: 'startdate', + value: '2022-01-01', + type: 'request parameter', + }, + { + name: 'enddate', + value: '2022-01-31', + type: 'request parameter', + }, + { + name: 'timezone', + value: 'UTC', + type: 'request parameter', + }, + { + name: 'url', + value: 'www.adobe.com', + type: 'request parameter', + }, + { + name: 'checkpoint', + value: 404, + type: 'request parameter', + }, + { + name: 'source', + value: '-', + type: 'request parameter', + }, + ], + }, +};