From e9f29c617dbedc73eebc538d6d0e01693f9ab822 Mon Sep 17 00:00:00 2001 From: Maxim Tsoy Date: Tue, 26 Nov 2024 09:49:14 +0100 Subject: [PATCH 01/10] Update autoconsent to 12.0.0 --- package-lock.json | 158 +++++++++++++++++++++++++++++++++++++++++++--- package.json | 2 +- 2 files changed, 150 insertions(+), 10 deletions(-) diff --git a/package-lock.json b/package-lock.json index 7f068427..b5c065fe 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,7 +9,7 @@ "version": "1.0.0", "license": "Apache 2.0", "dependencies": { - "@duckduckgo/autoconsent": "^10.15.0", + "@duckduckgo/autoconsent": "^12.0.0", "async": "^2.6.1", "chalk": "^2.4.1", "clickhouse": "^2.6.0", @@ -40,11 +40,14 @@ } }, "node_modules/@duckduckgo/autoconsent": { - "version": "10.15.0", - "resolved": "https://registry.npmjs.org/@duckduckgo/autoconsent/-/autoconsent-10.15.0.tgz", - "integrity": "sha512-Jxaogy2IuZEEV1+xPyo3c3PnZJmBO6ima/MapF2VolI/IKxXnL+9yYqyydPhSk0ahx42YINA6uIK6zexlKDIkQ==", + "version": "12.0.0", + "resolved": "https://registry.npmjs.org/@duckduckgo/autoconsent/-/autoconsent-12.0.0.tgz", + "integrity": "sha512-ObPv0pE1d8G1Nnj9NtTxvu04mNRnJ8o8cpU1G0YJk1kP2ykipGnlp9wrn9Tkn2fDy5zT8HQqDsOYWEyk3ES1kg==", + "license": "MPL-2.0", "dependencies": { - "tldts-experimental": "^6.1.37" + "@ghostery/adblocker": "^2.0.4", + "@ghostery/adblocker-content": "^2.0.4", + "tldts-experimental": "^6.1.41" } }, "node_modules/@eslint-community/eslint-utils": { @@ -115,6 +118,35 @@ "node": "^12.22.0 || ^14.17.0 || >=16.0.0" } }, + "node_modules/@ghostery/adblocker": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/@ghostery/adblocker/-/adblocker-2.1.1.tgz", + "integrity": "sha512-FL4yWrpNTCmtbAfeLotUoo94ZyNqHdZpZRo4Qlk0guPzDGcOtW4/c84UzS9D/Z9Z4H3nWSCrW0q38pjwAbDykA==", + "license": "MPL-2.0", + "dependencies": { + "@ghostery/adblocker-content": "^2.1.1", + "@ghostery/adblocker-extended-selectors": "^2.1.1", + "@remusao/guess-url-type": "^1.3.0", + "@remusao/small": "^1.2.1", + "@remusao/smaz": "^1.9.1", + "tldts-experimental": "^6.0.14" + } + }, + "node_modules/@ghostery/adblocker-content": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/@ghostery/adblocker-content/-/adblocker-content-2.1.1.tgz", + "integrity": "sha512-1DKHmPnlQleXapaL36xZOwwZmpdbjMP/IcWdTTzyriyCDIFlSwBDT1DJ3xg0TK61ahZMEwz1MnTGM6X99z/5rQ==", + "license": "MPL-2.0", + "dependencies": { + "@ghostery/adblocker-extended-selectors": "^2.1.1" + } + }, + "node_modules/@ghostery/adblocker-extended-selectors": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/@ghostery/adblocker-extended-selectors/-/adblocker-extended-selectors-2.1.1.tgz", + "integrity": "sha512-jEHjU2CarS2MtRYfm/6iTKMS1DVzepuwXSMKg1zTyHl+u4ZKvKNYFK7plD0nUlL5a8akyRkYwLheXnKsW3nChQ==", + "license": "MPL-2.0" + }, "node_modules/@humanwhocodes/config-array": { "version": "0.11.14", "resolved": "https://registry.npmjs.org/@humanwhocodes/config-array/-/config-array-0.11.14.tgz", @@ -185,6 +217,49 @@ "node": ">= 8" } }, + "node_modules/@remusao/guess-url-type": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/@remusao/guess-url-type/-/guess-url-type-1.3.0.tgz", + "integrity": "sha512-SNSJGxH5ckvxb3EUHj4DqlAm/bxNxNv2kx/AESZva/9VfcBokwKNS+C4D1lQdWIDM1R3d3UG+xmVzlkNG8CPTQ==", + "license": "MPL-2.0" + }, + "node_modules/@remusao/small": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/@remusao/small/-/small-1.3.0.tgz", + "integrity": "sha512-bydAhJI+ywmg5xMUcbqoR8KahetcfkFywEZpsyFZ8EBofilvWxbXnMSe4vnjDI1Y+SWxnNhR4AL/2BAXkf4b8A==", + "license": "MPL-2.0" + }, + "node_modules/@remusao/smaz": { + "version": "1.10.0", + "resolved": "https://registry.npmjs.org/@remusao/smaz/-/smaz-1.10.0.tgz", + "integrity": "sha512-GQzCxmmMpLkyZwcwNgz8TpuBEWl0RUQa8IcvKiYlPxuyYKqyqPkCr0hlHI15ckn3kDUPS68VmTVgyPnLNrdVmg==", + "license": "MPL-2.0", + "dependencies": { + "@remusao/smaz-compress": "^1.10.0", + "@remusao/smaz-decompress": "^1.10.0" + } + }, + "node_modules/@remusao/smaz-compress": { + "version": "1.10.0", + "resolved": "https://registry.npmjs.org/@remusao/smaz-compress/-/smaz-compress-1.10.0.tgz", + "integrity": "sha512-E/lC8OSU+3bQrUl64vlLyPzIxo7dxF2RvNBe9KzcM4ax43J/d+YMinmMztHyCIHqRbz7rBCtkp3c0KfeIbHmEg==", + "license": "MPL-2.0", + "dependencies": { + "@remusao/trie": "^1.5.0" + } + }, + "node_modules/@remusao/smaz-decompress": { + "version": "1.10.0", + "resolved": "https://registry.npmjs.org/@remusao/smaz-decompress/-/smaz-decompress-1.10.0.tgz", + "integrity": "sha512-aA5ImUH480Pcs5/cOgToKmFnzi7osSNG6ft+7DdmQTaQEEst3nLq3JLlBEk+gwidURymjbx6DYs60LHaZ415VQ==", + "license": "MPL-2.0" + }, + "node_modules/@remusao/trie": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@remusao/trie/-/trie-1.5.0.tgz", + "integrity": "sha512-UX+3utJKgwCsg6sUozjxd38gNMVRXrY4TNX9VvCdSrlZBS1nZjRPi98ON3QjRAdf6KCguJFyQARRsulTeqQiPg==", + "license": "MPL-2.0" + }, "node_modules/@types/async": { "version": "2.4.2", "resolved": "https://registry.npmjs.org/@types/async/-/async-2.4.2.tgz", @@ -3367,11 +3442,13 @@ }, "dependencies": { "@duckduckgo/autoconsent": { - "version": "10.15.0", - "resolved": "https://registry.npmjs.org/@duckduckgo/autoconsent/-/autoconsent-10.15.0.tgz", - "integrity": "sha512-Jxaogy2IuZEEV1+xPyo3c3PnZJmBO6ima/MapF2VolI/IKxXnL+9yYqyydPhSk0ahx42YINA6uIK6zexlKDIkQ==", + "version": "12.0.0", + "resolved": "https://registry.npmjs.org/@duckduckgo/autoconsent/-/autoconsent-12.0.0.tgz", + "integrity": "sha512-ObPv0pE1d8G1Nnj9NtTxvu04mNRnJ8o8cpU1G0YJk1kP2ykipGnlp9wrn9Tkn2fDy5zT8HQqDsOYWEyk3ES1kg==", "requires": { - "tldts-experimental": "^6.1.37" + "@ghostery/adblocker": "^2.0.4", + "@ghostery/adblocker-content": "^2.0.4", + "tldts-experimental": "^6.1.41" } }, "@eslint-community/eslint-utils": { @@ -3420,6 +3497,32 @@ "integrity": "sha512-Ys+3g2TaW7gADOJzPt83SJtCDhMjndcDMFVQ/Tj9iA1BfJzFKD9mAUXT3OenpuPHbI6P/myECxRJrofUsDx/5g==", "dev": true }, + "@ghostery/adblocker": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/@ghostery/adblocker/-/adblocker-2.1.1.tgz", + "integrity": "sha512-FL4yWrpNTCmtbAfeLotUoo94ZyNqHdZpZRo4Qlk0guPzDGcOtW4/c84UzS9D/Z9Z4H3nWSCrW0q38pjwAbDykA==", + "requires": { + "@ghostery/adblocker-content": "^2.1.1", + "@ghostery/adblocker-extended-selectors": "^2.1.1", + "@remusao/guess-url-type": "^1.3.0", + "@remusao/small": "^1.2.1", + "@remusao/smaz": "^1.9.1", + "tldts-experimental": "^6.0.14" + } + }, + "@ghostery/adblocker-content": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/@ghostery/adblocker-content/-/adblocker-content-2.1.1.tgz", + "integrity": "sha512-1DKHmPnlQleXapaL36xZOwwZmpdbjMP/IcWdTTzyriyCDIFlSwBDT1DJ3xg0TK61ahZMEwz1MnTGM6X99z/5rQ==", + "requires": { + "@ghostery/adblocker-extended-selectors": "^2.1.1" + } + }, + "@ghostery/adblocker-extended-selectors": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/@ghostery/adblocker-extended-selectors/-/adblocker-extended-selectors-2.1.1.tgz", + "integrity": "sha512-jEHjU2CarS2MtRYfm/6iTKMS1DVzepuwXSMKg1zTyHl+u4ZKvKNYFK7plD0nUlL5a8akyRkYwLheXnKsW3nChQ==" + }, "@humanwhocodes/config-array": { "version": "0.11.14", "resolved": "https://registry.npmjs.org/@humanwhocodes/config-array/-/config-array-0.11.14.tgz", @@ -3469,6 +3572,43 @@ "fastq": "^1.6.0" } }, + "@remusao/guess-url-type": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/@remusao/guess-url-type/-/guess-url-type-1.3.0.tgz", + "integrity": "sha512-SNSJGxH5ckvxb3EUHj4DqlAm/bxNxNv2kx/AESZva/9VfcBokwKNS+C4D1lQdWIDM1R3d3UG+xmVzlkNG8CPTQ==" + }, + "@remusao/small": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/@remusao/small/-/small-1.3.0.tgz", + "integrity": "sha512-bydAhJI+ywmg5xMUcbqoR8KahetcfkFywEZpsyFZ8EBofilvWxbXnMSe4vnjDI1Y+SWxnNhR4AL/2BAXkf4b8A==" + }, + "@remusao/smaz": { + "version": "1.10.0", + "resolved": "https://registry.npmjs.org/@remusao/smaz/-/smaz-1.10.0.tgz", + "integrity": "sha512-GQzCxmmMpLkyZwcwNgz8TpuBEWl0RUQa8IcvKiYlPxuyYKqyqPkCr0hlHI15ckn3kDUPS68VmTVgyPnLNrdVmg==", + "requires": { + "@remusao/smaz-compress": "^1.10.0", + "@remusao/smaz-decompress": "^1.10.0" + } + }, + "@remusao/smaz-compress": { + "version": "1.10.0", + "resolved": "https://registry.npmjs.org/@remusao/smaz-compress/-/smaz-compress-1.10.0.tgz", + "integrity": "sha512-E/lC8OSU+3bQrUl64vlLyPzIxo7dxF2RvNBe9KzcM4ax43J/d+YMinmMztHyCIHqRbz7rBCtkp3c0KfeIbHmEg==", + "requires": { + "@remusao/trie": "^1.5.0" + } + }, + "@remusao/smaz-decompress": { + "version": "1.10.0", + "resolved": "https://registry.npmjs.org/@remusao/smaz-decompress/-/smaz-decompress-1.10.0.tgz", + "integrity": "sha512-aA5ImUH480Pcs5/cOgToKmFnzi7osSNG6ft+7DdmQTaQEEst3nLq3JLlBEk+gwidURymjbx6DYs60LHaZ415VQ==" + }, + "@remusao/trie": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@remusao/trie/-/trie-1.5.0.tgz", + "integrity": "sha512-UX+3utJKgwCsg6sUozjxd38gNMVRXrY4TNX9VvCdSrlZBS1nZjRPi98ON3QjRAdf6KCguJFyQARRsulTeqQiPg==" + }, "@types/async": { "version": "2.4.2", "resolved": "https://registry.npmjs.org/@types/async/-/async-2.4.2.tgz", diff --git a/package.json b/package.json index 94064e82..be7afaab 100644 --- a/package.json +++ b/package.json @@ -41,7 +41,7 @@ "typescript": "^4.6.4" }, "dependencies": { - "@duckduckgo/autoconsent": "^10.15.0", + "@duckduckgo/autoconsent": "^12.0.0", "async": "^2.6.1", "chalk": "^2.4.1", "clickhouse": "^2.6.0", From 5afdd7c71ac994f01a31b24e8f85212341ca0c87 Mon Sep 17 00:00:00 2001 From: Maxim Tsoy Date: Thu, 28 Nov 2024 21:32:41 +0100 Subject: [PATCH 02/10] CMPCollector relies on heuristics provided by autoconsent --- collectors/CMPCollector.js | 131 +++++++++----------------------- reporters/ClickhouseReporter.js | 39 +++++++--- 2 files changed, 61 insertions(+), 109 deletions(-) diff --git a/collectors/CMPCollector.js b/collectors/CMPCollector.js index 611482c8..f9b40b39 100644 --- a/collectors/CMPCollector.js +++ b/collectors/CMPCollector.js @@ -15,12 +15,12 @@ const BaseCollector = require('./BaseCollector'); * @typedef { import('@duckduckgo/autoconsent/lib/messages').OptOutResultMessage } OptOutResultMessage * @typedef { import('@duckduckgo/autoconsent/lib/messages').OptInResultMessage } OptInResultMessage * @typedef { import('@duckduckgo/autoconsent/lib/messages').DoneMessage } DoneMessage - * @typedef { { snippets: string[], patterns: string[] } } ScanResult + * @typedef { { snippets: Set, patterns: Set, filterListMatched: boolean } } ScanResult */ // @ts-ignore const baseContentScript = fs.readFileSync( - require.resolve('@duckduckgo/autoconsent/dist/autoconsent.playwright.js'), + require.resolve('../node_modules/@duckduckgo/autoconsent/dist/autoconsent.playwright.js'), 'utf8' ); @@ -46,32 +46,6 @@ function isIgnoredEvalError(e) { ); } -// TODO: check for false positive detections per pattern -const DETECT_PATTERNS = [ - /accept cookies/ig, - /accept all/ig, - /reject all/ig, - /only necessary cookies/ig, // "only necessary" is probably too broad - /by clicking.*(accept|agree|allow)/ig, - /by continuing/ig, - /we (use|serve)( optional)? cookies/ig, - /we are using cookies/ig, - /use of cookies/ig, - /(this|our) (web)?site.*cookies/ig, - /cookies (and|or) .* technologies/ig, - /such as cookies/ig, - /read more about.*cookies/ig, - /consent to.*cookies/ig, - /we and our partners.*cookies/ig, - /we.*store.*information.*such as.*cookies/ig, - /store and\/or access information.*on a device/ig, - /personalised ads and content, ad and content measurement/ig, - - // it might be tempting to add the patterns below, but they cause too many false positives. Don't do it :) - // /cookies? settings/i, - // /cookies? preferences/i, -]; - class CMPCollector extends BaseCollector { id() { return 'cmps'; @@ -88,12 +62,12 @@ class CMPCollector extends BaseCollector { this.receivedMsgs = []; this.selfTestFrame = null; this.isolated2pageworld = new Map(); - this.pendingScan = createDeferred(); this.context = options.context; /** @type {ScanResult} */ this.scanResult = { - snippets: [], - patterns: [], + snippets: new Set([]), + patterns: new Set([]), + filterListMatched: false, }; } @@ -189,10 +163,12 @@ class CMPCollector extends BaseCollector { /** @type {Partial} */ const autoconsentConfig = { enabled: true, - autoAction: null, // we request action explicitly later + autoAction: 'optOut', disabledCmps: [], enablePrehide: false, enableCosmeticRules: true, + enableFilterList: true, + enableHeuristicDetection: true, detectRetries: 20, isMainWorld: false }; @@ -203,14 +179,17 @@ class CMPCollector extends BaseCollector { break; } case 'popupFound': - if (this.autoAction) { - await this.pendingScan.promise; // wait for the pattern detection first - await this._cdpClient.send('Runtime.evaluate', { - expression: `autoconsentReceiveMessage({ type: "${this.autoAction}" })`, - contextId: executionContextId, - }); + console.log('popupFound', msg); + if (msg.cmp === 'filterList') { + this.scanResult.filterListMatched = true; } break; + case 'report': + msg.state.heuristicPatterns.forEach(x => this.scanResult.patterns.add(x)); + msg.state.heuristicSnippets.forEach(x => this.scanResult.snippets.add(x)); + // console.log('report', msg.state); + console.log('scanResult', this.scanResult); + break; case 'optInResult': case 'optOutResult': { if (msg.scheduleSelfTest) { @@ -315,44 +294,6 @@ class CMPCollector extends BaseCollector { } } - async postLoad() { - /** - * @type {string[]} - */ - const foundPatterns = []; - const foundSnippets = []; - const pages = await this.context.pages(); - if (pages.length > 0) { - const page = pages[0]; - /** - * @type {Promise[]} - */ - const promises = []; - page.frames().forEach(frame => { - // eslint-disable-next-line no-undef - promises.push(frame.evaluate(() => document.documentElement.innerText).catch(reason => { - this.log(`error retrieving text: ${reason}`); - // ignore exceptions - return ''; - })); - }); - const texts = await Promise.all(promises); - const allTexts = texts.join('\n'); - for (const p of DETECT_PATTERNS) { - const matches = allTexts.match(p); - if (matches) { - foundPatterns.push(p.toString()); - foundSnippets.push(...matches.map(m => m.substring(0, 200))); - } - } - } - this.pendingScan.resolve(); - this.scanResult = { - patterns: foundPatterns, - snippets: Array.from(new Set(foundSnippets)), - }; - } - /** * @returns {CMPResult[]} */ @@ -394,8 +335,9 @@ class CMPCollector extends BaseCollector { succeeded: false, selfTestFail: Boolean(selfTestResult && !selfTestResult.result), errors, - patterns: [], - snippets: [], + patterns: Array.from(this.scanResult.patterns), + snippets: Array.from(this.scanResult.snippets), + filterListMatched: this.scanResult.filterListMatched, }; const found = this.findMessage({type: 'popupFound', cmp: msg.cmp}); @@ -427,25 +369,19 @@ class CMPCollector extends BaseCollector { async getData() { await this.waitForFinish(); const results = this.collectResults(); - if (this.scanResult.patterns.length > 0) { - if (results.length > 0) { - results.forEach(r => { - r.patterns = this.scanResult.patterns; - r.snippets = this.scanResult.snippets; - }); - } else { - results.push({ - final: false, - name: '', - open: false, - started: false, - succeeded: false, - selfTestFail: false, - errors: [], - patterns: this.scanResult.patterns, - snippets: this.scanResult.snippets, - }); - } + if (results.length === 0) { + results.push({ + final: false, + name: '', + open: false, + started: false, + succeeded: false, + selfTestFail: false, + errors: [], + patterns: Array.from(this.scanResult.patterns), + snippets: Array.from(this.scanResult.snippets), + filterListMatched: this.scanResult.filterListMatched, + }); } return results; } @@ -462,6 +398,7 @@ class CMPCollector extends BaseCollector { * @property {string[]} errors * @property {string[]} patterns * @property {string[]} snippets + * @property {boolean} filterListMatched */ module.exports = CMPCollector; \ No newline at end of file diff --git a/reporters/ClickhouseReporter.js b/reporters/ClickhouseReporter.js index 7305ce3b..699ae23f 100644 --- a/reporters/ClickhouseReporter.js +++ b/reporters/ClickhouseReporter.js @@ -6,8 +6,9 @@ const {createUniqueUrlName} = require('../helpers/hash'); // eslint-disable-next-line no-process-env const CLICKHOUSE_SERVER = process.env.CLICKHOUSE_SERVER || 'va-clickhouse1'; const DB = 'tracker_radar_crawls'; +const CLUSTER = 'ch-prod-cluster'; const TABLE_DEFINITIONS = [ - `CREATE TABLE IF NOT EXISTS ${DB}.crawls ( + `CREATE TABLE IF NOT EXISTS ${DB}.crawls ON CLUSTER '${CLUSTER}' ( crawlId String, name String, region String, @@ -16,7 +17,7 @@ const TABLE_DEFINITIONS = [ ENGINE = MergeTree() PRIMARY KEY(crawlId) ORDER BY crawlId`, - `CREATE TABLE IF NOT EXISTS ${DB}.pages ( + `CREATE TABLE IF NOT EXISTS ${DB}.pages ON CLUSTER '${CLUSTER}' ( crawlId String, pageId String, testStarted DateTime64(3, 'UTC'), @@ -26,7 +27,7 @@ const TABLE_DEFINITIONS = [ timeout UInt8 ) ENGINE = MergeTree() PRIMARY KEY(crawlId, pageId)`, - `CREATE TABLE IF NOT EXISTS ${DB}.requests ( + `CREATE TABLE IF NOT EXISTS ${DB}.requests ON CLUSTER '${CLUSTER}' ( crawlId String, pageId String, requestId UInt32, @@ -45,14 +46,14 @@ const TABLE_DEFINITIONS = [ time DOUBLE NULL ) ENGINE = MergeTree() PRIMARY KEY(crawlId, pageId, requestId)`, - `CREATE TABLE IF NOT EXISTS ${DB}.elements ( + `CREATE TABLE IF NOT EXISTS ${DB}.elements ON CLUSTER '${CLUSTER}' ( crawlId String, pageId String, present Array(String), visible Array(String) ) ENGINE = MergeTree() PRIMARY KEY(crawlId, pageId)`, - `CREATE TABLE IF NOT EXISTS ${DB}.cmps ( + `CREATE TABLE IF NOT EXISTS ${DB}.cmps ON CLUSTER '${CLUSTER}' ( crawlId String, pageId String, name String, @@ -63,10 +64,11 @@ const TABLE_DEFINITIONS = [ selfTestFail UInt8, errors Array(String), patterns Array(String), - snippets Array(String) + snippets Array(String), + filterListMatched Bool ) ENGINE = MergeTree() PRIMARY KEY(crawlId, pageId, name)`, - `CREATE TABLE IF NOT EXISTS ${DB}.apiSavedCalls ( + `CREATE TABLE IF NOT EXISTS ${DB}.apiSavedCalls ON CLUSTER '${CLUSTER}' ( crawlId String, pageId String, callId UInt32, @@ -75,21 +77,21 @@ const TABLE_DEFINITIONS = [ arguments Array(String) ) ENGINE = MergeTree() PRIMARY KEY(crawlId, pageId, callId)`, - `CREATE TABLE IF NOT EXISTS ${DB}.apiCallStats ( + `CREATE TABLE IF NOT EXISTS ${DB}.apiCallStats ON CLUSTER '${CLUSTER}' ( crawlId String, pageId String, source String, stats String ) ENGINE = MergeTree() PRIMARY KEY(crawlId, pageId, source)`, - `CREATE TABLE IF NOT EXISTS ${DB}.cookies ( + `CREATE TABLE IF NOT EXISTS ${DB}.cookies ON CLUSTER '${CLUSTER}' ( crawlId String, pageId String, cookieId UInt32, cookie String ) ENGINE = MergeTree() PRIMARY KEY(crawlId, pageId, cookieId)`, - `CREATE TABLE IF NOT EXISTS ${DB}.targets ( + `CREATE TABLE IF NOT EXISTS ${DB}.targets ON CLUSTER '${CLUSTER}' ( crawlId String, pageId String, targetId UInt32, @@ -115,7 +117,7 @@ class ClickhouseReporter extends BaseReporter { } /** - * @param {{verbose: boolean, startTime: Date, urls: number, logPath: string}} options + * @param {{verbose: boolean, startTime: Date, urls: number, logPath: string}} options */ init(options) { this.verbose = options.verbose; @@ -194,7 +196,20 @@ class ClickhouseReporter extends BaseReporter { this.queue.elements.push([this.crawlId, pageId, data.data.elements.present, data.data.elements.visible]); } if (data.data.cmps) { - const cmpRows = data.data.cmps.map(c => [this.crawlId, pageId, c.name, c.final, c.open, c.started, c.succeeded, c.selfTestFail, c.errors, c.patterns || [], c.snippets || []]); + const cmpRows = data.data.cmps.map(c => [ + this.crawlId, + pageId, + c.name, + c.final, + c.open, + c.started, + c.succeeded, + c.selfTestFail, + c.errors, + c.patterns || [], + c.snippets || [], + c.filterListMatched || false, + ]); this.queue.cmps = this.queue.cmps.concat(cmpRows); } if (data.data.apis) { From 181b8013d4fcb7f57570115bdc8c73e6e47233a7 Mon Sep 17 00:00:00 2001 From: Maxim Tsoy Date: Thu, 28 Nov 2024 21:42:38 +0100 Subject: [PATCH 03/10] Remove debug logs --- collectors/CMPCollector.js | 3 --- 1 file changed, 3 deletions(-) diff --git a/collectors/CMPCollector.js b/collectors/CMPCollector.js index f9b40b39..e3001e8f 100644 --- a/collectors/CMPCollector.js +++ b/collectors/CMPCollector.js @@ -179,7 +179,6 @@ class CMPCollector extends BaseCollector { break; } case 'popupFound': - console.log('popupFound', msg); if (msg.cmp === 'filterList') { this.scanResult.filterListMatched = true; } @@ -187,8 +186,6 @@ class CMPCollector extends BaseCollector { case 'report': msg.state.heuristicPatterns.forEach(x => this.scanResult.patterns.add(x)); msg.state.heuristicSnippets.forEach(x => this.scanResult.snippets.add(x)); - // console.log('report', msg.state); - console.log('scanResult', this.scanResult); break; case 'optInResult': case 'optOutResult': { From ceee1e234ab807ad1199d6b618478093e048b19e Mon Sep 17 00:00:00 2001 From: Maxim Tsoy Date: Thu, 28 Nov 2024 23:57:09 +0100 Subject: [PATCH 04/10] Revert the cluster changes in clickhouse --- reporters/ClickhouseReporter.js | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/reporters/ClickhouseReporter.js b/reporters/ClickhouseReporter.js index 699ae23f..3a4cc90c 100644 --- a/reporters/ClickhouseReporter.js +++ b/reporters/ClickhouseReporter.js @@ -6,9 +6,8 @@ const {createUniqueUrlName} = require('../helpers/hash'); // eslint-disable-next-line no-process-env const CLICKHOUSE_SERVER = process.env.CLICKHOUSE_SERVER || 'va-clickhouse1'; const DB = 'tracker_radar_crawls'; -const CLUSTER = 'ch-prod-cluster'; const TABLE_DEFINITIONS = [ - `CREATE TABLE IF NOT EXISTS ${DB}.crawls ON CLUSTER '${CLUSTER}' ( + `CREATE TABLE IF NOT EXISTS ${DB}.crawls ( crawlId String, name String, region String, @@ -17,7 +16,7 @@ const TABLE_DEFINITIONS = [ ENGINE = MergeTree() PRIMARY KEY(crawlId) ORDER BY crawlId`, - `CREATE TABLE IF NOT EXISTS ${DB}.pages ON CLUSTER '${CLUSTER}' ( + `CREATE TABLE IF NOT EXISTS ${DB}.pages ( crawlId String, pageId String, testStarted DateTime64(3, 'UTC'), @@ -27,7 +26,7 @@ const TABLE_DEFINITIONS = [ timeout UInt8 ) ENGINE = MergeTree() PRIMARY KEY(crawlId, pageId)`, - `CREATE TABLE IF NOT EXISTS ${DB}.requests ON CLUSTER '${CLUSTER}' ( + `CREATE TABLE IF NOT EXISTS ${DB}.requests ( crawlId String, pageId String, requestId UInt32, @@ -46,14 +45,14 @@ const TABLE_DEFINITIONS = [ time DOUBLE NULL ) ENGINE = MergeTree() PRIMARY KEY(crawlId, pageId, requestId)`, - `CREATE TABLE IF NOT EXISTS ${DB}.elements ON CLUSTER '${CLUSTER}' ( + `CREATE TABLE IF NOT EXISTS ${DB}.elements ( crawlId String, pageId String, present Array(String), visible Array(String) ) ENGINE = MergeTree() PRIMARY KEY(crawlId, pageId)`, - `CREATE TABLE IF NOT EXISTS ${DB}.cmps ON CLUSTER '${CLUSTER}' ( + `CREATE TABLE IF NOT EXISTS ${DB}.cmps ( crawlId String, pageId String, name String, @@ -68,7 +67,7 @@ const TABLE_DEFINITIONS = [ filterListMatched Bool ) ENGINE = MergeTree() PRIMARY KEY(crawlId, pageId, name)`, - `CREATE TABLE IF NOT EXISTS ${DB}.apiSavedCalls ON CLUSTER '${CLUSTER}' ( + `CREATE TABLE IF NOT EXISTS ${DB}.apiSavedCalls ( crawlId String, pageId String, callId UInt32, @@ -77,21 +76,21 @@ const TABLE_DEFINITIONS = [ arguments Array(String) ) ENGINE = MergeTree() PRIMARY KEY(crawlId, pageId, callId)`, - `CREATE TABLE IF NOT EXISTS ${DB}.apiCallStats ON CLUSTER '${CLUSTER}' ( + `CREATE TABLE IF NOT EXISTS ${DB}.apiCallStats ( crawlId String, pageId String, source String, stats String ) ENGINE = MergeTree() PRIMARY KEY(crawlId, pageId, source)`, - `CREATE TABLE IF NOT EXISTS ${DB}.cookies ON CLUSTER '${CLUSTER}' ( + `CREATE TABLE IF NOT EXISTS ${DB}.cookies ( crawlId String, pageId String, cookieId UInt32, cookie String ) ENGINE = MergeTree() PRIMARY KEY(crawlId, pageId, cookieId)`, - `CREATE TABLE IF NOT EXISTS ${DB}.targets ON CLUSTER '${CLUSTER}' ( + `CREATE TABLE IF NOT EXISTS ${DB}.targets ( crawlId String, pageId String, targetId UInt32, From cd79a2761830cb765663acf1a60aa8490c725123 Mon Sep 17 00:00:00 2001 From: Maxim Tsoy Date: Fri, 29 Nov 2024 00:39:35 +0100 Subject: [PATCH 05/10] Do not produce completely empty cmp results --- collectors/CMPCollector.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collectors/CMPCollector.js b/collectors/CMPCollector.js index e3001e8f..1e7202d6 100644 --- a/collectors/CMPCollector.js +++ b/collectors/CMPCollector.js @@ -366,7 +366,7 @@ class CMPCollector extends BaseCollector { async getData() { await this.waitForFinish(); const results = this.collectResults(); - if (results.length === 0) { + if (this.scanResult.patterns.size > 0 && results.length === 0) { results.push({ final: false, name: '', From 7ac287ef3b66e02754886f0f5923ab1bf0397e59 Mon Sep 17 00:00:00 2001 From: Maxim Tsoy Date: Fri, 29 Nov 2024 12:28:04 +0100 Subject: [PATCH 06/10] Move a log message --- reporters/ClickhouseReporter.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/reporters/ClickhouseReporter.js b/reporters/ClickhouseReporter.js index 3a4cc90c..22fbf6e7 100644 --- a/reporters/ClickhouseReporter.js +++ b/reporters/ClickhouseReporter.js @@ -123,9 +123,6 @@ class ClickhouseReporter extends BaseReporter { this.client = new ClickHouse({url: CLICKHOUSE_SERVER}); this.crawlId = `${new Date().toISOString()}-${os.hostname()}`; this.ready = Promise.all(TABLE_DEFINITIONS.map(stmt => this.client.query(stmt).toPromise())); - if (this.verbose) { - console.log(`Creating crawl ${this.crawlId}`); - } this.queue = { pages: [], requests: [], @@ -144,6 +141,9 @@ class ClickhouseReporter extends BaseReporter { */ createCrawl(name = '', region = '') { this.ready.then(async () => { + if (this.verbose) { + console.log(`Creating crawl ${this.crawlId}`); + } await this.client.insert(`INSERT INTO ${DB}.crawls (crawlId, name, region)`, [{ crawlId: this.crawlId, name, From 3c5884e51136d5fc0d7106aed3cfc90c6f2357e5 Mon Sep 17 00:00:00 2001 From: Maxim Tsoy Date: Fri, 6 Dec 2024 11:16:29 -0500 Subject: [PATCH 07/10] Lint fix --- collectors/CMPCollector.js | 1 - 1 file changed, 1 deletion(-) diff --git a/collectors/CMPCollector.js b/collectors/CMPCollector.js index 1e7202d6..7bff83f1 100644 --- a/collectors/CMPCollector.js +++ b/collectors/CMPCollector.js @@ -1,6 +1,5 @@ /* eslint-disable max-lines */ const fs = require('fs'); -const createDeferred = require('../helpers/deferred'); const waitFor = require('../helpers/waitFor'); const BaseCollector = require('./BaseCollector'); From 0e9862c5c0c7e2a67f6ec7aec0dfd45ea5d0f316 Mon Sep 17 00:00:00 2001 From: Maxim Tsoy Date: Tue, 7 Jan 2025 16:51:12 +0100 Subject: [PATCH 08/10] Bump autoconsent --- package-lock.json | 14 +++++++------- package.json | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/package-lock.json b/package-lock.json index b5c065fe..258a8a78 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,7 +9,7 @@ "version": "1.0.0", "license": "Apache 2.0", "dependencies": { - "@duckduckgo/autoconsent": "^12.0.0", + "@duckduckgo/autoconsent": "^12.4.0", "async": "^2.6.1", "chalk": "^2.4.1", "clickhouse": "^2.6.0", @@ -40,9 +40,9 @@ } }, "node_modules/@duckduckgo/autoconsent": { - "version": "12.0.0", - "resolved": "https://registry.npmjs.org/@duckduckgo/autoconsent/-/autoconsent-12.0.0.tgz", - "integrity": "sha512-ObPv0pE1d8G1Nnj9NtTxvu04mNRnJ8o8cpU1G0YJk1kP2ykipGnlp9wrn9Tkn2fDy5zT8HQqDsOYWEyk3ES1kg==", + "version": "12.4.0", + "resolved": "https://registry.npmjs.org/@duckduckgo/autoconsent/-/autoconsent-12.4.0.tgz", + "integrity": "sha512-k7pNvq9IdPURoAhboAWx+xDMnIHKJ9JY74eft/aOOv1Lj5P8Bjv63ERyvttK5ugzvJvVyUR9GNp3DcQF/izlmA==", "license": "MPL-2.0", "dependencies": { "@ghostery/adblocker": "^2.0.4", @@ -3442,9 +3442,9 @@ }, "dependencies": { "@duckduckgo/autoconsent": { - "version": "12.0.0", - "resolved": "https://registry.npmjs.org/@duckduckgo/autoconsent/-/autoconsent-12.0.0.tgz", - "integrity": "sha512-ObPv0pE1d8G1Nnj9NtTxvu04mNRnJ8o8cpU1G0YJk1kP2ykipGnlp9wrn9Tkn2fDy5zT8HQqDsOYWEyk3ES1kg==", + "version": "12.4.0", + "resolved": "https://registry.npmjs.org/@duckduckgo/autoconsent/-/autoconsent-12.4.0.tgz", + "integrity": "sha512-k7pNvq9IdPURoAhboAWx+xDMnIHKJ9JY74eft/aOOv1Lj5P8Bjv63ERyvttK5ugzvJvVyUR9GNp3DcQF/izlmA==", "requires": { "@ghostery/adblocker": "^2.0.4", "@ghostery/adblocker-content": "^2.0.4", diff --git a/package.json b/package.json index be7afaab..d6db6051 100644 --- a/package.json +++ b/package.json @@ -41,7 +41,7 @@ "typescript": "^4.6.4" }, "dependencies": { - "@duckduckgo/autoconsent": "^12.0.0", + "@duckduckgo/autoconsent": "^12.4.0", "async": "^2.6.1", "chalk": "^2.4.1", "clickhouse": "^2.6.0", From 353ba150d2a1e735d5073ad47aa0b59a41be6fbb Mon Sep 17 00:00:00 2001 From: Maxim Tsoy Date: Wed, 8 Jan 2025 21:45:46 +0100 Subject: [PATCH 09/10] Fix CMPCollector tests --- tests/collectors/CMPCollector.mocha.js | 90 ++++++++++++++------------ 1 file changed, 47 insertions(+), 43 deletions(-) diff --git a/tests/collectors/CMPCollector.mocha.js b/tests/collectors/CMPCollector.mocha.js index 104e4a96..9c82a52d 100644 --- a/tests/collectors/CMPCollector.mocha.js +++ b/tests/collectors/CMPCollector.mocha.js @@ -85,10 +85,12 @@ describe('CMPCollector', () => { assert.deepStrictEqual(commands[0], ['Runtime.evaluate', { expression: `autoconsentReceiveMessage({ type: "initResp", config: ${JSON.stringify({ enabled: true, - autoAction: null, + autoAction: 'optOut', disabledCmps: [], enablePrehide: false, enableCosmeticRules: true, + enableFilterList: true, + enableHeuristicDetection: true, detectRetries: 20, isMainWorld: false, })} })`, @@ -96,26 +98,6 @@ describe('CMPCollector', () => { }]); }); }); - describe('popupFound ', () => { - it('should trigger autoAction', async () => { - /** - * @type {ContentScriptMessage} - */ - const msg = { - type: 'popupFound', - url: 'some-url', - cmp: 'someCMP', - }; - commands.splice(0, commands.length); - collector.pendingScan.resolve(); - await collector.handleMessage(msg, 1111); - assert.strictEqual(commands.length, 1); - assert.deepStrictEqual(commands[0], ['Runtime.evaluate', { - expression: `autoconsentReceiveMessage({ type: "optOut" })`, - contextId: 1111, - }]); - }); - }); describe('optOutResult ', () => { it('should remember where to run self test', async () => { /** @@ -284,16 +266,41 @@ describe('CMPCollector', () => { const contentScriptEval = commands.find(cmd => cmd[0] === 'Runtime.evaluate')[1]; assert.strictEqual(contentScriptEval.contextId, 31337); - // @ts-ignore no need to provide all params - collector.context.pages = () => Promise.resolve([ - { - frames: () => [ - { - evaluate: () => Promise.resolve('This website is using cookies. We are using cookies! To reiterate, you consent to the use of cookies on this website. In fact, there is nothing you can possibly do.') - } - ] - } - ]); + const expectedPatterns = [ + "/we are using cookies/gi", + "/use of cookies/gi", + "/(this|our) (web)?site.*cookies/gi", + "/consent to.*cookies/gi", + ]; + const expectedSnippets = [ + 'We are using cookies', + 'use of cookies', + 'This website is using cookies. We are using cookies! To reiterate, you consent to the use of cookies', + 'consent to the use of cookies' + ]; + + bindingCalled.callback({ + name: 'cdpAutoconsentSendMessage', + payload: JSON.stringify({ + type: 'report', + url: 'some-url', + instanceId: 'xxxxxx', + mainFrame: true, + state: { + cosmeticFiltersOn: false, + filterListReported: false, + lifecycle: 'loading', + prehideOn: false, + findCmpAttempts: 0, + detectedCmps: [], + detectedPopups: [], + heuristicPatterns: expectedPatterns, + heuristicSnippets: expectedSnippets, + selfTest: null, + }, + }), + executionContextId: 31337, + }); await collector.postLoad(); const results = await collector.getData(); @@ -304,19 +311,10 @@ describe('CMPCollector', () => { started: false, succeeded: false, selfTestFail: false, + filterListMatched: false, errors: [], - patterns: [ - "/we are using cookies/gi", - "/use of cookies/gi", - "/(this|our) (web)?site.*cookies/gi", - "/consent to.*cookies/gi", - ], - snippets: [ - 'We are using cookies', - 'use of cookies', - 'This website is using cookies. We are using cookies! To reiterate, you consent to the use of cookies', - 'consent to the use of cookies' - ] + patterns: expectedPatterns, + snippets: expectedSnippets, }]); }); @@ -341,6 +339,7 @@ describe('CMPCollector', () => { open: false, started: false, succeeded: false, + filterListMatched: false, selfTestFail: false, errors: [], patterns: [], @@ -380,6 +379,7 @@ describe('CMPCollector', () => { started: false, succeeded: false, selfTestFail: false, + filterListMatched: false, errors: [], patterns: [], snippets: [], @@ -432,6 +432,7 @@ describe('CMPCollector', () => { started: true, succeeded: false, selfTestFail: false, + filterListMatched: false, errors: [], patterns: [], snippets: [], @@ -470,6 +471,7 @@ describe('CMPCollector', () => { started: true, succeeded: true, selfTestFail: false, + filterListMatched: false, errors: [], patterns: [], snippets: [], @@ -533,6 +535,7 @@ describe('CMPCollector', () => { started: true, succeeded: true, selfTestFail: false, + filterListMatched: false, errors: [], patterns: [], snippets: [], @@ -559,6 +562,7 @@ describe('CMPCollector', () => { open: true, started: true, succeeded: true, + filterListMatched: false, selfTestFail: true, errors: [], patterns: [], From d805d79f9056055bf38eedcda730fd2d6f7e4138 Mon Sep 17 00:00:00 2001 From: Maxim Tsoy Date: Wed, 8 Jan 2025 21:56:36 +0100 Subject: [PATCH 10/10] Mitigate linting errors --- crawlerConductor.js | 1 + package-lock.json | 23 +++++++++++++---------- package.json | 2 +- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/crawlerConductor.js b/crawlerConductor.js index a02ee37c..d1e6bf48 100644 --- a/crawlerConductor.js +++ b/crawlerConductor.js @@ -35,6 +35,7 @@ async function crawlAndSaveData(urlString, dataCollectors, log, filterOutFirstPa */ const prefixedLog = (...msg) => log(chalk.gray(`${url.hostname}:`), ...msg); + // @ts-expect-error - outdated node types const data = await crawl(url, { log: prefixedLog, // @ts-ignore diff --git a/package-lock.json b/package-lock.json index 258a8a78..a6d8ebf5 100644 --- a/package-lock.json +++ b/package-lock.json @@ -33,7 +33,7 @@ "mocha": "^10.0.0", "mockery": "^2.1.0", "pre-push": "^0.1.1", - "typescript": "^4.6.4" + "typescript": "^5.7.3" }, "engines": { "node": ">=14.0.0" @@ -282,7 +282,8 @@ "version": "10.17.60", "resolved": "https://registry.npmjs.org/@types/node/-/node-10.17.60.tgz", "integrity": "sha512-F0KIgDJfy2nA3zMLmWGKxcH2ZVEtCZXHHdOQs2gSaQ27+lNeEfGxzkIw90aXswATX7AZ33tahPbzy6KAfUreVw==", - "devOptional": true + "devOptional": true, + "license": "MIT" }, "node_modules/@types/progress": { "version": "2.0.5", @@ -918,7 +919,8 @@ "node_modules/commander": { "version": "2.20.3", "resolved": "https://registry.npmjs.org/commander/-/commander-2.20.3.tgz", - "integrity": "sha512-GpVkmM8vF2vQUkj2LvZmD35JxeJOLCwJ9cUkugyk2nuhbv3+mJvpLYYt+0+USMxE+oj+ey/lJEnhZw75x/OMcQ==" + "integrity": "sha512-GpVkmM8vF2vQUkj2LvZmD35JxeJOLCwJ9cUkugyk2nuhbv3+mJvpLYYt+0+USMxE+oj+ey/lJEnhZw75x/OMcQ==", + "license": "MIT" }, "node_modules/concat-map": { "version": "0.0.1", @@ -3193,16 +3195,17 @@ "dev": true }, "node_modules/typescript": { - "version": "4.7.4", - "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.7.4.tgz", - "integrity": "sha512-C0WQT0gezHuw6AdY1M2jxUO83Rjf0HP7Sk1DtXj6j1EwkQNZrHAg2XPWlq62oqEhYvONq5pkC2Y9oPljWToLmQ==", + "version": "5.7.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.7.3.tgz", + "integrity": "sha512-84MVSjMEHP+FQRPy3pX9sTVV/INIex71s9TL2Gm5FG/WG1SqXeKyZ0k7/blY/4FdOzI12CBy1vGc4og/eus0fw==", "dev": true, + "license": "Apache-2.0", "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" }, "engines": { - "node": ">=4.2.0" + "node": ">=14.17" } }, "node_modules/unbzip2-stream": { @@ -5756,9 +5759,9 @@ "dev": true }, "typescript": { - "version": "4.7.4", - "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.7.4.tgz", - "integrity": "sha512-C0WQT0gezHuw6AdY1M2jxUO83Rjf0HP7Sk1DtXj6j1EwkQNZrHAg2XPWlq62oqEhYvONq5pkC2Y9oPljWToLmQ==", + "version": "5.7.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.7.3.tgz", + "integrity": "sha512-84MVSjMEHP+FQRPy3pX9sTVV/INIex71s9TL2Gm5FG/WG1SqXeKyZ0k7/blY/4FdOzI12CBy1vGc4og/eus0fw==", "dev": true }, "unbzip2-stream": { diff --git a/package.json b/package.json index d6db6051..d46b3c2b 100644 --- a/package.json +++ b/package.json @@ -38,7 +38,7 @@ "mocha": "^10.0.0", "mockery": "^2.1.0", "pre-push": "^0.1.1", - "typescript": "^4.6.4" + "typescript": "^5.7.3" }, "dependencies": { "@duckduckgo/autoconsent": "^12.4.0",